int main(void) { float *h_A; // A matrix float *h_B; // B matrix float *h_C; // C = A*B matrix int Mdim, Ndim, Pdim; // A[N][P], B[P][M], C[N][M] int szA, szB, szC; // number of elements in each matrix cl_mem d_a, d_b, d_c; // Matrices in device memory double start_time; // Starting time double run_time; // timing data char * kernelsource; // kernel source string cl_int err; // error code returned from OpenCL calls cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel Ndim = ORDER; Pdim = ORDER; Mdim = ORDER; szA = Ndim * Pdim; szB = Pdim * Mdim; szC = Ndim * Mdim; h_A = (float *)malloc(szA * sizeof(float)); h_B = (float *)malloc(szB * sizeof(float)); h_C = (float *)malloc(szC * sizeof(float)); initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); start_time = wtime(); seq_mat_mul_sdot(Mdim, Ndim, Pdim, h_A, h_B, h_C); run_time = wtime() - start_time; results(Mdim, Ndim, Pdim, h_C, run_time); } //-------------------------------------------------------------------------------- // Create a context, queue and device. //-------------------------------------------------------------------------------- // Set up OpenCL context. queue, kernel, etc. cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n%s\n", err_code(err)); return EXIT_FAILURE; } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(Mdim, Ndim, Pdim, h_A, h_B, h_C); d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * szA, h_A, &err); if (err != CL_SUCCESS) { printf("Error: failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * szB, h_B, &err); if (err != CL_SUCCESS) { printf("Error: failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * szC, NULL, &err); if (err != CL_SUCCESS) { printf("Error: failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_elem.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (err != CL_SUCCESS) { printf("Error: could not create program\n%s\n", err_code(err)); return EXIT_FAILURE; } free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim); err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim); err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c); if (err != CL_SUCCESS) { printf("Error: Could not set kernel arguments\n"); return EXIT_FAILURE; } start_time = wtime(); // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. const size_t global[2] = {Ndim, Mdim}; err = clEnqueueNDRangeKernel( commands, kernel, 2, NULL, global, NULL, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: waiting for queue to finish failed\n%s\n", err_code(err)); return EXIT_FAILURE; } run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * szC, h_C, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (err != CL_SUCCESS) { printf("Error: could not create program\n%s\n", err_code(err)); return EXIT_FAILURE; } free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim); err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim); err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c); if (err != CL_SUCCESS) { printf("Error: Could not set kernel arguments\n"); return EXIT_FAILURE; } start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. const size_t global = Ndim; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: waiting for queue to finish failed\n%s\n", err_code(err)); return EXIT_FAILURE; } run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * szC, h_C, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item, A row in pivate memory //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row_priv.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (err != CL_SUCCESS) { printf("Error: could not create program\n%s\n", err_code(err)); return EXIT_FAILURE; } free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); if (!kernel || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",Ndim); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(Ndim, Mdim, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &Mdim); err |= clSetKernelArg(kernel, 1, sizeof(int), &Ndim); err |= clSetKernelArg(kernel, 2, sizeof(int), &Pdim); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 4, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 5, sizeof(cl_mem), &d_c); if (err != CL_SUCCESS) { printf("Error: Could not set kernel arguments\n"); return EXIT_FAILURE; } start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. const size_t global = Ndim; const size_t local = ORDER / 16; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clFinish(commands); if (err != CL_SUCCESS) { printf("Error: waiting for queue to finish failed\n%s\n", err_code(err)); return EXIT_FAILURE; } run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * szC, h_C, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } results(Mdim, Ndim, Pdim, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // Clean up! //-------------------------------------------------------------------------------- free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return EXIT_SUCCESS; }
int main(int argc, char** argv) { if (argc != 2) { printf("Usage: ./pi_vocl num\n"); printf("\twhere num = 1, 4 or 8\n"); return EXIT_FAILURE; } int vector_size = atoi(argv[1]); // Define some vector size specific constants unsigned int ITERS, WGS; if (vector_size == 1) { ITERS = 262144; WGS = 8; } else if (vector_size == 4) { ITERS = 262144 / 4; WGS = 32; } else if (vector_size == 8) { ITERS = 262144 / 8; WGS = 64; } else { fprintf(stderr, "Invalid vector size\n"); return EXIT_FAILURE; } // Set some default values: // Default number of steps (updated later to device preferable) unsigned int in_nsteps = INSTEPS; // Defaultl number of iterations unsigned int niters = ITERS; unsigned int work_group_size = WGS; // Create context, queue and build program cl_int err; cl_context context; cl_device_id device; cl_command_queue queue; cl_program program; cl_kernel kernel; // Find number of platforms cl_uint numPlatforms; err = clGetPlatformIDs(0, NULL, &numPlatforms); checkError(err, "Finding platforms"); // Get all platforms cl_platform_id platforms[numPlatforms]; err = clGetPlatformIDs(numPlatforms, platforms, NULL); checkError(err, "Getting platforms"); // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(platforms[i], DEVICE, 1, &device, NULL); if (err == CL_SUCCESS) break; } if (device == NULL) checkError(err, "Getting a device"); // Create a compute context context = clCreateContext(0, 1, &device, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue queue = clCreateCommandQueue(context, device, 0, &err); checkError(err, "Creating command queue"); // Create the compute program from the source buffer char *kernel_source = getKernelSource("../pi_vocl.cl"); program = clCreateProgramWithSource(context, 1, (const char**)&kernel_source, NULL, &err); checkError(err, "Creating program"); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); checkError(err, "Building program"); } if (vector_size == 1) { kernel = clCreateKernel(program, "pi", &err); checkError(err, "Creating kernel pi"); } else if (vector_size == 4) { kernel = clCreateKernel(program, "pi_vec4", &err); checkError(err, "Creating kernel pi_vec4"); } else if (vector_size == 8) { kernel = clCreateKernel(program, "pi_vec8", &err); checkError(err, "Creating kernel pi_vec8"); } // Now that we know the size of the work_groups, we can set the number of work // groups, the actual number of steps, and the step size unsigned int nwork_groups = in_nsteps/(work_group_size*niters); // Get the max work group size for the kernel pi on our device size_t max_size; err = clGetKernelWorkGroupInfo(kernel, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(max_size), &max_size, NULL); checkError(err, "Getting kernel work group size"); if (max_size > work_group_size) { work_group_size = max_size; nwork_groups = in_nsteps/(nwork_groups*niters); } if (nwork_groups < 1) { err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(nwork_groups), &nwork_groups, NULL); checkError(err, "Getting device max compute units"); work_group_size = in_nsteps/(nwork_groups*niters); } unsigned int nsteps = work_group_size * niters * nwork_groups; float step_size = 1.0f / (float) nsteps; // Array to hold partial sum float *h_psum = (float*)calloc(nwork_groups, sizeof(float)); printf("%d work groups of size %d.\n", nwork_groups, work_group_size); printf(" %u Integration steps\n", nsteps); cl_mem d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err); checkError(err, "Creating buffer d_partial_sums"); // Execute the kernel over the entire range of our 1d input data et // using the maximum number of work group items for this device const size_t global = nwork_groups * work_group_size; const size_t local = work_group_size; err = clSetKernelArg(kernel, 0, sizeof(int), &niters); err |= clSetKernelArg(kernel, 1, sizeof(float), &step_size); err |= clSetKernelArg(kernel, 2, sizeof(float) * work_group_size, NULL); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_partial_sums); checkError(err, "Setting kernel args"); // Start the timer double rtime = wtime(); err = clEnqueueNDRangeKernel( queue, kernel, 1, NULL, &global, &local, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clEnqueueReadBuffer(queue, d_partial_sums, CL_TRUE, 0, sizeof(float) * nwork_groups, h_psum, 0, NULL, NULL); checkError(err, "Reading back d_partial_sums"); // complete the sum and compute the final integral value on the host float pi_res = 0.0f; for (unsigned int i = 0; i < nwork_groups; i++) { pi_res += h_psum[i]; } pi_res *= step_size; rtime = wtime() - rtime; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %u steps\n", pi_res, nsteps); free(h_psum); free(kernel_source); }
int main(int argc, char *argv[]) { float *h_A; // A matrix float *h_B; // B matrix float *h_C; // C = A*B matrix int N; // A[N][N], B[N][N], C[N][N] int size; // number of elements in each matrix cl_mem d_a, d_b, d_c; // Matrices in device memory double start_time; // Starting time double run_time; // timing data char * kernelsource; // kernel source string cl_int err; // error code returned from OpenCL calls cl_device_id device; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel; // compute kernel N = ORDER; size = N * N; h_A = (float *)malloc(size * sizeof(float)); h_B = (float *)malloc(size * sizeof(float)); h_C = (float *)malloc(size * sizeof(float)); //-------------------------------------------------------------------------------- // Create a context, queue and device. //-------------------------------------------------------------------------------- cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices cl_device_id devices[MAX_DEVICES]; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { printf("Invalid device index (try '--list')\n"); return EXIT_FAILURE; } device = devices[deviceIndex]; char name[MAX_INFO_STRING]; getDeviceName(device, name); printf("\nUsing OpenCL device: %s\n", name); // Create a compute context context = clCreateContext(0, 1, &device, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue commands = clCreateCommandQueue(context, device, 0, &err); checkError(err, "Creating command queue"); //-------------------------------------------------------------------------------- // Run sequential version on the host //-------------------------------------------------------------------------------- initmat(N, h_A, h_B, h_C); printf("\n===== Sequential, matrix mult (dot prod), order %d on host CPU ======\n",ORDER); for(int i = 0; i < COUNT; i++) { zero_mat(N, h_C); start_time = wtime(); seq_mat_mul_sdot(N, h_A, h_B, h_C); run_time = wtime() - start_time; results(N, h_C, run_time); } //-------------------------------------------------------------------------------- // Setup the buffers, initialize matrices, and write them into global memory //-------------------------------------------------------------------------------- // Reset A, B and C matrices (just to play it safe) initmat(N, h_A, h_B, h_C); d_a = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * size, h_A, &err); checkError(err, "Creating buffer d_a"); d_b = clCreateBuffer(context, CL_MEM_READ_ONLY | CL_MEM_COPY_HOST_PTR, sizeof(float) * size, h_B, &err); checkError(err, "Creating buffer d_b"); d_c = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * size, NULL, &err); checkError(err, "Creating buffer d_c"); //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... Naive //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_elem.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program with C_elem.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_elem.cl"); printf("\n===== OpenCL, matrix mult, C(i,j) per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the entire range of C matrix elements ... computing // a dot product for each element of the product matrix. The local work // group size is set to NULL ... so I'm telling the OpenCL runtime to // figure out a local work group size for me. const size_t global[2] = {N, N}; err = clEnqueueNDRangeKernel( commands, kernel, 2, NULL, global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Copying back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program with C_row.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_row.cl"); printf("\n===== OpenCL, matrix mult, C row per work item, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. const size_t global = N; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, NULL, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Reading back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // OpenCL matrix multiplication ... C row per work item, A row in pivate memory //-------------------------------------------------------------------------------- kernelsource = getKernelSource("../C_row_priv.cl"); // Create the comput program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program from C_row_priv.cl"); free(kernelsource); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel = clCreateKernel(program, "mmul", &err); checkError(err, "Creating kernel from C_row_priv.cl"); printf("\n===== OpenCL, matrix mult, C row, A row in priv mem, order %d ======\n",N); // Do the multiplication COUNT times for (int i = 0; i < COUNT; i++) { zero_mat(N, h_C); err = clSetKernelArg(kernel, 0, sizeof(int), &N); err |= clSetKernelArg(kernel, 1, sizeof(cl_mem), &d_a); err |= clSetKernelArg(kernel, 2, sizeof(cl_mem), &d_b); err |= clSetKernelArg(kernel, 3, sizeof(cl_mem), &d_c); checkError(err, "Setting kernel args"); start_time = wtime(); // Execute the kernel over the rows of the C matrix ... computing // a dot product for each element of the product matrix. const size_t global = N; const size_t local = ORDER / 16; err = clEnqueueNDRangeKernel( commands, kernel, 1, NULL, &global, &local, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clFinish(commands); checkError(err, "Waiting for kernel to finish"); run_time = wtime() - start_time; err = clEnqueueReadBuffer( commands, d_c, CL_TRUE, 0, sizeof(float) * size, h_C, 0, NULL, NULL); checkError(err, "Reading back d_c"); results(N, h_C, run_time); } // end for loop //-------------------------------------------------------------------------------- // Clean up! //-------------------------------------------------------------------------------- free(h_A); free(h_B); free(h_C); clReleaseMemObject(d_a); clReleaseMemObject(d_b); clReleaseMemObject(d_c); clReleaseProgram(program); clReleaseKernel(kernel); clReleaseCommandQueue(commands); clReleaseContext(context); return EXIT_SUCCESS; }
int main(int argc, char *argv[]) { float *h_psum; // vector to hold partial sum int in_nsteps = INSTEPS; // default number of steps (updated later to device preferable) int niters = ITERS; // number of iterations int nsteps; float step_size; size_t nwork_groups; size_t max_size, work_group_size = 8; float pi_res; cl_mem d_partial_sums; char *kernelsource = getKernelSource("../pi_ocl.cl"); // Kernel source cl_int err; cl_device_id device; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel_pi; // compute kernel // Set up OpenCL context, queue, kernel, etc. cl_uint deviceIndex = 0; parseArguments(argc, argv, &deviceIndex); // Get list of devices cl_device_id devices[MAX_DEVICES]; unsigned numDevices = getDeviceList(devices); // Check device index in range if (deviceIndex >= numDevices) { printf("Invalid device index (try '--list')\n"); return EXIT_FAILURE; } device = devices[deviceIndex]; char name[MAX_INFO_STRING]; getDeviceName(device, name); printf("\nUsing OpenCL device: %s\n", name); // Create a compute context context = clCreateContext(0, 1, &device, NULL, NULL, &err); checkError(err, "Creating context"); // Create a command queue commands = clCreateCommandQueue(context, device, 0, &err); checkError(err, "Creating command queue"); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); checkError(err, "Creating program"); // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel_pi = clCreateKernel(program, "pi", &err); checkError(err, "Creating kernel"); // Find kernel work-group size err = clGetKernelWorkGroupInfo (kernel_pi, device, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &work_group_size, NULL); checkError(err, "Getting kernel work group info"); // Now that we know the size of the work-groups, we can set the number of // work-groups, the actual number of steps, and the step size nwork_groups = in_nsteps/(work_group_size*niters); if (nwork_groups < 1) { err = clGetDeviceInfo(device, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &nwork_groups, NULL); checkError(err, "Getting device compute unit info"); work_group_size = in_nsteps / (nwork_groups * niters); } nsteps = work_group_size * niters * nwork_groups; step_size = 1.0f/(float)nsteps; printf("nsteps:%d\n", nsteps); printf("niters:%d\n", niters); printf("work_group_size:%zd\n", work_group_size); printf("n work groups:%ld\n", nwork_groups); printf("step_size:%f\n", step_size); h_psum = calloc(sizeof(float), nwork_groups); if (!h_psum) { printf("Error: could not allocate host memory for h_psum\n"); return EXIT_FAILURE; } printf(" %ld work-groups of size %ld. %d Integration steps\n", nwork_groups, work_group_size, nsteps); d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err); checkError(err, "Creating buffer d_partial_sums"); // Set kernel arguments err = clSetKernelArg(kernel_pi, 0, sizeof(int), &niters); err |= clSetKernelArg(kernel_pi, 1, sizeof(float), &step_size); err |= clSetKernelArg(kernel_pi, 2, sizeof(float) * work_group_size, NULL); err |= clSetKernelArg(kernel_pi, 3, sizeof(cl_mem), &d_partial_sums); checkError(err, "Settin kernel args"); // Execute the kernel over the entire range of our 1D input data set // using the maximum number of work items for this device size_t global = nsteps / niters; size_t local = work_group_size; double rtime = wtime(); err = clEnqueueNDRangeKernel( commands, kernel_pi, 1, NULL, &global, &local, 0, NULL, NULL); checkError(err, "Enqueueing kernel"); err = clEnqueueReadBuffer( commands, d_partial_sums, CL_TRUE, 0, sizeof(float) * nwork_groups, h_psum, 0, NULL, NULL); checkError(err, "Reading back d_partial_sums"); // complete the sum and compute the final integral value on the host pi_res = 0.0f; for (unsigned int i = 0; i < nwork_groups; i++) { pi_res += h_psum[i]; } pi_res *= step_size; rtime = wtime() - rtime; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %d steps\n", pi_res, nsteps); // clean up clReleaseMemObject(d_partial_sums); clReleaseProgram(program); clReleaseKernel(kernel_pi); clReleaseCommandQueue(commands); clReleaseContext(context); free(kernelsource); free(h_psum); }
int main(void) { float *h_psum; // vector to hold partial sum int in_nsteps = INSTEPS; // default number of steps (updated later to device preferable) int niters = ITERS; // number of iterations int nsteps; float step_size; size_t nwork_groups; size_t max_size, work_group_size = 8; float pi_res; cl_mem d_partial_sums; char *kernelsource = getKernelSource("../pi_ocl.cl"); // Kernel source cl_int err; cl_device_id device_id; // compute device id cl_context context; // compute context cl_command_queue commands; // compute command queue cl_program program; // compute program cl_kernel kernel_pi; // compute kernel // Set up OpenCL context. queue, kernel, etc. cl_uint numPlatforms; // Find number of platforms err = clGetPlatformIDs(0, NULL, &numPlatforms); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to find a platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Get all platforms cl_platform_id Platform[numPlatforms]; err = clGetPlatformIDs(numPlatforms, Platform, NULL); if (err != CL_SUCCESS || numPlatforms <= 0) { printf("Error: Failed to get the platform!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Secure a device for (int i = 0; i < numPlatforms; i++) { err = clGetDeviceIDs(Platform[i], DEVICE, 1, &device_id, NULL); if (err == CL_SUCCESS) break; } if (device_id == NULL) { printf("Error: Failed to create a device group!\n%s\n",err_code(err)); return EXIT_FAILURE; } // Output information err = output_device_info(device_id); // Create a compute context context = clCreateContext(0, 1, &device_id, NULL, NULL, &err); if (!context) { printf("Error: Failed to create a compute context!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create a command queue commands = clCreateCommandQueue(context, device_id, 0, &err); if (!commands) { printf("Error: Failed to create a command commands!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) & kernelsource, NULL, &err); if (!program) { printf("Error: Failed to create compute program!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Build the program err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err != CL_SUCCESS) { size_t len; char buffer[2048]; printf("Error: Failed to build program executable!\n%s\n", err_code(err)); clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, &len); printf("%s\n", buffer); return EXIT_FAILURE; } // Create the compute kernel from the program kernel_pi = clCreateKernel(program, "pi", &err); if (!kernel_pi || err != CL_SUCCESS) { printf("Error: Failed to create compute kernel!\n%s\n", err_code(err)); return EXIT_FAILURE; } // Find kernel work-group size err = clGetKernelWorkGroupInfo (kernel_pi, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), &work_group_size, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to get kernel work-group info\n%s\n", err_code(err)); return EXIT_FAILURE; } // Now that we know the size of the work-groups, we can set the number of // work-groups, the actual number of steps, and the step size nwork_groups = in_nsteps/(work_group_size*niters); if (nwork_groups < 1) { err = clGetDeviceInfo(device_id, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(size_t), &nwork_groups, NULL); work_group_size = in_nsteps / (nwork_groups * niters); } nsteps = work_group_size * niters * nwork_groups; step_size = 1.0f/(float)nsteps; h_psum = calloc(sizeof(float), nwork_groups); if (!h_psum) { printf("Error: could not allocate host memory for h_psum\n"); return EXIT_FAILURE; } printf(" %ld work-groups of size %ld. %d Integration steps\n", nwork_groups, work_group_size, nsteps); d_partial_sums = clCreateBuffer(context, CL_MEM_WRITE_ONLY, sizeof(float) * nwork_groups, NULL, &err); if (err != CL_SUCCESS) { printf("Error: Failed to create buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // Set kernel arguments err = clSetKernelArg(kernel_pi, 0, sizeof(int), &niters); err |= clSetKernelArg(kernel_pi, 1, sizeof(float), &step_size); err |= clSetKernelArg(kernel_pi, 2, sizeof(float) * work_group_size, NULL); err |= clSetKernelArg(kernel_pi, 3, sizeof(cl_mem), &d_partial_sums); if (err != CL_SUCCESS) { printf("Error: Failed to set kernel arguments!\n"); return EXIT_FAILURE; } // Execute the kernel over the entire range of our 1D input data set // using the maximum number of work items for this device size_t global = nwork_groups * work_group_size; size_t local = work_group_size; double rtime = wtime(); err = clEnqueueNDRangeKernel( commands, kernel_pi, 1, NULL, &global, &local, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to execute kernel\n%s\n", err_code(err)); return EXIT_FAILURE; } err = clEnqueueReadBuffer( commands, d_partial_sums, CL_TRUE, 0, sizeof(float) * nwork_groups, h_psum, 0, NULL, NULL); if (err != CL_SUCCESS) { printf("Error: Failed to read buffer\n%s\n", err_code(err)); return EXIT_FAILURE; } // complete the sum and compute the final integral value on the host pi_res = 0.0f; for (unsigned int i = 0; i < nwork_groups; i++) { pi_res += h_psum[i]; } pi_res *= step_size; rtime = wtime() - rtime; printf("\nThe calculation ran in %lf seconds\n", rtime); printf(" pi = %f for %d steps\n", pi_res, nsteps); // clean up clReleaseMemObject(d_partial_sums); clReleaseProgram(program); clReleaseKernel(kernel_pi); clReleaseCommandQueue(commands); clReleaseContext(context); free(kernelsource); free(h_psum); }
int main(int argc, char** argv) { ocd_init(&argc, &argv, NULL); ocd_initCL(); cl_int err; size_t global_size; size_t local_size; cl_program program; cl_kernel kernel_compute_flux; cl_kernel kernel_compute_flux_contributions; cl_kernel kernel_compute_step_factor; cl_kernel kernel_time_step; cl_kernel kernel_initialize_variables; cl_mem ff_variable; cl_mem ff_fc_momentum_x; cl_mem ff_fc_momentum_y; cl_mem ff_fc_momentum_z; cl_mem ff_fc_density_energy; if (argc < 2) { printf("Usage ./cfd <data input file>\n"); return 0; } const char* data_file_name = argv[1]; // set far field conditions and load them into constant memory on the gpu { float h_ff_variable[NVAR]; const float angle_of_attack = (float)(3.1415926535897931 / 180.0) * (float)(deg_angle_of_attack); h_ff_variable[VAR_DENSITY] = (float)(1.4); float ff_pressure = (float)(1.0); float ff_speed_of_sound = sqrt(GAMMA*ff_pressure / h_ff_variable[VAR_DENSITY]); float ff_speed = (float)(ff_mach)*ff_speed_of_sound; float3 ff_velocity; ff_velocity.x = ff_speed*(float)(cos((float)angle_of_attack)); ff_velocity.y = ff_speed*(float)(sin((float)angle_of_attack)); ff_velocity.z = 0.0; h_ff_variable[VAR_MOMENTUM+0] = h_ff_variable[VAR_DENSITY] * ff_velocity.x; h_ff_variable[VAR_MOMENTUM+1] = h_ff_variable[VAR_DENSITY] * ff_velocity.y; h_ff_variable[VAR_MOMENTUM+2] = h_ff_variable[VAR_DENSITY] * ff_velocity.z; h_ff_variable[VAR_DENSITY_ENERGY] = h_ff_variable[VAR_DENSITY]*((float)(0.5)*(ff_speed*ff_speed)) + (ff_pressure / (float)(GAMMA-1.0)); float3 h_ff_momentum; h_ff_momentum.x = *(h_ff_variable+VAR_MOMENTUM+0); h_ff_momentum.y = *(h_ff_variable+VAR_MOMENTUM+1); h_ff_momentum.z = *(h_ff_variable+VAR_MOMENTUM+2); float3 h_ff_fc_momentum_x; float3 h_ff_fc_momentum_y; float3 h_ff_fc_momentum_z; float3 h_ff_fc_density_energy; compute_flux_contribution(&h_ff_variable[VAR_DENSITY], &h_ff_momentum, &h_ff_variable[VAR_DENSITY_ENERGY], ff_pressure, &ff_velocity, &h_ff_fc_momentum_x, &h_ff_fc_momentum_y, &h_ff_fc_momentum_z, &h_ff_fc_density_energy); // copy far field conditions to the gpu ff_variable = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * NVAR, h_ff_variable, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_x = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_x, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_y = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_y, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_momentum_z = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_momentum_z, &err); CHKERR(err, "Unable to allocate ff data"); ff_fc_density_energy = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float3), &h_ff_fc_density_energy, &err); CHKERR(err, "Unable to allocate ff data"); } int nel; int nelr; // read in domain geometry cl_mem areas; cl_mem elements_surrounding_elements; cl_mem normals; { std::ifstream file(data_file_name); file >> nel; nelr = block_length*((nel / block_length )+ std::min(1, nel % block_length)); float* h_areas = new float[nelr]; int* h_elements_surrounding_elements = new int[nelr*NNB]; float* h_normals = new float[nelr*NDIM*NNB]; // read in data for(int i = 0; i < nel; i++) { file >> h_areas[i]; for(int j = 0; j < NNB; j++) { file >> h_elements_surrounding_elements[i + j*nelr]; if(h_elements_surrounding_elements[i+j*nelr] < 0) h_elements_surrounding_elements[i+j*nelr] = -1; h_elements_surrounding_elements[i + j*nelr]--; //it's coming in with Fortran numbering for(int k = 0; k < NDIM; k++) { file >> h_normals[i + (j + k*NNB)*nelr]; h_normals[i + (j + k*NNB)*nelr] = -h_normals[i + (j + k*NNB)*nelr]; } } } // fill in remaining data int last = nel-1; for(int i = nel; i < nelr; i++) { h_areas[i] = h_areas[last]; for(int j = 0; j < NNB; j++) { // duplicate the last element h_elements_surrounding_elements[i + j*nelr] = h_elements_surrounding_elements[last + j*nelr]; for(int k = 0; k < NDIM; k++) h_normals[last + (j + k*NNB)*nelr] = h_normals[last + (j + k*NNB)*nelr]; } } areas = alloc<float>(context, nelr); upload<float>(commands, areas, h_areas, nelr); elements_surrounding_elements = alloc<int>(context, nelr*NNB); upload<int>(commands, elements_surrounding_elements, h_elements_surrounding_elements, nelr*NNB); normals = alloc<float>(context, nelr*NDIM*NNB); upload<float>(commands, normals, h_normals, nelr*NDIM*NNB); delete[] h_areas; delete[] h_elements_surrounding_elements; delete[] h_normals; } // Get program source. long kernelSize = getKernelSize(); char* kernelSource = new char[kernelSize]; getKernelSource(kernelSource, kernelSize); // Create the compute program from the source buffer program = clCreateProgramWithSource(context, 1, (const char **) &kernelSource, NULL, &err); CHKERR(err, "Failed to create a compute program!"); // Build the program executable err = clBuildProgram(program, 0, NULL, NULL, NULL, NULL); if (err == CL_BUILD_PROGRAM_FAILURE) { char *log; size_t logLen; err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, 0, NULL, &logLen); log = (char *) malloc(sizeof(char)*logLen); err = clGetProgramBuildInfo(program, device_id, CL_PROGRAM_BUILD_LOG, logLen, (void *) log, NULL); fprintf(stderr, "CL Error %d: Failed to build program! Log:\n%s", err, log); free(log); exit(1); } CHKERR(err, "Failed to build program!"); delete[] kernelSource; // Create the compute kernel in the program we wish to run kernel_compute_flux = clCreateKernel(program, "compute_flux", &err); CHKERR(err, "Failed to create a compute kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_flux_contributions = clCreateKernel(program, "compute_flux_contributions", &err); CHKERR(err, "Failed to create a compute_flux_contributions kernel!"); // Create the reduce kernel in the program we wish to run kernel_compute_step_factor = clCreateKernel(program, "compute_step_factor", &err); CHKERR(err, "Failed to create a compute_step_factor kernel!"); // Create the reduce kernel in the program we wish to run kernel_time_step = clCreateKernel(program, "time_step", &err); CHKERR(err, "Failed to create a time_step kernel!"); // Create the reduce kernel in the program we wish to run kernel_initialize_variables = clCreateKernel(program, "initialize_variables", &err); CHKERR(err, "Failed to create a initialize_variables kernel!"); // Create arrays and set initial conditions cl_mem variables = alloc<cl_float>(context, nelr*NVAR); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device //err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); local_size = 1;//std::min(local_size, (size_t)nelr); global_size = nelr; err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); err = clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 0"); cl_mem old_variables = alloc<float>(context, nelr*NVAR); cl_mem fluxes = alloc<float>(context, nelr*NVAR); cl_mem step_factors = alloc<float>(context, nelr); clFinish(commands); cl_mem fc_momentum_x = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_y = alloc<float>(context, nelr*NDIM); cl_mem fc_momentum_z = alloc<float>(context, nelr*NDIM); cl_mem fc_density_energy = alloc<float>(context, nelr*NDIM); clFinish(commands); // make sure all memory is floatly allocated before we start timing err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&old_variables); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_initialize_variables, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_initialize_variables work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 1"); err = 0; err = clSetKernelArg(kernel_initialize_variables, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_initialize_variables, 1, sizeof(cl_mem),&fluxes); err |= clSetKernelArg(kernel_initialize_variables, 2, sizeof(cl_mem),&ff_variable); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_initialize_variables, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Init Kernels", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_initialize_variables]! 2"); std::cout << "About to memcopy" << std::endl; err = clReleaseMemObject(step_factors); float temp[nelr]; for(int i = 0; i < nelr; i++) temp[i] = 0; step_factors = clCreateBuffer(context, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(float) * nelr, temp, &err); CHKERR(err, "Unable to memset step_factors"); // make sure CUDA isn't still doing something before we start timing clFinish(commands); // these need to be computed the first time in order to compute time step std::cout << "Starting..." << std::endl; // Begin iterations for(int i = 0; i < iterations; i++) { copy<float>(commands, old_variables, variables, nelr*NVAR); // for the first iteration we compute the time step err = 0; err = clSetKernelArg(kernel_compute_step_factor, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_step_factor, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_step_factor, 2, sizeof(cl_mem), &areas); err |= clSetKernelArg(kernel_compute_step_factor, 3, sizeof(cl_mem), &step_factors); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_step_factor, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_step_factor work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_step_factor, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Step Factor Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel[kernel_compute_step_factor]!"); for(int j = 0; j < RK; j++) { err = 0; err = clSetKernelArg(kernel_compute_flux_contributions, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux_contributions, 1, sizeof(cl_mem),&variables); err |= clSetKernelArg(kernel_compute_flux_contributions, 2, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux_contributions, 3, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux_contributions, 4, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux_contributions, 5, sizeof(cl_mem), &fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux_contributions, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux_contributions work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux_contributions, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Contribution Kernel", ocdTempTimer) //compute_flux_contributions(nelr, variables, fc_momentum_x, fc_momentum_y, fc_momentum_z, fc_density_energy); END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux_contributions]!"); err = 0; err = clSetKernelArg(kernel_compute_flux, 0, sizeof(int), &nelr); err |= clSetKernelArg(kernel_compute_flux, 1, sizeof(cl_mem), &elements_surrounding_elements); err |= clSetKernelArg(kernel_compute_flux, 2, sizeof(cl_mem), &normals); err |= clSetKernelArg(kernel_compute_flux, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_compute_flux, 4, sizeof(cl_mem), &fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 5, sizeof(cl_mem), &fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 6, sizeof(cl_mem), &fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 7, sizeof(cl_mem), &fc_density_energy); err |= clSetKernelArg(kernel_compute_flux, 8, sizeof(cl_mem), &fluxes); err |= clSetKernelArg(kernel_compute_flux, 9, sizeof(cl_mem), &ff_variable); err |= clSetKernelArg(kernel_compute_flux, 10, sizeof(cl_mem), &ff_fc_momentum_x); err |= clSetKernelArg(kernel_compute_flux, 11, sizeof(cl_mem), &ff_fc_momentum_y); err |= clSetKernelArg(kernel_compute_flux, 12, sizeof(cl_mem), &ff_fc_momentum_z); err |= clSetKernelArg(kernel_compute_flux, 13, sizeof(cl_mem), &ff_fc_density_energy); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_compute_flux, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_compute_flux work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_compute_flux, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Flux Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_compute_flux]!"); err = 0; err = clSetKernelArg(kernel_time_step, 0, sizeof(int), &j); err |= clSetKernelArg(kernel_time_step, 1, sizeof(int), &nelr); err |= clSetKernelArg(kernel_time_step, 2, sizeof(cl_mem), &old_variables); err |= clSetKernelArg(kernel_time_step, 3, sizeof(cl_mem), &variables); err |= clSetKernelArg(kernel_time_step, 4, sizeof(cl_mem), &step_factors); err |= clSetKernelArg(kernel_time_step, 5, sizeof(cl_mem), &fluxes); CHKERR(err, "Failed to set kernel arguments!"); // Get the maximum work group size for executing the kernel on the device err = clGetKernelWorkGroupInfo(kernel_time_step, device_id, CL_KERNEL_WORK_GROUP_SIZE, sizeof(size_t), (void *) &local_size, NULL); CHKERR(err, "Failed to retrieve kernel_time_step work group info!"); err = clEnqueueNDRangeKernel(commands, kernel_time_step, 1, NULL, &global_size, NULL, 0, NULL, &ocdTempEvent); clFinish(commands); START_TIMER(ocdTempEvent, OCD_TIMER_KERNEL, "CFD Time Step Kernel", ocdTempTimer) END_TIMER(ocdTempTimer) CHKERR(err, "Failed to execute kernel [kernel_time_step]!"); } } clFinish(commands); std::cout << "Finished" << std::endl; std::cout << "Saving solution..." << std::endl; dump(commands, variables, nel, nelr); std::cout << "Saved solution..." << std::endl; std::cout << "Cleaning up..." << std::endl; clReleaseProgram(program); clReleaseKernel(kernel_compute_flux); clReleaseKernel(kernel_compute_flux_contributions); clReleaseKernel(kernel_compute_step_factor); clReleaseKernel(kernel_time_step); clReleaseKernel(kernel_initialize_variables); clReleaseCommandQueue(commands); clReleaseContext(context); dealloc<float>(areas); dealloc<int>(elements_surrounding_elements); dealloc<float>(normals); dealloc<float>(variables); dealloc<float>(old_variables); dealloc<float>(fluxes); dealloc<float>(step_factors); dealloc<float>(fc_momentum_x); dealloc<float>(fc_momentum_y); dealloc<float>(fc_momentum_z); dealloc<float>(fc_density_energy); std::cout << "Done..." << std::endl; ocd_finalize(); return 0; }