/** Constructor of class owOpenCLSolver * * @param position_cpp * initial position buffer * @param velocity_cpp * initial velocity buffer * @param config * Contain information about simulating configuration * @param elasticConnectionData_cpp * buffer with info about elastic connections * @param membraneData_cpp * buffer with info about membranes * @param particleMembranesList_cpp * buffer with info about sets of membranes in which particular particle is including */ owOpenCLSolver::owOpenCLSolver(const float * position_cpp, const float * velocity_cpp, owConfigProperty * config, const float * elasticConnectionsData_cpp, const int * membraneData_cpp, const int * particleMembranesList_cpp) { try{ initializeOpenCL(config); // Create OpenCL buffers initializeBuffers(position_cpp, velocity_cpp, config, elasticConnectionsData_cpp, membraneData_cpp, particleMembranesList_cpp); // Create OpenCL kernels create_ocl_kernel("clearBuffers", clearBuffers); create_ocl_kernel("findNeighbors", findNeighbors); create_ocl_kernel("hashParticles", hashParticles); create_ocl_kernel("indexx", indexx); create_ocl_kernel("sortPostPass", sortPostPass); // Additional PCISPH-related kernels create_ocl_kernel("pcisph_computeForcesAndInitPressure", pcisph_computeForcesAndInitPressure); create_ocl_kernel("pcisph_integrate", pcisph_integrate); create_ocl_kernel("pcisph_predictPositions", pcisph_predictPositions); create_ocl_kernel("pcisph_predictDensity", pcisph_predictDensity); create_ocl_kernel("pcisph_correctPressure", pcisph_correctPressure); create_ocl_kernel("pcisph_computePressureForceAcceleration", pcisph_computePressureForceAcceleration); create_ocl_kernel("pcisph_computeDensity", pcisph_computeDensity); create_ocl_kernel("pcisph_computeElasticForces", pcisph_computeElasticForces); // membrane handling kernels create_ocl_kernel("clearMembraneBuffers",clearMembraneBuffers); create_ocl_kernel("computeInteractionWithMembranes",computeInteractionWithMembranes); create_ocl_kernel("computeInteractionWithMembranes_finalize",computeInteractionWithMembranes_finalize); }catch(std::runtime_error & ex){ destroy(); throw ex; } }
/* IntegrateElementBatchOpenCL - Produces element vectors from input element solution and geometric information via quadrature Input Parameters: + Ne - The total number of cells, Nchunk * Ncb * Nbc . Ncb - The number of serial cell batches . Nbc - The number of cells per batch . Nbl - The number of concurrent cells blocks per thread block . coefficients - An array of the solution vector for each cell . jacobianInverses - An array of the inverse Jacobian for each cell . jacobianDeterminants - An array of the Jacobian determinant for each cell . event - A PetscEvent, used to log flops - debug - A flag for debugging information Output Parameter: . elemVec - An array of the element vectors for each cell */ PETSC_EXTERN PetscErrorCode IntegrateElementBatchGPU(PetscInt spatial_dim, PetscInt Ne, PetscInt Ncb, PetscInt Nbc, PetscInt N_bl, const PetscScalar coefficients[], const PetscReal jacobianInverses[], const PetscReal jacobianDeterminants[], PetscScalar elemVec[], PetscLogEvent event, PetscInt debug, PetscInt pde_op) { const cl_int numQuadraturePoints_0 = 1; const cl_int numBasisFunctions_0 = 3; const cl_int numBasisComponents_0 = (pde_op == LAPLACIAN) ? 1 : spatial_dim; const cl_int dim = spatial_dim; const cl_int N_b = numBasisFunctions_0; /* The number of basis functions */ const cl_int N_comp = numBasisComponents_0; /* The number of basis function components */ const cl_int N_bt = N_b*N_comp; /* The total number of scalar basis functions */ const cl_int N_q = numQuadraturePoints_0; /* The number of quadrature points */ const cl_int N_bst = N_bt*N_q; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */ const cl_int N_t = N_bst*N_bl; /* The number of threads, N_bst * N_bl */ char *program_buffer; char build_buffer[8192]; cl_build_status status; cl_event ocl_ev; /* The event for tracking kernel execution */ cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */ cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */ cl_mem d_coefficients; cl_mem d_jacobianInverses; cl_mem d_jacobianDeterminants; cl_mem d_elemVec; OpenCLEnvironment ocl_env; cl_program ocl_prog; cl_kernel ocl_kernel; size_t ocl_source_length; size_t local_work_size[3]; size_t global_work_size[3]; size_t i; unsigned int x, y, z; PetscErrorCode ierr; cl_int ierr2; PetscFunctionBegin; ierr = initializeOpenCL(&ocl_env);CHKERRQ(ierr); ierr = PetscMalloc(8192 * sizeof(char), &program_buffer);CHKERRQ(ierr); ierr = generateOpenCLSource(&program_buffer, 8192, dim, N_bl, pde_op);CHKERRQ(ierr); ocl_source_length = strlen(program_buffer); ocl_prog = clCreateProgramWithSource(ocl_env.ctx_id, 1, (const char**)&program_buffer, &ocl_source_length, &ierr2);CHKERRQ(ierr2); ierr = clBuildProgram(ocl_prog, 0, NULL, NULL, NULL, NULL); if (ierr != CL_SUCCESS) { clGetProgramBuildInfo(ocl_prog, ocl_env.dev_id, CL_PROGRAM_BUILD_LOG, sizeof(char)*8192, &build_buffer, NULL); printf("Build failed! Log:\n %s", build_buffer); } CHKERRQ(ierr); ierr = PetscFree(program_buffer);CHKERRQ(ierr); ocl_kernel = clCreateKernel(ocl_prog, "integrateElementQuadrature", &ierr);CHKERRQ(ierr); if (Nbc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, Nbc, N_comp); if (!Ne) { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; PetscInt stage; ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr); /* Log performance info */ eventLog->eventInfo[event].count++; eventLog->eventInfo[event].time += 0.0; eventLog->eventInfo[event].flops += 0; PetscFunctionReturn(0); } /* Create buffers on the device and send data over */ d_coefficients = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*N_bt * sizeof(PetscReal), (void*)coefficients, &ierr);CHKERRQ(ierr); d_jacobianInverses = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * sizeof(PetscReal), (void*)jacobianInverses, &ierr);CHKERRQ(ierr); d_jacobianDeterminants = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne * sizeof(PetscReal), (void*)jacobianDeterminants, &ierr);CHKERRQ(ierr); d_elemVec = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE, Ne*N_bt * sizeof(PetscReal), NULL, &ierr);CHKERRQ(ierr); /* Work size preparations */ ierr = calculateGridOpenCL(Ne, Ncb*Nbc, &x, &y, &z);CHKERRQ(ierr); local_work_size[0] = Nbc*N_comp; local_work_size[1] = 1; local_work_size[2] = 1; global_work_size[0] = x * local_work_size[0]; global_work_size[1] = y * local_work_size[1]; global_work_size[2] = z * local_work_size[2]; /* if (debug) { */ ierr = PetscPrintf(PETSC_COMM_SELF, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], Ncb);CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_SELF, " N_t: %d, N_cb: %d\n", N_t, Ncb); /* } */ /* Kernel launch */ /* integrateElementQuadrature<<<grid, block>>>(Ncb, d_coefficients, d_jacobianInverses, d_jacobianDeterminants, d_elemVec); */ ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*)&Ncb);CHKERRQ(ierr); ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*)&d_coefficients);CHKERRQ(ierr); ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*)&d_jacobianInverses);CHKERRQ(ierr); ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*)&d_jacobianDeterminants);CHKERRQ(ierr); ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*)&d_elemVec);CHKERRQ(ierr); ierr = clEnqueueNDRangeKernel(ocl_env.queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr); /* Read data back from device */ ierr = clEnqueueReadBuffer(ocl_env.queue_id, d_elemVec, CL_TRUE, 0, Ne*N_bt * sizeof(PetscReal), elemVec, 0, NULL, NULL);CHKERRQ(ierr); { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; PetscInt stage; ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr); /* Log performance info */ ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr); ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);CHKERRQ(ierr); eventLog->eventInfo[event].count++; eventLog->eventInfo[event].time += (ns_end - ns_start)*1.0e-9; eventLog->eventInfo[event].flops += (((2+(2+2*dim)*dim)*N_comp*N_b+(2+2)*dim*N_comp)*N_q + (2+2*dim)*dim*N_q*N_comp*N_b)*Ne; } /* We are done, clean up */ ierr = clReleaseMemObject(d_coefficients);CHKERRQ(ierr); ierr = clReleaseMemObject(d_jacobianInverses);CHKERRQ(ierr); ierr = clReleaseMemObject(d_jacobianDeterminants);CHKERRQ(ierr); ierr = clReleaseMemObject(d_elemVec);CHKERRQ(ierr); ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr); ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr); ierr = destroyOpenCL(&ocl_env);CHKERRQ(ierr); PetscFunctionReturn(0); }