Esempio n. 1
0
/** Constructor of class owOpenCLSolver
 *
 *  @param position_cpp
 *  initial position buffer
 *  @param velocity_cpp
 *  initial velocity buffer
 *  @param config
 *  Contain information about simulating configuration
 *  @param elasticConnectionData_cpp
 *  buffer with info about elastic connections
 *  @param membraneData_cpp
 *  buffer with info about membranes
 *  @param particleMembranesList_cpp
 *  buffer with info about sets of membranes in which particular particle is including
 */
owOpenCLSolver::owOpenCLSolver(const float * position_cpp, const float * velocity_cpp, owConfigProperty * config, const float * elasticConnectionsData_cpp, const int * membraneData_cpp, const int * particleMembranesList_cpp)
{
	try{
		initializeOpenCL(config);
		// Create OpenCL buffers
		initializeBuffers(position_cpp, velocity_cpp, config, elasticConnectionsData_cpp, membraneData_cpp, particleMembranesList_cpp);
		// Create OpenCL kernels
		create_ocl_kernel("clearBuffers", clearBuffers);
		create_ocl_kernel("findNeighbors", findNeighbors);
		create_ocl_kernel("hashParticles", hashParticles);
		create_ocl_kernel("indexx", indexx);
		create_ocl_kernel("sortPostPass", sortPostPass);
		// Additional PCISPH-related kernels
		create_ocl_kernel("pcisph_computeForcesAndInitPressure", pcisph_computeForcesAndInitPressure);
		create_ocl_kernel("pcisph_integrate", pcisph_integrate);
		create_ocl_kernel("pcisph_predictPositions", pcisph_predictPositions);
		create_ocl_kernel("pcisph_predictDensity", pcisph_predictDensity);
		create_ocl_kernel("pcisph_correctPressure", pcisph_correctPressure);
		create_ocl_kernel("pcisph_computePressureForceAcceleration", pcisph_computePressureForceAcceleration);
		create_ocl_kernel("pcisph_computeDensity", pcisph_computeDensity);
		create_ocl_kernel("pcisph_computeElasticForces", pcisph_computeElasticForces);
		// membrane handling kernels
		create_ocl_kernel("clearMembraneBuffers",clearMembraneBuffers);
		create_ocl_kernel("computeInteractionWithMembranes",computeInteractionWithMembranes);
		create_ocl_kernel("computeInteractionWithMembranes_finalize",computeInteractionWithMembranes_finalize);
	}catch(std::runtime_error & ex){
		destroy();
		throw ex;
	}
}
/*
  IntegrateElementBatchOpenCL - Produces element vectors from input element solution and geometric information via quadrature

  Input Parameters:
+ Ne - The total number of cells, Nchunk * Ncb * Nbc
. Ncb - The number of serial cell batches
. Nbc - The number of cells per batch
. Nbl - The number of concurrent cells blocks per thread block
. coefficients - An array of the solution vector for each cell
. jacobianInverses - An array of the inverse Jacobian for each cell
. jacobianDeterminants - An array of the Jacobian determinant for each cell
. event - A PetscEvent, used to log flops
- debug - A flag for debugging information

  Output Parameter:
. elemVec - An array of the element vectors for each cell
*/
PETSC_EXTERN PetscErrorCode IntegrateElementBatchGPU(PetscInt spatial_dim, PetscInt Ne, PetscInt Ncb, PetscInt Nbc, PetscInt N_bl, const PetscScalar coefficients[],
                                                     const PetscReal jacobianInverses[], const PetscReal jacobianDeterminants[], PetscScalar elemVec[],
                                                     PetscLogEvent event, PetscInt debug, PetscInt pde_op)
{
  const cl_int numQuadraturePoints_0 = 1;

  const cl_int numBasisFunctions_0 = 3;
  const cl_int numBasisComponents_0 = (pde_op == LAPLACIAN) ? 1 : spatial_dim;

  const cl_int dim    = spatial_dim;
  const cl_int N_b    = numBasisFunctions_0;   /* The number of basis functions */
  const cl_int N_comp = numBasisComponents_0;  /* The number of basis function components */
  const cl_int N_bt   = N_b*N_comp;            /* The total number of scalar basis functions */
  const cl_int N_q    = numQuadraturePoints_0; /* The number of quadrature points */
  const cl_int N_bst  = N_bt*N_q;              /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */
  const cl_int N_t    = N_bst*N_bl;            /* The number of threads, N_bst * N_bl */

  char            *program_buffer;
  char            build_buffer[8192];
  cl_build_status status;

  cl_event          ocl_ev;         /* The event for tracking kernel execution */
  cl_ulong          ns_start;       /* Nanoseconds counter on GPU at kernel start */
  cl_ulong          ns_end;         /* Nanoseconds counter on GPU at kernel stop */

  cl_mem            d_coefficients;
  cl_mem            d_jacobianInverses;
  cl_mem            d_jacobianDeterminants;
  cl_mem            d_elemVec;

  OpenCLEnvironment ocl_env;
  cl_program        ocl_prog;
  cl_kernel         ocl_kernel;
  size_t            ocl_source_length;
  size_t            local_work_size[3];
  size_t            global_work_size[3];
  size_t            i;
  unsigned int      x, y, z;
  PetscErrorCode    ierr;
  cl_int            ierr2;


  PetscFunctionBegin;
  ierr = initializeOpenCL(&ocl_env);CHKERRQ(ierr);
  ierr = PetscMalloc(8192 * sizeof(char), &program_buffer);CHKERRQ(ierr);
  ierr = generateOpenCLSource(&program_buffer, 8192, dim, N_bl, pde_op);CHKERRQ(ierr);
  ocl_source_length = strlen(program_buffer);
  ocl_prog = clCreateProgramWithSource(ocl_env.ctx_id, 1, (const char**)&program_buffer, &ocl_source_length, &ierr2);CHKERRQ(ierr2);
  ierr = clBuildProgram(ocl_prog, 0, NULL, NULL, NULL, NULL);
  if (ierr != CL_SUCCESS) {
    clGetProgramBuildInfo(ocl_prog, ocl_env.dev_id, CL_PROGRAM_BUILD_LOG, sizeof(char)*8192, &build_buffer, NULL);
    printf("Build failed! Log:\n %s", build_buffer);
  }
  CHKERRQ(ierr);
  ierr = PetscFree(program_buffer);CHKERRQ(ierr);

  ocl_kernel = clCreateKernel(ocl_prog, "integrateElementQuadrature", &ierr);CHKERRQ(ierr);

  if (Nbc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, Nbc, N_comp);
  if (!Ne) {
    PetscStageLog     stageLog;
    PetscEventPerfLog eventLog = NULL;
    PetscInt          stage;

    ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
    ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
    ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
    /* Log performance info */
    eventLog->eventInfo[event].count++;
    eventLog->eventInfo[event].time  += 0.0;
    eventLog->eventInfo[event].flops += 0;
    PetscFunctionReturn(0);
  }

  /* Create buffers on the device and send data over */
  d_coefficients         = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*N_bt    * sizeof(PetscReal), (void*)coefficients,         &ierr);CHKERRQ(ierr);
  d_jacobianInverses     = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * sizeof(PetscReal), (void*)jacobianInverses,     &ierr);CHKERRQ(ierr);
  d_jacobianDeterminants = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne         * sizeof(PetscReal), (void*)jacobianDeterminants, &ierr);CHKERRQ(ierr);
  d_elemVec              = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE,                        Ne*N_bt    * sizeof(PetscReal), NULL,                        &ierr);CHKERRQ(ierr);

  /* Work size preparations */
  ierr = calculateGridOpenCL(Ne, Ncb*Nbc, &x, &y, &z);CHKERRQ(ierr);
  local_work_size[0] = Nbc*N_comp;
  local_work_size[1] = 1;
  local_work_size[2] = 1;
  global_work_size[0] = x * local_work_size[0];
  global_work_size[1] = y * local_work_size[1];
  global_work_size[2] = z * local_work_size[2];

  /* if (debug) { */
  ierr = PetscPrintf(PETSC_COMM_SELF, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n",
                     x, y, z,
                     local_work_size[0], local_work_size[1], local_work_size[2], Ncb);CHKERRQ(ierr);
  ierr = PetscPrintf(PETSC_COMM_SELF, " N_t: %d, N_cb: %d\n", N_t, Ncb);
  /* } */

  /* Kernel launch */
  /* integrateElementQuadrature<<<grid, block>>>(Ncb, d_coefficients, d_jacobianInverses, d_jacobianDeterminants, d_elemVec); */
  ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*)&Ncb);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*)&d_coefficients);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*)&d_jacobianInverses);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*)&d_jacobianDeterminants);CHKERRQ(ierr);
  ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*)&d_elemVec);CHKERRQ(ierr);

  ierr = clEnqueueNDRangeKernel(ocl_env.queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr);

  /* Read data back from device */
  ierr = clEnqueueReadBuffer(ocl_env.queue_id, d_elemVec, CL_TRUE, 0, Ne*N_bt * sizeof(PetscReal), elemVec, 0, NULL, NULL);CHKERRQ(ierr);

  {
    PetscStageLog     stageLog;
    PetscEventPerfLog eventLog = NULL;
    PetscInt          stage;

    ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr);
    ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr);
    ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr);
    /* Log performance info */
    ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr);
    ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END,   sizeof(cl_ulong), &ns_end,   NULL);CHKERRQ(ierr);
    eventLog->eventInfo[event].count++;
    eventLog->eventInfo[event].time  += (ns_end - ns_start)*1.0e-9;
    eventLog->eventInfo[event].flops += (((2+(2+2*dim)*dim)*N_comp*N_b+(2+2)*dim*N_comp)*N_q + (2+2*dim)*dim*N_q*N_comp*N_b)*Ne;
  }

  /* We are done, clean up */
  ierr = clReleaseMemObject(d_coefficients);CHKERRQ(ierr);
  ierr = clReleaseMemObject(d_jacobianInverses);CHKERRQ(ierr);
  ierr = clReleaseMemObject(d_jacobianDeterminants);CHKERRQ(ierr);
  ierr = clReleaseMemObject(d_elemVec);CHKERRQ(ierr);
  ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr);
  ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr);
  ierr = destroyOpenCL(&ocl_env);CHKERRQ(ierr);
  PetscFunctionReturn(0);
}