PetscErrorCode PetscLogEventBeginTrace(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4) { PetscStageLog stageLog; PetscEventRegLog eventRegLog; PetscEventPerfLog eventPerfLog = NULL; PetscLogDouble cur_time; PetscMPIInt rank; int stage,err; PetscErrorCode ierr; PetscFunctionBegin; if (!petsc_tracetime) PetscTime(&petsc_tracetime); ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr); ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventRegLog(stageLog, &eventRegLog);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventPerfLog);CHKERRQ(ierr); /* Check for double counting */ eventPerfLog->eventInfo[event].depth++; petsc_tracelevel++; if (eventPerfLog->eventInfo[event].depth > 1) PetscFunctionReturn(0); /* Log performance info */ PetscTime(&cur_time); ierr = PetscFPrintf(PETSC_COMM_SELF,petsc_tracefile, "%s[%d] %g Event begin: %s\n", petsc_tracespace, rank, cur_time-petsc_tracetime, eventRegLog->eventInfo[event].name);CHKERRQ(ierr); ierr = PetscStrncpy(petsc_tracespace, petsc_traceblanks, 2*petsc_tracelevel);CHKERRQ(ierr); petsc_tracespace[2*petsc_tracelevel] = 0; err = fflush(petsc_tracefile); if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fflush() failed on file"); PetscFunctionReturn(0); }
PetscErrorCode PetscLogEventEndTrace(PetscLogEvent event,int t,PetscObject o1,PetscObject o2,PetscObject o3,PetscObject o4) { PetscStageLog stageLog; PetscEventRegLog eventRegLog; PetscEventPerfLog eventPerfLog = NULL; PetscLogDouble cur_time; int stage,err; PetscMPIInt rank; PetscErrorCode ierr; PetscFunctionBegin; petsc_tracelevel--; ierr = MPI_Comm_rank(PETSC_COMM_WORLD, &rank);CHKERRQ(ierr); ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventRegLog(stageLog, &eventRegLog);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventPerfLog);CHKERRQ(ierr); /* Check for double counting */ eventPerfLog->eventInfo[event].depth--; if (eventPerfLog->eventInfo[event].depth > 0) PetscFunctionReturn(0); else if (eventPerfLog->eventInfo[event].depth < 0 || petsc_tracelevel < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event had unbalanced begin/end pairs"); /* Log performance info */ ierr = PetscStrncpy(petsc_tracespace, petsc_traceblanks, 2*petsc_tracelevel);CHKERRQ(ierr); petsc_tracespace[2*petsc_tracelevel] = 0; PetscTime(&cur_time); ierr = PetscFPrintf(PETSC_COMM_SELF,petsc_tracefile, "%s[%d] %g Event end: %s\n", petsc_tracespace, rank, cur_time-petsc_tracetime, eventRegLog->eventInfo[event].name);CHKERRQ(ierr); err = fflush(petsc_tracefile); if (err) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SYS,"fflush() failed on file"); PetscFunctionReturn(0); }
PetscErrorCode PetscLogEventEndDefault(PetscLogEvent event,int t,PetscObject o1,PetscObject o2,PetscObject o3,PetscObject o4) { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; int stage; PetscErrorCode ierr; PetscFunctionBegin; ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog,&stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog,stage,&eventLog);CHKERRQ(ierr); /* Check for double counting */ eventLog->eventInfo[event].depth--; if (eventLog->eventInfo[event].depth > 0) PetscFunctionReturn(0); else if (eventLog->eventInfo[event].depth < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE,"Logging event had unbalanced begin/end pairs"); /* Log performance info */ PetscTimeAdd(&eventLog->eventInfo[event].timeTmp); eventLog->eventInfo[event].time += eventLog->eventInfo[event].timeTmp; eventLog->eventInfo[event].time2 += eventLog->eventInfo[event].timeTmp*eventLog->eventInfo[event].timeTmp; eventLog->eventInfo[event].flopsTmp += petsc_TotalFlops; eventLog->eventInfo[event].flops += eventLog->eventInfo[event].flopsTmp; eventLog->eventInfo[event].flops2 += eventLog->eventInfo[event].flopsTmp*eventLog->eventInfo[event].flopsTmp; eventLog->eventInfo[event].numMessages += petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct; eventLog->eventInfo[event].messageLength += petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len; eventLog->eventInfo[event].numReductions += petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct; PetscFunctionReturn(0); }
PetscErrorCode PetscLogEventEndComplete(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4) { PetscStageLog stageLog; PetscEventRegLog eventRegLog; PetscEventPerfLog eventPerfLog = NULL; Action *tmpAction; PetscLogDouble start, end; PetscLogDouble curTime; int stage; PetscErrorCode ierr; PetscFunctionBegin; /* Dynamically enlarge logging structures */ if (petsc_numActions >= petsc_maxActions) { PetscTime(&start); ierr = PetscMalloc(petsc_maxActions*2 * sizeof(Action), &tmpAction);CHKERRQ(ierr); ierr = PetscMemcpy(tmpAction, petsc_actions, petsc_maxActions * sizeof(Action));CHKERRQ(ierr); ierr = PetscFree(petsc_actions);CHKERRQ(ierr); petsc_actions = tmpAction; petsc_maxActions *= 2; PetscTime(&end); petsc_BaseTime += (end - start); } /* Record the event */ ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventRegLog(stageLog, &eventRegLog);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventPerfLog);CHKERRQ(ierr); PetscTime(&curTime); if (petsc_logActions) { petsc_actions[petsc_numActions].time = curTime - petsc_BaseTime; petsc_actions[petsc_numActions].action = ACTIONEND; petsc_actions[petsc_numActions].event = event; petsc_actions[petsc_numActions].classid = eventRegLog->eventInfo[event].classid; if (o1) petsc_actions[petsc_numActions].id1 = o1->id; else petsc_actions[petsc_numActions].id1 = -1; if (o2) petsc_actions[petsc_numActions].id2 = o2->id; else petsc_actions[petsc_numActions].id2 = -1; if (o3) petsc_actions[petsc_numActions].id3 = o3->id; else petsc_actions[petsc_numActions].id3 = -1; petsc_actions[petsc_numActions].flops = petsc_TotalFlops; ierr = PetscMallocGetCurrentUsage(&petsc_actions[petsc_numActions].mem);CHKERRQ(ierr); ierr = PetscMallocGetMaximumUsage(&petsc_actions[petsc_numActions].maxmem);CHKERRQ(ierr); petsc_numActions++; } /* Check for double counting */ eventPerfLog->eventInfo[event].depth--; if (eventPerfLog->eventInfo[event].depth > 0) PetscFunctionReturn(0); else if (eventPerfLog->eventInfo[event].depth < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event had unbalanced begin/end pairs"); /* Log the performance info */ eventPerfLog->eventInfo[event].count++; eventPerfLog->eventInfo[event].time += curTime; eventPerfLog->eventInfo[event].flops += petsc_TotalFlops; eventPerfLog->eventInfo[event].numMessages += petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct; eventPerfLog->eventInfo[event].messageLength += petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len; eventPerfLog->eventInfo[event].numReductions += petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct; PetscFunctionReturn(0); }
PetscErrorCode PetscLogEventBeginDefault(PetscLogEvent event,int t,PetscObject o1,PetscObject o2,PetscObject o3,PetscObject o4) { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; int stage; PetscErrorCode ierr; PetscFunctionBegin; ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog,&stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog,stage,&eventLog);CHKERRQ(ierr); /* Check for double counting */ eventLog->eventInfo[event].depth++; if (eventLog->eventInfo[event].depth > 1) PetscFunctionReturn(0); /* Log performance info */ eventLog->eventInfo[event].count++; eventLog->eventInfo[event].timeTmp = 0.0; PetscTimeSubtract(&eventLog->eventInfo[event].timeTmp); eventLog->eventInfo[event].flopsTmp = 0.0; eventLog->eventInfo[event].flopsTmp -= petsc_TotalFlops; eventLog->eventInfo[event].numMessages -= petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct; eventLog->eventInfo[event].messageLength -= petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len; eventLog->eventInfo[event].numReductions -= petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct; PetscFunctionReturn(0); }
PetscErrorCode PetscLogEventGetFlops(PetscLogEvent event, PetscLogDouble *flops) { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; int stage; PetscErrorCode ierr; PetscFunctionBegin; ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr); *flops = eventLog->eventInfo[event].flops; PetscFunctionReturn(0); }
PetscErrorCode PetscLogEventGetFlops(PetscLogEvent event,PetscLogDouble *flops) { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; int stage; PetscErrorCode ierr; PetscFunctionBegin; if (!PetscLogPLB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Must use -log_summary or PetscLogDefaultBegin() before calling this routine"); ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog,&stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog,stage,&eventLog);CHKERRQ(ierr); *flops = eventLog->eventInfo[event].flops; PetscFunctionReturn(0); }
/*@C PetscLogEventGetPerfInfo - Return the performance information about the given event in the given stage Input Parameters: + stage - The stage number or PETSC_DETERMINE for the current stage - event - The event number Output Parameters: . info - This structure is filled with the performance information Level: Intermediate .seealso: PetscLogEventGetFlops() @*/ PetscErrorCode PetscLogEventGetPerfInfo(int stage,PetscLogEvent event,PetscEventPerfInfo *info) { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; PetscErrorCode ierr; PetscFunctionBegin; PetscValidPointer(info,3); if (!PetscLogPLB) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_SUP,"Must use -log_summary or PetscLogDefaultBegin() before calling this routine"); ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); if (stage < 0) {ierr = PetscStageLogGetCurrent(stageLog,&stage);CHKERRQ(ierr);} ierr = PetscStageLogGetEventPerfLog(stageLog,stage,&eventLog);CHKERRQ(ierr); *info = eventLog->eventInfo[event]; PetscFunctionReturn(0); }
PetscErrorCode PetscLogEventZeroFlops(PetscLogEvent event) { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; int stage; PetscErrorCode ierr; PetscFunctionBegin; ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr); eventLog->eventInfo[event].flops = 0.0; eventLog->eventInfo[event].flops2 = 0.0; eventLog->eventInfo[event].flopsTmp = 0.0; PetscFunctionReturn(0); }
PetscErrorCode PetscLogEventEndDefault(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4) { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; int stage; PetscErrorCode ierr; PetscFunctionBegin; ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr); /* Check for double counting */ eventLog->eventInfo[event].depth--; if (eventLog->eventInfo[event].depth > 0) PetscFunctionReturn(0); else if (eventLog->eventInfo[event].depth < 0) SETERRQ(PETSC_COMM_SELF,PETSC_ERR_ARG_WRONGSTATE, "Logging event had unbalanced begin/end pairs"); /* Log performance info */ PetscTimeAdd(&eventLog->eventInfo[event].timeTmp); eventLog->eventInfo[event].time += eventLog->eventInfo[event].timeTmp; eventLog->eventInfo[event].time2 += eventLog->eventInfo[event].timeTmp*eventLog->eventInfo[event].timeTmp; #if defined(PETSC_HAVE_CHUD) eventLog->eventInfo[event].flopsTmp += chudGetPMCEventCount(chudCPU1Dev,PMC_1); #elif defined(PETSC_HAVE_PAPI) { long_long values[2]; ierr = PAPI_read(PAPIEventSet,values);CHKERRQ(ierr); eventLog->eventInfo[event].flopsTmp += values[0]; /* printf("fma %g flops %g\n",(double)values[1],(double)values[0]); */ } #else eventLog->eventInfo[event].flopsTmp += petsc_TotalFlops; #endif eventLog->eventInfo[event].flops += eventLog->eventInfo[event].flopsTmp; eventLog->eventInfo[event].flops2 += eventLog->eventInfo[event].flopsTmp*eventLog->eventInfo[event].flopsTmp; eventLog->eventInfo[event].numMessages += petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct; eventLog->eventInfo[event].messageLength += petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len; eventLog->eventInfo[event].numReductions += petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct; PetscFunctionReturn(0); }
PetscErrorCode PetscLogEventBeginDefault(PetscLogEvent event, int t, PetscObject o1, PetscObject o2, PetscObject o3, PetscObject o4) { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; int stage; PetscErrorCode ierr; PetscFunctionBegin; ierr = PetscLogGetStageLog(&stageLog); CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog, &stage); CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog); CHKERRQ(ierr); /* Check for double counting */ eventLog->eventInfo[event].depth++; if (eventLog->eventInfo[event].depth > 1) PetscFunctionReturn(0); /* Log performance info */ eventLog->eventInfo[event].count++; eventLog->eventInfo[event].timeTmp = 0.0; PetscTimeSubtract(&eventLog->eventInfo[event].timeTmp); eventLog->eventInfo[event].flopsTmp = 0.0; #if defined(PETSC_HAVE_PAPI) { long_long values[2]; ierr = PAPI_read(PAPIEventSet,values); CHKERRQ(ierr); eventLog->eventInfo[event].flopsTmp -= values[0]; /* printf("fma %g flops %g\n",(double)values[1],(double)values[0]); */ } #else eventLog->eventInfo[event].flopsTmp -= petsc_TotalFlops; #endif eventLog->eventInfo[event].numMessages -= petsc_irecv_ct + petsc_isend_ct + petsc_recv_ct + petsc_send_ct; eventLog->eventInfo[event].messageLength -= petsc_irecv_len + petsc_isend_len + petsc_recv_len + petsc_send_len; eventLog->eventInfo[event].numReductions -= petsc_allreduce_ct + petsc_gather_ct + petsc_scatter_ct; PetscFunctionReturn(0); }
/* IntegrateElementBatchOpenCL - Produces element vectors from input element solution and geometric information via quadrature Input Parameters: + Ne - The total number of cells, Nchunk * Ncb * Nbc . Ncb - The number of serial cell batches . Nbc - The number of cells per batch . Nbl - The number of concurrent cells blocks per thread block . coefficients - An array of the solution vector for each cell . jacobianInverses - An array of the inverse Jacobian for each cell . jacobianDeterminants - An array of the Jacobian determinant for each cell . event - A PetscEvent, used to log flops - debug - A flag for debugging information Output Parameter: . elemVec - An array of the element vectors for each cell */ PETSC_EXTERN PetscErrorCode IntegrateElementBatchGPU(PetscInt spatial_dim, PetscInt Ne, PetscInt Ncb, PetscInt Nbc, PetscInt N_bl, const PetscScalar coefficients[], const PetscReal jacobianInverses[], const PetscReal jacobianDeterminants[], PetscScalar elemVec[], PetscLogEvent event, PetscInt debug, PetscInt pde_op) { const cl_int numQuadraturePoints_0 = 1; const cl_int numBasisFunctions_0 = 3; const cl_int numBasisComponents_0 = (pde_op == LAPLACIAN) ? 1 : spatial_dim; const cl_int dim = spatial_dim; const cl_int N_b = numBasisFunctions_0; /* The number of basis functions */ const cl_int N_comp = numBasisComponents_0; /* The number of basis function components */ const cl_int N_bt = N_b*N_comp; /* The total number of scalar basis functions */ const cl_int N_q = numQuadraturePoints_0; /* The number of quadrature points */ const cl_int N_bst = N_bt*N_q; /* The block size, LCM(N_bt, N_q), Notice that a block is not process simultaneously */ const cl_int N_t = N_bst*N_bl; /* The number of threads, N_bst * N_bl */ char *program_buffer; char build_buffer[8192]; cl_build_status status; cl_event ocl_ev; /* The event for tracking kernel execution */ cl_ulong ns_start; /* Nanoseconds counter on GPU at kernel start */ cl_ulong ns_end; /* Nanoseconds counter on GPU at kernel stop */ cl_mem d_coefficients; cl_mem d_jacobianInverses; cl_mem d_jacobianDeterminants; cl_mem d_elemVec; OpenCLEnvironment ocl_env; cl_program ocl_prog; cl_kernel ocl_kernel; size_t ocl_source_length; size_t local_work_size[3]; size_t global_work_size[3]; size_t i; unsigned int x, y, z; PetscErrorCode ierr; cl_int ierr2; PetscFunctionBegin; ierr = initializeOpenCL(&ocl_env);CHKERRQ(ierr); ierr = PetscMalloc(8192 * sizeof(char), &program_buffer);CHKERRQ(ierr); ierr = generateOpenCLSource(&program_buffer, 8192, dim, N_bl, pde_op);CHKERRQ(ierr); ocl_source_length = strlen(program_buffer); ocl_prog = clCreateProgramWithSource(ocl_env.ctx_id, 1, (const char**)&program_buffer, &ocl_source_length, &ierr2);CHKERRQ(ierr2); ierr = clBuildProgram(ocl_prog, 0, NULL, NULL, NULL, NULL); if (ierr != CL_SUCCESS) { clGetProgramBuildInfo(ocl_prog, ocl_env.dev_id, CL_PROGRAM_BUILD_LOG, sizeof(char)*8192, &build_buffer, NULL); printf("Build failed! Log:\n %s", build_buffer); } CHKERRQ(ierr); ierr = PetscFree(program_buffer);CHKERRQ(ierr); ocl_kernel = clCreateKernel(ocl_prog, "integrateElementQuadrature", &ierr);CHKERRQ(ierr); if (Nbc*N_comp != N_t) SETERRQ3(PETSC_COMM_SELF, PETSC_ERR_PLIB, "Number of threads %d should be %d * %d", N_t, Nbc, N_comp); if (!Ne) { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; PetscInt stage; ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr); /* Log performance info */ eventLog->eventInfo[event].count++; eventLog->eventInfo[event].time += 0.0; eventLog->eventInfo[event].flops += 0; PetscFunctionReturn(0); } /* Create buffers on the device and send data over */ d_coefficients = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*N_bt * sizeof(PetscReal), (void*)coefficients, &ierr);CHKERRQ(ierr); d_jacobianInverses = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne*dim*dim * sizeof(PetscReal), (void*)jacobianInverses, &ierr);CHKERRQ(ierr); d_jacobianDeterminants = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, Ne * sizeof(PetscReal), (void*)jacobianDeterminants, &ierr);CHKERRQ(ierr); d_elemVec = clCreateBuffer(ocl_env.ctx_id, CL_MEM_READ_WRITE, Ne*N_bt * sizeof(PetscReal), NULL, &ierr);CHKERRQ(ierr); /* Work size preparations */ ierr = calculateGridOpenCL(Ne, Ncb*Nbc, &x, &y, &z);CHKERRQ(ierr); local_work_size[0] = Nbc*N_comp; local_work_size[1] = 1; local_work_size[2] = 1; global_work_size[0] = x * local_work_size[0]; global_work_size[1] = y * local_work_size[1]; global_work_size[2] = z * local_work_size[2]; /* if (debug) { */ ierr = PetscPrintf(PETSC_COMM_SELF, "GPU layout grid(%d,%d,%d) block(%d,%d,%d) with %d batches\n", x, y, z, local_work_size[0], local_work_size[1], local_work_size[2], Ncb);CHKERRQ(ierr); ierr = PetscPrintf(PETSC_COMM_SELF, " N_t: %d, N_cb: %d\n", N_t, Ncb); /* } */ /* Kernel launch */ /* integrateElementQuadrature<<<grid, block>>>(Ncb, d_coefficients, d_jacobianInverses, d_jacobianDeterminants, d_elemVec); */ ierr = clSetKernelArg(ocl_kernel, 0, sizeof(cl_int), (void*)&Ncb);CHKERRQ(ierr); ierr = clSetKernelArg(ocl_kernel, 1, sizeof(cl_mem), (void*)&d_coefficients);CHKERRQ(ierr); ierr = clSetKernelArg(ocl_kernel, 2, sizeof(cl_mem), (void*)&d_jacobianInverses);CHKERRQ(ierr); ierr = clSetKernelArg(ocl_kernel, 3, sizeof(cl_mem), (void*)&d_jacobianDeterminants);CHKERRQ(ierr); ierr = clSetKernelArg(ocl_kernel, 4, sizeof(cl_mem), (void*)&d_elemVec);CHKERRQ(ierr); ierr = clEnqueueNDRangeKernel(ocl_env.queue_id, ocl_kernel, 3, NULL, global_work_size, local_work_size, 0, NULL, &ocl_ev);CHKERRQ(ierr); /* Read data back from device */ ierr = clEnqueueReadBuffer(ocl_env.queue_id, d_elemVec, CL_TRUE, 0, Ne*N_bt * sizeof(PetscReal), elemVec, 0, NULL, NULL);CHKERRQ(ierr); { PetscStageLog stageLog; PetscEventPerfLog eventLog = NULL; PetscInt stage; ierr = PetscLogGetStageLog(&stageLog);CHKERRQ(ierr); ierr = PetscStageLogGetCurrent(stageLog, &stage);CHKERRQ(ierr); ierr = PetscStageLogGetEventPerfLog(stageLog, stage, &eventLog);CHKERRQ(ierr); /* Log performance info */ ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_START, sizeof(cl_ulong), &ns_start, NULL);CHKERRQ(ierr); ierr = clGetEventProfilingInfo(ocl_ev, CL_PROFILING_COMMAND_END, sizeof(cl_ulong), &ns_end, NULL);CHKERRQ(ierr); eventLog->eventInfo[event].count++; eventLog->eventInfo[event].time += (ns_end - ns_start)*1.0e-9; eventLog->eventInfo[event].flops += (((2+(2+2*dim)*dim)*N_comp*N_b+(2+2)*dim*N_comp)*N_q + (2+2*dim)*dim*N_q*N_comp*N_b)*Ne; } /* We are done, clean up */ ierr = clReleaseMemObject(d_coefficients);CHKERRQ(ierr); ierr = clReleaseMemObject(d_jacobianInverses);CHKERRQ(ierr); ierr = clReleaseMemObject(d_jacobianDeterminants);CHKERRQ(ierr); ierr = clReleaseMemObject(d_elemVec);CHKERRQ(ierr); ierr = clReleaseKernel(ocl_kernel);CHKERRQ(ierr); ierr = clReleaseProgram(ocl_prog);CHKERRQ(ierr); ierr = destroyOpenCL(&ocl_env);CHKERRQ(ierr); PetscFunctionReturn(0); }