void print_kernel_invocation(cl_kernel entry) { assert(kernels().count(entry) == 1 && "Kernel not found(2)"); kernelInfo_t *info = &kernels()[entry]; unsigned work_dim = info->work_dim; unsigned *global_work_size = info->global_work_size; unsigned *local_work_size = info->local_work_size; printf("\nSENTINEL %s ", info->name); if (work_dim == 1) { printf("--global_size=%d ", global_work_size[0]); printf("--local_size=%d ", local_work_size[0]); } else if (work_dim == 2) { printf("--global_size=[%d,%d] ", global_work_size[0], global_work_size[1]); printf("--local_size=[%d,%d] ", local_work_size[0], local_work_size[1]); } else if (work_dim == 3) { printf("--global_size=[%d,%d,%d] ", global_work_size[0], global_work_size[1], global_work_size[2]); printf("--local_size=[%d,%d,%d] ", local_work_size[0], local_work_size[1], local_work_size[2]); } for (std::vector<void *>::iterator it = info->args.begin(), end = info->args.end(); it != end; ++it) { unsigned i = std::distance(info->args.begin(), it); int *x = static_cast<int *>(*it); if (x) printf("%d:%d ", i, *x); else printf("%d:- ", i); } printf("\n"); }
std::unique_ptr<typename ElementaryPotentialOperator< BasisFunctionType, KernelType, ResultType>::LocalAssembler> ElementaryPotentialOperator<BasisFunctionType, KernelType, ResultType>:: makeAssembler(const Space<BasisFunctionType> &space, const arma::Mat<CoordinateType> &evaluationPoints, const QuadratureStrategy &quadStrategy, const EvaluationOptions &options) const { // Collect the standard set of data necessary for construction of // assemblers typedef Fiber::RawGridGeometry<CoordinateType> RawGridGeometry; typedef std::vector<const Fiber::Shapeset<BasisFunctionType> *> ShapesetPtrVector; typedef std::vector<std::vector<ResultType>> CoefficientsVector; typedef LocalAssemblerConstructionHelper Helper; shared_ptr<RawGridGeometry> rawGeometry; shared_ptr<GeometryFactory> geometryFactory; shared_ptr<Fiber::OpenClHandler> openClHandler; shared_ptr<ShapesetPtrVector> shapesets; shared_ptr<const Grid> grid = space.grid(); Helper::collectGridData(space, rawGeometry, geometryFactory); Helper::makeOpenClHandler(options.parallelizationOptions().openClOptions(), rawGeometry, openClHandler); Helper::collectShapesets(space, shapesets); // Now create the assembler return quadStrategy.makeAssemblerForPotentialOperators( evaluationPoints, geometryFactory, rawGeometry, shapesets, make_shared_from_ref(kernels()), make_shared_from_ref(trialTransformations()), make_shared_from_ref(integral()), openClHandler, options.parallelizationOptions(), options.verbosityLevel()); }
CLWProgram::CLWProgram(cl_program program) : ReferenceCounter<cl_program, clRetainProgram, clReleaseProgram>(program) { cl_int status = CL_SUCCESS; cl_uint numKernels; status = clCreateKernelsInProgram(*this, 0, nullptr, &numKernels); ThrowIf(numKernels == 0, CL_BUILD_ERROR, "clCreateKernelsInProgram return 0 kernels"); ThrowIf(status != CL_SUCCESS, status, "clCreateKernelsInProgram failed"); std::vector<cl_kernel> kernels(numKernels); status = clCreateKernelsInProgram(*this, numKernels, &kernels[0], nullptr); ThrowIf(status != CL_SUCCESS, status, "clCreateKernelsInProgram failed"); std::for_each(kernels.begin(), kernels.end(), [this](cl_kernel k) { size_t size = 0; cl_int res; res = clGetKernelInfo(k, CL_KERNEL_FUNCTION_NAME, 0, nullptr, &size); ThrowIf(res != CL_SUCCESS, res, "clGetKernelInfo failed"); std::vector<char> temp(size); res = clGetKernelInfo(k, CL_KERNEL_FUNCTION_NAME, size, &temp[0], nullptr); ThrowIf(res != CL_SUCCESS, res, "clGetKernelInfo failed"); std::string funcName(temp.begin(), temp.end()-1); kernels_[funcName] = CLWKernel::Create(k); }); }
cl_int clSetKernelArg(cl_kernel kernel, cl_uint arg_index, size_t arg_size, const void *arg_value) { assert(kernels().count(kernel) == 1 && "Kernel not found(0)"); if (kernels()[kernel].args.size() < arg_index+1) { kernels()[kernel].args.resize(arg_index + 1); } if (arg_value) { kernels()[kernel].args[arg_index] = const_cast<void *>(arg_value); } else { kernels()[kernel].args[arg_index] = NULL; } if (realClSetKernelArg == NULL) realClSetKernelArg = (clSetKernelArg_t)dlsym(RTLD_NEXT,"clSetKernelArg"); assert(realClSetKernelArg != NULL && "clSetKernelArg is null"); return realClSetKernelArg(kernel, arg_index, arg_size, arg_value); }
void print_kernel_invocation(const char *entry) { dim3 gridDim = kernelInfo().gridDim; dim3 blockDim = kernelInfo().blockDim; printf("SENTINEL %s ", kernels()[entry]); if (gridDim.y == 1 && gridDim.z == 1) { printf("--gridDim=%d ", gridDim.x); } else if (gridDim.z == 1) { printf("--gridDim=[%d,%d] ", gridDim.x, gridDim.y); } else { printf("--gridDim=[%d,%d,%d] ", gridDim.x, gridDim.y, gridDim.z); } if (blockDim.y == 1 && blockDim.z == 1) { printf("--blockDim=%d ", blockDim.x); } else if (blockDim.z == 1) { printf("--blockDim=[%d,%d] ", blockDim.x, blockDim.y); } else { printf("--blockDim=[%d,%d,%d] ", blockDim.x, blockDim.y, blockDim.z); } for (std::list<void *>::iterator it = kernelInfo().args.begin(), end = kernelInfo().args.end(); it != end; ++it) { unsigned i = std::distance(kernelInfo().args.begin(), it); printf("%d:%d ", i, *(static_cast<int *>(*it))); } printf("\n"); }
cl_kernel clCreateKernel(cl_program program, const char *kernel_name, cl_int *errcode_ret) { if (realClCreateKernel == NULL) realClCreateKernel = (clCreateKernel_t)dlsym(RTLD_NEXT,"clCreateKernel"); assert(realClCreateKernel != NULL && "clCreateKernel is null"); cl_kernel k = realClCreateKernel(program, kernel_name, errcode_ret); kernels()[k].name = kernel_name; return k; }
cl_int clEnqueueNDRangeKernel( cl_command_queue command_queue, cl_kernel kernel, cl_uint work_dim, const size_t *global_work_offset, const size_t *global_work_size, const size_t *local_work_size, cl_uint num_events_in_wait_list, const cl_event *event_wait_list, cl_event *event) { assert(kernels().count(kernel) == 1 && "Kernel not found(1)"); assert(work_dim <= 3 && "work_dim is too large"); kernels()[kernel].work_dim = work_dim; for (unsigned int i=0; i<work_dim; i++) { kernels()[kernel].global_work_size[i] = global_work_size[i]; kernels()[kernel].local_work_size[i] = local_work_size == NULL ? 0 : local_work_size[i]; } print_kernel_invocation(kernel); if (realClEnqueueNDRangeKernel == NULL) realClEnqueueNDRangeKernel = (clEnqueueNDRangeKernel_t)dlsym(RTLD_NEXT,"clEnqueueNDRangeKernel"); assert(realClEnqueueNDRangeKernel != NULL && "clEnqueueNDRangeKernel is null"); return realClEnqueueNDRangeKernel( command_queue, kernel, work_dim, global_work_offset, global_work_size, local_work_size, num_events_in_wait_list, event_wait_list, event); }
std::auto_ptr<typename ElementaryPotentialOperator< BasisFunctionType, KernelType, ResultType>::Evaluator> ElementaryPotentialOperator<BasisFunctionType, KernelType, ResultType>:: makeEvaluator( const GridFunction<BasisFunctionType, ResultType>& argument, const QuadratureStrategy& quadStrategy, const EvaluationOptions& options) const { // Collect the standard set of data necessary for construction of // evaluators and assemblers typedef Fiber::RawGridGeometry<CoordinateType> RawGridGeometry; typedef std::vector<const Fiber::Shapeset<BasisFunctionType>*> ShapesetPtrVector; typedef std::vector<std::vector<ResultType> > CoefficientsVector; typedef LocalAssemblerConstructionHelper Helper; shared_ptr<RawGridGeometry> rawGeometry; shared_ptr<GeometryFactory> geometryFactory; shared_ptr<Fiber::OpenClHandler> openClHandler; shared_ptr<ShapesetPtrVector> shapesets; const Space<BasisFunctionType>& space = *argument.space(); shared_ptr<const Grid> grid = space.grid(); Helper::collectGridData(space, rawGeometry, geometryFactory); Helper::makeOpenClHandler(options.parallelizationOptions().openClOptions(), rawGeometry, openClHandler); Helper::collectShapesets(space, shapesets); // In addition, get coefficients of argument's expansion in each element const GridView& view = space.gridView(); const int elementCount = view.entityCount(0); shared_ptr<CoefficientsVector> localCoefficients = boost::make_shared<CoefficientsVector>(elementCount); std::auto_ptr<EntityIterator<0> > it = view.entityIterator<0>(); for (int i = 0; i < elementCount; ++i) { const Entity<0>& element = it->entity(); argument.getLocalCoefficients(element, (*localCoefficients)[i]); it->next(); } // Now create the evaluator return quadStrategy.makeEvaluatorForIntegralOperators( geometryFactory, rawGeometry, shapesets, make_shared_from_ref(kernels()), make_shared_from_ref(trialTransformations()), make_shared_from_ref(integral()), localCoefficients, openClHandler, options.parallelizationOptions()); }
void __cudaRegisterFunction(void **fatCubinHandle, const char *hostFun, char *deviceFun, const char *deviceName, int thread_limit, uint3 *tid, uint3 *bid, dim3 *bDim, dim3 *gDim, int *wSize) { kernels()[hostFun] = deviceFun; if (realCudaRegisterFunction == NULL) { realCudaRegisterFunction = (cudaRegisterFunction_t)dlsym(RTLD_NEXT,"__cudaRegisterFunction"); } assert(realCudaRegisterFunction != NULL && "cudaRegisterFunction is null"); realCudaRegisterFunction(fatCubinHandle, hostFun, deviceFun, deviceName, thread_limit, tid, bid, bDim, gDim, wSize); }
std::pair< shared_ptr<typename HypersingularIntegralOperator< BasisFunctionType, KernelType, ResultType>::LocalAssembler>, shared_ptr<typename HypersingularIntegralOperator< BasisFunctionType, KernelType, ResultType>::LocalAssembler> > HypersingularIntegralOperator<BasisFunctionType, KernelType, ResultType>:: reallyMakeAssemblers( const QuadratureStrategy& quadStrategy, const shared_ptr<const GeometryFactory>& testGeometryFactory, const shared_ptr<const GeometryFactory>& trialGeometryFactory, const shared_ptr<const Fiber::RawGridGeometry<CoordinateType> >& testRawGeometry, const shared_ptr<const Fiber::RawGridGeometry<CoordinateType> >& trialRawGeometry, const shared_ptr<const std::vector<const Fiber::Shapeset<BasisFunctionType>*> >& testShapesets, const shared_ptr<const std::vector<const Fiber::Shapeset<BasisFunctionType>*> >& trialShapesets, const shared_ptr<const Fiber::OpenClHandler>& openClHandler, const ParallelizationOptions& parallelizationOptions, VerbosityLevel::Level verbosityLevel, bool cacheSingularIntegrals, bool makeSeparateOffDiagonalAssembler) const { std::pair<shared_ptr<LocalAssembler>, shared_ptr<LocalAssembler> > result; // first element: "standard" assembler // second element: assembler used for admissible (off-diagonal) // H-matrix blocks in "disassembled mode" result.first.reset(quadStrategy.makeAssemblerForIntegralOperators( testGeometryFactory, trialGeometryFactory, testRawGeometry, trialRawGeometry, testShapesets, trialShapesets, make_shared_from_ref(testTransformations()), make_shared_from_ref(kernels()), make_shared_from_ref(trialTransformations()), make_shared_from_ref(integral()), openClHandler, parallelizationOptions, verbosityLevel, cacheSingularIntegrals).release()); if (makeSeparateOffDiagonalAssembler) result.second.reset(quadStrategy.makeAssemblerForIntegralOperators( testGeometryFactory, trialGeometryFactory, testRawGeometry, trialRawGeometry, testShapesets, trialShapesets, make_shared_from_ref(offDiagonalTestTransformations()), make_shared_from_ref(offDiagonalKernels()), make_shared_from_ref(offDiagonalTrialTransformations()), make_shared_from_ref(offDiagonalIntegral()), openClHandler, parallelizationOptions, verbosityLevel, false /*cacheSingularIntegrals*/).release()); else result.second = result.first; return result; }
void FFT::cpx ( double *real , double *imag , int isign ) // Complex array { int i, factors[64] ; if (npts == 1) return ; for (i=0 ; i<n_facs ; i++) factors[i] = all_factors[i] ; kernels ( real , imag , ntot , npts , nspan , isign , n_facs , rwork , rwork+max_factor , rwork+2*max_factor , rwork+3*max_factor , factors ) ; permute ( real , imag , ntot , npts , nspan , abs(isign) , n_facs , n_sq_facs , rwork , rwork+max_factor , iwork , factors , max_factor ) ; }
BOOST_AUTO_TEST_CASE_TEMPLATE(evaluateOnGrid_works_for_points_on_y_axis, ValueType, kernel_types) { typedef Fiber::Laplace3dDoubleLayerPotentialKernelFunctor<ValueType> Functor; typedef Fiber::DefaultCollectionOfKernels<Functor> Kernels; Kernels kernels((Functor())); Fiber::GeometricalData<typename Functor::CoordinateType> testGeomData, trialGeomData; const int worldDim = 3; const int testPointCount = 2, trialPointCount = 3; // Note: points lying on y axis only testGeomData.globals.resize(worldDim, testPointCount); testGeomData.globals.fill(0.); testGeomData.globals(1, 1) = 1.; trialGeomData.globals.resize(worldDim, trialPointCount); trialGeomData.globals.fill(0.); trialGeomData.globals(1, 0) = 2.; trialGeomData.globals(1, 1) = 3.; trialGeomData.globals(1, 2) = 4.; trialGeomData.normals.resize(worldDim, trialPointCount); trialGeomData.normals.fill(0.); trialGeomData.normals(1, 0) = 1.; trialGeomData.normals(1, 1) = 1.; trialGeomData.normals(1, 2) = 1.; Fiber::CollectionOf4dArrays<ValueType> result; kernels.evaluateOnGrid(testGeomData, trialGeomData, result); Fiber::_4dArray<ValueType> expected(1, 1, testPointCount, trialPointCount); for (int trialPoint = 0; trialPoint < trialPointCount; ++trialPoint) for (int testPoint = 0; testPoint < testPointCount; ++testPoint) { typename Functor::CoordinateType diff = std::abs(testGeomData.globals(1, testPoint) - trialGeomData.globals(1, trialPoint)); expected(0, 0, testPoint, trialPoint) = -(1. / (4. * M_PI)) / diff / diff; } BOOST_CHECK(check_arrays_are_close<ValueType>(result[0], expected, 1e-6)); }
void SpicePosition::init() { erract_c("SET", 0, (char*)"IGNORE"); const char * k = "../cfg/kernels.furnsh"; furnsh_c(k); std::vector<std::string> ks; kernels(ks); for (std::vector<std::string>::iterator i = ks.begin(); i != ks.end(); i++) { getbodies(*i); } for (BodyConstantsConstIterator it = m_bodies.begin(); it != m_bodies.end(); it++) { EphemeridesInterval & interval = m_intervals[(*it)->id()]; getinterval(ks, (*it)->id(), interval); if (it == m_bodies.begin()) { m_interval.a = interval.a; m_interval.b = interval.b; } else { m_interval.a = std::max(m_interval.a, interval.a); m_interval.b = std::min(m_interval.b, interval.b); } } m_interval.a += 100; m_interval.b -= 100; char timea[1000], timeb[1000]; timout_c (m_interval.a, "YYYY ::TDB", sizeof(timea), timea); timout_c (m_interval.b, "YYYY ::TDB", sizeof(timeb), timeb); erract_c("SET", 0, (char*)"DEFAULT"); }
shared_ptr<vtkMeshModel> BuildGPModel(SEXP pPCA_,SEXP kernels_, SEXP ncomp_,SEXP nystroem_,SEXP useEmp_, SEXP combine_,SEXP combineEmp_, SEXP isoScale_, SEXP centroid_) { try { bool useEmp = as<bool>(useEmp_); int combine = as<int>(combine_); int combineEmp = as<int>(combineEmp_); double isoScale = as<double>(isoScale_); unsigned int nystroem = as<unsigned int>(nystroem_); unsigned int numberOfComponents = as<unsigned int>(ncomp_); std::list<MatrixValuedKernelType*> mKerns; std::list<GaussianKernel*> gKerns; std::list<MultiscaleKernel*> bsKerns; vtkPoint centroid = SEXP2vtkPoint(centroid_); List kernels(kernels_); shared_ptr<vtkMeshModel> model = pPCA2statismo(pPCA_); MatrixValuedKernelType* mvKernel; // set up the gaussian kernel to be incremented over a list of parameters NumericVector params = kernels[0]; //if params[0] == 0 Gaussian Kernel //else Multiscale kernel if (params.size() == 2) { GaussianKernel* gk = new GaussianKernel(params[0]); gKerns.push_back(gk); mvKernel = new UncorrelatedMatrixValuedKernel<vtkPoint>(gk, model->GetRepresenter()->GetDimensions()); } else { MultiscaleKernel* gk1 = new MultiscaleKernel(params[0],params[2]); bsKerns.push_back(gk1); mvKernel = new UncorrelatedMatrixValuedKernel<vtkPoint>(gk1, model->GetRepresenter()->GetDimensions()); } MatrixValuedKernelType* sumKernel = new ScaledKernel<vtkPoint>(mvKernel, params[1]); //iterate over the remaining kernel parameters for (unsigned int i = 1; i < kernels.size();i++) { params = kernels[i]; GaussianKernel* gkNew = new GaussianKernel(params[0]); MatrixValuedKernelType* mvGk = new UncorrelatedMatrixValuedKernel<vtkPoint>(gkNew, model->GetRepresenter()->GetDimensions()); MatrixValuedKernelType* scaledGk = new ScaledKernel<vtkPoint>(mvGk, params[1]); //keep track of allocated objects gKerns.push_back(gkNew); mKerns.push_back(mvGk); mKerns.push_back(scaledGk); if (combine == 0) sumKernel = new SumKernel<vtkPoint>(sumKernel, scaledGk); else sumKernel = new ProductKernel<vtkPoint>(sumKernel, scaledGk); } if (useEmp) { // get the empiric kernel MatrixValuedKernelType* statModelKernel = new StatisticalModelKernel<vtkPolyData>(model.get()); mKerns.push_back(statModelKernel); // add the empiric kernel on top if (combineEmp == 0) sumKernel = new SumKernel<vtkPoint>(sumKernel, statModelKernel); else sumKernel = new ProductKernel<vtkPoint>(sumKernel, statModelKernel); } if (isoScale > 0) { MatrixValuedKernelType* isoKernel = new IsoKernel(3,isoScale,centroid); mKerns.push_back(isoKernel); sumKernel = new SumKernel<vtkPoint>(sumKernel, isoKernel); } mKerns.push_back(sumKernel); //build new model shared_ptr<ModelBuilderType> modelBuilder(ModelBuilderType::Create(model->GetRepresenter())); shared_ptr<vtkMeshModel> combinedModel(modelBuilder->BuildNewModel(model->DrawMean(), *sumKernel, numberOfComponents,nystroem)); //tidy up for (std::list<MatrixValuedKernelType*>::iterator it = mKerns.begin(); it != mKerns.end(); it++) { if (*it != NULL) { delete *it; } } for (std::list<GaussianKernel*>::iterator it = gKerns.begin(); it != gKerns.end(); it++) { if (*it != NULL) { delete *it; } } for (std::list<MultiscaleKernel*>::iterator it = bsKerns.begin(); it != bsKerns.end(); it++) { if (*it != NULL) { delete *it; } } return combinedModel; } catch (StatisticalModelException& e) { ::Rf_error("Exception occured while building the shape model\n"); ::Rf_error("%s\n", e.what()); //shared_ptr<vtkMeshModel> model(NULL); //return model; } catch (std::exception& e) { ::Rf_error( e.what()); //shared_ptr<vtkMeshModel> model(NULL); //return model; } catch (...) { ::Rf_error("unknown exception"); //shared_ptr<vtkMeshModel> model(NULL); //return model; } }
BOOST_AUTO_TEST_CASE_TEMPLATE(evaluateOnGrid_agrees_with_evaluateAtPointPairs, ValueType, kernel_types) { typedef Fiber::Laplace3dDoubleLayerPotentialKernelFunctor<ValueType> Functor; typedef Fiber::DefaultCollectionOfKernels<Functor> Kernels; Kernels kernels((Functor())); typedef Fiber::GeometricalData<typename Functor::CoordinateType> GeomData; const int worldDim = 3; const int testPointCount = 2, trialPointCount = 3; // Collect data with evaluateOnGrid GeomData testGeomDataOnGrid, trialGeomDataOnGrid; testGeomDataOnGrid.globals.resize(worldDim, testPointCount); testGeomDataOnGrid.globals.fill(0.); testGeomDataOnGrid.globals(0, 1) = 1.; trialGeomDataOnGrid.globals.resize(worldDim, trialPointCount); trialGeomDataOnGrid.globals.fill(1.); trialGeomDataOnGrid.globals(0, 0) = 2.; trialGeomDataOnGrid.globals(0, 1) = 3.; trialGeomDataOnGrid.globals(0, 2) = 4.; trialGeomDataOnGrid.normals.resize(worldDim, trialPointCount); trialGeomDataOnGrid.normals.fill(0.); trialGeomDataOnGrid.normals(1, 0) = 1.; trialGeomDataOnGrid.normals(1, 1) = 1.; trialGeomDataOnGrid.normals(1, 2) = 1.; Fiber::CollectionOf4dArrays<ValueType> resultOnGrid; kernels.evaluateOnGrid(testGeomDataOnGrid, trialGeomDataOnGrid, resultOnGrid); Vector<ValueType> convertedResultOnGrid(testPointCount * trialPointCount); for (int testPoint = 0; testPoint < testPointCount; ++testPoint) for (int trialPoint = 0; trialPoint < trialPointCount; ++trialPoint) convertedResultOnGrid(testPoint + trialPoint * testPointCount) = resultOnGrid[0](0, 0, testPoint, trialPoint); // Collect data with evaluateAtPointPairs GeomData testGeomDataAtPointPairs, trialGeomDataAtPointPairs; testGeomDataAtPointPairs.globals.resize(worldDim, testPointCount * trialPointCount); trialGeomDataAtPointPairs.globals.resize(worldDim, testPointCount * trialPointCount); trialGeomDataAtPointPairs.normals.resize(worldDim, testPointCount * trialPointCount); for (int testPoint = 0; testPoint < testPointCount; ++testPoint) for (int trialPoint = 0; trialPoint < trialPointCount; ++trialPoint) { testGeomDataAtPointPairs.globals.col(testPoint + trialPoint * testPointCount) = testGeomDataOnGrid.globals.col(testPoint); trialGeomDataAtPointPairs.globals.col(testPoint + trialPoint * testPointCount) = trialGeomDataOnGrid.globals.col(trialPoint); trialGeomDataAtPointPairs.normals.col(testPoint + trialPoint * testPointCount) = trialGeomDataOnGrid.normals.col(trialPoint); } Fiber::CollectionOf3dArrays<ValueType> resultAtPointPairs; kernels.evaluateAtPointPairs(testGeomDataAtPointPairs, trialGeomDataAtPointPairs, resultAtPointPairs); Vector<ValueType> convertedResultAtPointPairs(testPointCount * trialPointCount); for (int point = 0; point < testPointCount * trialPointCount; ++point) convertedResultAtPointPairs(point) = resultAtPointPairs[0](0, 0, point); BOOST_CHECK(check_arrays_are_close<ValueType>( convertedResultAtPointPairs, convertedResultOnGrid, 1e-6)); }
/** * @brief Creates an array of objects containing the OpenCL variables of each device * @param trDataBase The training database which will contain the instances and the features * @param selInstances The instances choosen as initial centroids * @param transposedTrDataBase The training database already transposed * @param conf The structure with all configuration parameters * @return A pointer containing the objects */ CLDevice *createDevices(const float *const trDataBase, const int *const selInstances, const float *const transposedTrDataBase, Config *const conf) { /********** Find the OpenCL devices specified in configuration ***********/ // OpenCL variables cl_uint numPlatformsDevices; cl_device_type deviceType; cl_program program; cl_kernel kernel; cl_int status; // Others variables auto allDevices = getAllDevices(); CLDevice *devices = new CLDevice[conf -> nDevices + (conf -> ompThreads > 0)]; for (int dev = 0; dev < conf -> nDevices; ++dev) { bool found = false; for (int allDev = 0; allDev < allDevices.size() && !found; ++allDev) { // Get the specified OpenCL device char dbuff[120]; check(clGetDeviceInfo(allDevices[allDev], CL_DEVICE_NAME, sizeof(dbuff), dbuff, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_NAME); // If the device exists... if (conf -> devices[dev] == dbuff) { devices[dev].device = allDevices[allDev]; devices[dev].deviceName = dbuff; check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_TYPE, sizeof(cl_device_type), &(devices[dev].deviceType), NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_TYPE); /********** Device local memory usage ***********/ long int usedMemory = conf -> nFeatures * sizeof(cl_uchar); // Chromosome of the individual usedMemory += conf -> trNInstances * sizeof(cl_uchar); // Mapping buffer usedMemory += conf -> K * conf -> nFeatures * sizeof(cl_float); // Centroids buffer usedMemory += conf -> trNInstances * sizeof(cl_float); // DistCentroids buffer usedMemory += conf -> K * sizeof(cl_int); // Samples_in_k buffer // Get the maximum local memory size long int maxMemory; check(clGetDeviceInfo(devices[dev].device, CL_DEVICE_LOCAL_MEM_SIZE, sizeof(long int), &maxMemory, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_MAXMEM); // Avoid exceeding the maximum local memory available. 1024 bytes of margin check(usedMemory > maxMemory - 1024, "%s:\n\tMax memory: %ld bytes\n\tAllow memory: %ld bytes\n\tUsed memory: %ld bytes\n", CL_ERROR_DEVICE_LOCALMEM, maxMemory, maxMemory - 1024, usedMemory); /********** Create context ***********/ devices[dev].context = clCreateContext(NULL, 1, &(devices[dev].device), 0, 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_CONTEXT); /********** Create Command queue ***********/ devices[dev].commandQueue = clCreateCommandQueue(devices[dev].context, devices[dev].device, CL_QUEUE_OUT_OF_ORDER_EXEC_MODE_ENABLE | CL_QUEUE_PROFILING_ENABLE, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_DEVICE_QUEUE); /********** Create kernel ***********/ // Open the file containing the kernels std::fstream kernels(conf -> kernelsFileName.c_str(), std::fstream::in); check(!kernels.is_open(), "%s\n", CL_ERROR_FILE_OPEN); // Obtain the size kernels.seekg(0, kernels.end); size_t fSize = kernels.tellg(); kernels.seekg(0, kernels.beg); char *kernelSource = new char[fSize]; kernels.read(kernelSource, fSize); kernels.close(); // Create program program = clCreateProgramWithSource(devices[dev].context, 1, (const char **) &kernelSource, &fSize, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_BUILD); // Build program for the device in the context char buildOptions[196]; sprintf(buildOptions, "-I include -D N_INSTANCES=%d -D N_FEATURES=%d -D N_OBJECTIVES=%d -D K=%d -D MAX_ITER_KMEANS=%d", conf -> trNInstances, conf -> nFeatures, conf -> nObjectives, conf -> K, conf -> maxIterKmeans); if (clBuildProgram(program, 1, &(devices[dev].device), buildOptions, 0, 0) != CL_SUCCESS) { char buffer[4096]; fprintf(stderr, "Error: Could not build the program\n"); check(clGetProgramBuildInfo(program, devices[dev].device, CL_PROGRAM_BUILD_LOG, sizeof(buffer), buffer, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_PROGRAM_ERRORS); check(true, "%s\n", buffer); } // Create kernel const char *kernelName = (devices[dev].deviceType == CL_DEVICE_TYPE_GPU) ? "kmeansGPU" : ""; devices[dev].kernel = clCreateKernel(program, kernelName, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_BUILD); /******* Work-items *******/ devices[dev].computeUnits = atoi(conf -> computeUnits[dev].c_str()); devices[dev].wiLocal = atoi(conf -> wiLocal[dev].c_str()); devices[dev].wiGlobal = devices[dev].computeUnits * devices[dev].wiLocal; /******* Create and write the databases and centroids buffers. Create the subpopulations buffer. Set kernel arguments *******/ // Create buffers devices[dev].objSubpopulations = clCreateBuffer(devices[dev].context, CL_MEM_READ_WRITE, conf -> familySize * sizeof(Individual), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_SUBPOPS); devices[dev].objTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TRDB); devices[dev].objTransposedTrDataBase = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_TTRDB); devices[dev].objSelInstances = clCreateBuffer(devices[dev].context, CL_MEM_READ_ONLY, conf -> K * sizeof(cl_int), 0, &status); check(status != CL_SUCCESS, "%s\n", CL_ERROR_OBJECT_CENTROIDS); // Sets kernel arguments check(clSetKernelArg(devices[dev].kernel, 0, sizeof(cl_mem), (void *)&(devices[dev].objSubpopulations)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT1); check(clSetKernelArg(devices[dev].kernel, 1, sizeof(cl_mem), (void *)&(devices[dev].objSelInstances)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT2); check(clSetKernelArg(devices[dev].kernel, 2, sizeof(cl_mem), (void *)&(devices[dev].objTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT3); check(clSetKernelArg(devices[dev].kernel, 5, sizeof(cl_mem), (void *)&(devices[dev].objTransposedTrDataBase)) != CL_SUCCESS, "%s\n", CL_ERROR_KERNEL_ARGUMENT6); // Write buffers check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), trDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TRDB); check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objSelInstances, CL_FALSE, 0, conf -> K * sizeof(cl_int), selInstances, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_CENTROIDS); check(clEnqueueWriteBuffer(devices[dev].commandQueue, devices[dev].objTransposedTrDataBase, CL_FALSE, 0, conf -> trNInstances * conf -> nFeatures * sizeof(cl_float), transposedTrDataBase, 0, NULL, NULL) != CL_SUCCESS, "%s\n", CL_ERROR_ENQUEUE_TTRDB); // Resources used are released delete[] kernelSource; clReleaseProgram(program); found = true; allDevices.erase(allDevices.begin() + allDev); } } check(!found, "%s\n", CL_ERROR_DEVICE_FOUND); } /********** Add the CPU if has been enabled in configuration ***********/ if (conf -> ompThreads > 0) { devices[conf -> nDevices].deviceType = CL_DEVICE_TYPE_CPU; devices[conf -> nDevices].computeUnits = conf -> ompThreads; ++(conf -> nDevices); } return devices; }