void GolemMaterialH::GolemPropertiesH() { Real one_on_visc = 1.0 / _fluid_viscosity[_qp]; if (_fe_problem.isTransient()) (*_H_kernel_time)[_qp] = _porosity[_qp] / _Kf; _H_kernel[_qp] = computeKernel(_permeability[_qp], _permeability_type, one_on_visc, _current_elem->dim()); if (_current_elem->dim() < _mesh.dimension()) _H_kernel[_qp].rotate(_rotation_matrix); _H_kernel_grav[_qp] = -_fluid_density[_qp] * _gravity; }
/// Build a kernel-weighted histogram over the masked pixels of the cropped
/// image(s) and normalize it so the bins sum to one. `hval` scales the
/// spatial bandwidth of the weighting kernel around the window center.
void CueContrastKernel::calculateProbability(CSCVector* prob, float hval) {
  const float imgW = (float)(mdp_cvcroppedimg[0]->width);
  const float imgH = (float)(mdp_cvcroppedimg[0]->height);
  // Geometric center of the crop window.
  const float cx = (imgW - 1.0) / 2.0;
  const float cy = (imgH - 1.0) / 2.0;

  float* bins = prob->getData();
  prob->setZero();
  float totalWeight = 0.0;

  unsigned int maskStep = mp_cvmaskimg->ipl->widthStep;
  char* maskPixels = mp_cvmaskimg->ipl->imageData;

  for (int row = 0; row < imgH; row++) {
    for (int col = 0; col < imgW; col++) {
      // Only pixels whose mask value exceeds the threshold contribute.
      unsigned char maskVal = (unsigned char)(maskPixels[row * maskStep + col]);
      if (maskVal <= 100)
        continue;

      // Offset from the center, normalized by the hval-scaled half extents.
      const float dx = ((float)col - cx) / (imgW * 0.5 * hval);
      const float dy = ((float)row - cy) / (imgH * 0.5 * hval);
      const float weight = computeKernel(sqrt(dx * dx + dy * dy));

      // Gather this pixel's value from every data channel.
      for (unsigned int ch = 0; ch < m_dimData; ch++) {
        unsigned char* cropPixels =
            (unsigned char*)(mdp_cvcroppedimg[ch]->ipl->imageData);
        unsigned int cropStep = mdp_cvcroppedimg[ch]->ipl->widthStep;
        mp_pixeldata[ch] = cropPixels[row * cropStep + col];
      }

      const unsigned int bin = this->findIndex(mp_pixeldata);
      bins[bin] += weight;
      totalWeight += weight;
    }
  }

  // Normalization (skip when no pixel contributed any weight).
  if (totalWeight != 0.0) {
    for (int b = 0; b < m_totalnumbins; b++) {
      bins[b] /= totalWeight;
    }
  }
}
//------------------------------------------------------------------------------ int main(int argc, char** argv) { if(argc < 5) { std::cout << "usage: " << argv[0] << " <platform id(0, 1, ...)>" " <device type: default | cpu | gpu | acc>" " <device id(0, 1, ...)>" " <number of double prec. elements>\n"; exit(EXIT_FAILURE); } std::vector<cl::Platform> platforms; std::vector<cl::Device> devices; const int platformID = atoi(argv[1]); cl_device_type deviceType; const std::string kernelName(argv[4]); const std::string dt = std::string(argv[2]); if(dt == "default") deviceType = CL_DEVICE_TYPE_DEFAULT; else if(dt == "cpu") deviceType = CL_DEVICE_TYPE_CPU; else if(dt == "gpu") deviceType = CL_DEVICE_TYPE_GPU; else if(dt == "acc") deviceType = CL_DEVICE_TYPE_ACCELERATOR; else { std::cerr << "ERROR - unrecognized device type " << dt << std::endl; exit(EXIT_FAILURE); } const int deviceID = atoi(argv[3]); const size_t SIZE = atoll(argv[4]); const size_t BYTE_SIZE = SIZE * sizeof(real_t); // init MPI environment MPI_Init(&argc, &argv); int task = -1; MPI_Comm_rank(MPI_COMM_WORLD, &task); try { //OpenCL init cl::Platform::get(&platforms); if(platforms.size() <= platformID) { std::cerr << "Platform id " << platformID << " is not available\n"; exit(EXIT_FAILURE); } platforms[platformID].getDevices(deviceType, &devices); cl::Context context(devices); cl::CommandQueue queue(context, devices[deviceID], CL_QUEUE_PROFILING_ENABLE); std::vector< real_t > data(SIZE, -1); //device buffer #1: holds local data cl::Buffer devData(context, CL_MEM_READ_WRITE | CL_MEM_ALLOC_HOST_PTR //<-- page locked memory | CL_MEM_COPY_HOST_PTR, //<-- copy data from 'data' BYTE_SIZE, const_cast< double* >(&data[0])); //device buffer #2: holds data received from other node cl::Buffer devRecvData(context, CL_MEM_READ_ONLY | CL_MEM_ALLOC_HOST_PTR, BYTE_SIZE); //process data on the GPU(set array elements to local MPI id) const char CLCODE_INIT[] = "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n" "typedef double 
real_t;\n" "__kernel void arrayset(__global real_t* outputArray,\n" " real_t value) {\n" "//get global thread id for dimension 0\n" "const int id = get_global_id(0);\n" "outputArray[id] = value;\n" "}"; cl::Program::Sources initSource(1, std::make_pair(CLCODE_INIT, sizeof(CLCODE_INIT))); cl::Program initProgram(context, initSource); initProgram.build(devices); cl::Kernel initKernel(initProgram, "arrayset"); initKernel.setArg(0, devData); initKernel.setArg(1, real_t(task)); queue.enqueueNDRangeKernel(initKernel, cl::NDRange(0), cl::NDRange(SIZE), cl::NDRange(1)); //perform data exchange: //1) map device buffers to host memory void* sendHostPtr = queue.enqueueMapBuffer(devData, CL_FALSE, CL_MAP_READ, 0, BYTE_SIZE); if(sendHostPtr == 0) throw std::runtime_error("NULL mapped ptr"); void* recvHostPtr = queue.enqueueMapBuffer(devRecvData, CL_FALSE, CL_MAP_WRITE, 0, BYTE_SIZE); if(recvHostPtr == 0) throw std::runtime_error("NULL mapped ptr"); queue.finish(); //2) copy data to from remote process const int tag0to1 = 0x01; const int tag1to0 = 0x10; MPI_Request send_req; MPI_Request recv_req; int source = -1; int dest = -1; if(task == 0 ) { source = 1; dest = 1; } else { source = 0; dest = 0; } MPI_Status status; if(task == 0) { MPI_Isend(sendHostPtr, SIZE, MPI_DOUBLE, dest, tag0to1, MPI_COMM_WORLD, &send_req); MPI_Irecv(recvHostPtr, SIZE, MPI_DOUBLE, source, tag1to0, MPI_COMM_WORLD, &recv_req); } else { MPI_Isend(sendHostPtr, SIZE, MPI_DOUBLE, dest, tag1to0, MPI_COMM_WORLD, &send_req); MPI_Irecv(recvHostPtr, SIZE, MPI_DOUBLE, source, tag0to1, MPI_COMM_WORLD, &recv_req); } //3) as soon as data is copied do unmap buffers, indirectlry // triggering a host --> device copy MPI_Wait(&recv_req, &status); queue.enqueueUnmapMemObject(devRecvData, recvHostPtr); MPI_Wait(&send_req, &status); queue.enqueueUnmapMemObject(devData, sendHostPtr); //note that instead of having each process compile the code //you could e.g. 
send the size and content of the source buffer //to each process from root; or even send the precompiled code, //in this case all nodes of the clusted must be the same whereas //in the case of source code compilation hybrid systems are //automatically supported by OpenCL //process data on the GPU: increment local data array with value //received from other process const char CLCODE_COMPUTE[] = "#pragma OPENCL EXTENSION cl_khr_fp64: enable\n" "typedef double real_t;\n" "__kernel void sum( __global const real_t* in,\n" " __global real_t* inout) {\n" "const int id = get_global_id(0);\n" "inout[id] += in[id];\n" "}"; cl::Program::Sources computeSource(1, std::make_pair(CLCODE_COMPUTE, sizeof(CLCODE_COMPUTE))); cl::Program computeProgram(context, computeSource); computeProgram.build(devices); cl::Kernel computeKernel(computeProgram, "sum"); computeKernel.setArg(0, devRecvData); computeKernel.setArg(1, devData); queue.enqueueNDRangeKernel(computeKernel, cl::NDRange(0), cl::NDRange(SIZE), cl::NDRange(1)); //map device data to host memory for validation and output real_t* computedDataHPtr = reinterpret_cast< real_t* >( queue.enqueueMapBuffer(devData, CL_FALSE, CL_MAP_READ, 0, BYTE_SIZE)); if(computedDataHPtr == 0) throw std::runtime_error("NULL mapped ptr"); queue.finish(); const int value = 1; // task id 0 + task id 1 const std::vector< real_t > reference(SIZE, value); if(std::equal(computedDataHPtr, computedDataHPtr + SIZE, reference.begin())) { std::cout << '[' << task << "]: PASSED" << std::endl; } else { std::cout << '[' << task << "]: FAILED" << std::endl; } //release mapped pointer queue.enqueueUnmapMemObject(devData, computedDataHPtr); //release MPI resources MPI_Finalize(); } catch(cl::Error e) { std::cerr << e.what() << ": Error code " << e.err() << std::endl; MPI_Finalize(); exit(EXIT_FAILURE); } return 0; }
/* Load an image, blur a square region of it with a Gaussian-style kernel,
 * and write the result out as a PNG. Returns 0 on success, 1 on error. */
int main(int argc, char **argv) {
    /* Command-line argument default values */
    char *filenameIn = NULL;
    char *filenameOut = NULL;
    double radius = 1;
    int x = 0, y = 0, size = 80, kernelSize = 5;

    if (!parseArgs(argc, argv, &filenameIn, &filenameOut, &x, &y, &size,
                   &kernelSize, &radius)) {
        return 1;
    }
    if (size < kernelSize) {
        printf("Size too small.\n");
        return 1;
    }

    /* Load an image into memory. BUG FIX: the output buffer used to be
     * allocated before this check, reading width/height/comp while they were
     * still uninitialized whenever the load failed. */
    int width, height, comp;
    uint8_t *pixelsIn = stbi_load(filenameIn, &width, &height, &comp, 0);
    if (pixelsIn == NULL) {
        printf("Could not load \"%s\".\n", filenameIn);
        return 1;
    }

    /* Set aside memory for the result.
     * BUG FIX: was sizeof(uint8_t *) — pointer size — allocating ~8x more
     * than needed per texel. */
    uint8_t *pixelsOut = (uint8_t *) malloc(
            height * width * comp * sizeof(uint8_t));
    if (pixelsOut == NULL) {
        printf("Could not allocate enough memory.\n");
        free(pixelsIn);
        return 1;
    }
    memcpy(pixelsOut, pixelsIn, height * width * comp * sizeof(uint8_t));

    /* Clamp x and y to available space */
    x = x > width ? width : x;
    y = y > height ? height : y;

    /* BUG FIX: allocation is now checked, and the kernel is freed below
     * (it used to leak). */
    double *kernel = (double *) malloc(
            kernelSize * kernelSize * sizeof(double));
    if (kernel == NULL) {
        printf("Could not allocate enough memory.\n");
        free(pixelsIn);
        free(pixelsOut);
        return 1;
    }
    double sum = computeKernel(kernel, kernelSize, radius /* sigma */);
    normalise(kernel, sum, kernelSize, kernelSize);

    convolve(width, height,
             x,            // Define box
             y,            //
             x + size,     //
             y + size,     //
             comp,         // components
             pixelsIn,     // in
             pixelsOut,    // out
             kernel,       // kernel
             kernelSize    // kernelSize
    );

    if (stbi_write_png(filenameOut, width, height, comp, pixelsOut, 0) == 0) {
        printf("Could not write \"%s\"\n", filenameOut);
    } else {
        /* BUG FIX: argument order — comp belongs in the dimension triple;
         * it was previously printed where 'size' should have been. */
        printf("Wrote: %s (%ix%ix%i) " \
               "(x=%i, y=%i, size=%i) " \
               "to %s\n",
               filenameIn, height, width, comp, x, y, size, filenameOut);
    }

    free(kernel);
    free(pixelsIn);
    free(pixelsOut);
    free(filenameIn);
    free(filenameOut);
    return 0;
}