bool OsdCLKernelBundle::Compile(cl_context clContext, int numVertexElements, int numVaryingElements) { cl_int ciErrNum; _numVertexElements = numVertexElements; _numVaryingElements = numVaryingElements; char constantDefine[256]; snprintf(constantDefine, sizeof(constantDefine), "#define NUM_VERTEX_ELEMENTS %d\n" "#define NUM_VARYING_ELEMENTS %d\n", numVertexElements, numVaryingElements); const char *sources[] = { constantDefine, clSource }; _clProgram = clCreateProgramWithSource(clContext, 2, sources, 0, &ciErrNum); CL_CHECK_ERROR(ciErrNum, "clCreateProgramWithSource\n"); ciErrNum = clBuildProgram(_clProgram, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { OsdError(OSD_CL_PROGRAM_BUILD_ERROR, "CLerr=%d", ciErrNum); cl_int numDevices = 0; clGetContextInfo(clContext, CL_CONTEXT_NUM_DEVICES, sizeof(cl_uint), &numDevices, NULL); cl_device_id *devices = new cl_device_id[numDevices]; clGetContextInfo(clContext, CL_CONTEXT_DEVICES, sizeof(cl_device_id)*numDevices, devices, NULL); for (int i = 0; i < numDevices; ++i) { char cBuildLog[10240]; clGetProgramBuildInfo(_clProgram, devices[i], CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL); OsdError(OSD_CL_PROGRAM_BUILD_ERROR, cBuildLog); } delete[] devices; return false; } _clBilinearEdge = buildKernel(_clProgram, "computeBilinearEdge"); _clBilinearVertex = buildKernel(_clProgram, "computeBilinearVertex"); _clCatmarkFace = buildKernel(_clProgram, "computeFace"); _clCatmarkEdge = buildKernel(_clProgram, "computeEdge"); _clCatmarkVertexA = buildKernel(_clProgram, "computeVertexA"); _clCatmarkVertexB = buildKernel(_clProgram, "computeVertexB"); _clLoopEdge = buildKernel(_clProgram, "computeEdge"); _clLoopVertexA = buildKernel(_clProgram, "computeVertexA"); _clLoopVertexB = buildKernel(_clProgram, "computeLoopVertexB"); _clVertexEditAdd = buildKernel(_clProgram, "editVertexAdd"); return true; }
kernel device::buildKernelFromString(const std::string &content, const std::string &kernelName, const occa::properties &props) const { occa::properties allProps = props + kernelProperties(); allProps["mode"] = mode(); hash_t kernelHash = (hash() ^ occa::hash(allProps) ^ occa::hash(content)); io::lock_t lock(kernelHash, "occa-device"); std::string stringSourceFile = io::hashDir(kernelHash); stringSourceFile += "stringSource.okl"; if (lock.isMine()) { if (!sys::fileExists(stringSourceFile)) { io::write(stringSourceFile, content); } lock.release(); } return buildKernel(stringSourceFile, kernelName, props); }
bool DeconvolutionTool::doDeconvolution(QImage *inputImage, QImage *outputImage, Blur* blur) { isProcessingCancelled = false; // Create kernel buildKernel(kernelMatrix, width, height, blur); fftw_execute(realForwardKernelPlan); // Fill processingContext ProcessingContext* processingContext = new ProcessingContext(); processingContext->inputImage = inputImage; processingContext->outputImage = outputImage; processingContext->inputImageMatrix = inputImageMatrix; processingContext->outputImageMatrix = outputImageMatrix; processingContext->kernelFFT = kernelFFT; processingContext->width = width; processingContext->height = height; processingContext->blur = blur; if (blur->mode == PREVIEW_GRAY) { doDeconvolutionForChannel(processingContext, GRAY); } else { QString progressText= "High-Quality"; if (blur->mode == PREVIEW_COLOR) { progressText = "Color Preview"; } setProgressInterval(10,40, progressText); doDeconvolutionForChannel(processingContext, RED); setProgressInterval(40,70, progressText); doDeconvolutionForChannel(processingContext, GREEN); setProgressInterval(70,100, progressText); doDeconvolutionForChannel(processingContext, BLUE); } delete(processingContext); return !isProcessingCancelled; }
void OpenCL::init(int isGPU) { if (isGPU) getDevices(CL_DEVICE_TYPE_GPU); else getDevices(CL_DEVICE_TYPE_CPU); buildKernel(); }
PTEX_NAMESPACE_BEGIN void PtexTriangleFilter::eval(float* result, int firstChan, int nChannels, int faceid, float u, float v, float uw1, float vw1, float uw2, float vw2, float width, float blur) { // init if (!_tx || nChannels <= 0) return; if (faceid < 0 || faceid >= _tx->numFaces()) return; _ntxchan = _tx->numChannels(); _dt = _tx->dataType(); _firstChanOffset = firstChan*DataSize(_dt); _nchan = PtexUtils::min(nChannels, _ntxchan-firstChan); // get face info const FaceInfo& f = _tx->getFaceInfo(faceid); // if neighborhood is constant, just return constant value of face if (f.isNeighborhoodConstant()) { PtexPtr<PtexFaceData> data ( _tx->getData(faceid, 0) ); if (data) { char* d = (char*) data->getData() + _firstChanOffset; Ptex::ConvertToFloat(result, d, _dt, _nchan); } return; } // clamp u and v u = PtexUtils::clamp(u, 0.0f, 1.0f); v = PtexUtils::clamp(v, 0.0f, 1.0f); // build kernel PtexTriangleKernel k; buildKernel(k, u, v, uw1, vw1, uw2, vw2, width, blur, f.res); // accumulate the weight as we apply _weight = 0; // allocate temporary result _result = (float*) alloca(sizeof(float)*_nchan); memset(_result, 0, sizeof(float)*_nchan); // apply to faces splitAndApply(k, faceid, f); // normalize (both for data type and cumulative kernel weight applied) // and output result float scale = 1.0f / (_weight * OneValue(_dt)); for (int i = 0; i < _nchan; i++) result[i] = float(_result[i] * scale); // clear temp result _result = 0; }
void device::loadKernels(const std::string &library) { // TODO 1.1: Load kernels #if 0 std::string devHash = hash().toFullString(); strVector dirs = io::directories("occa://" + library); const int dirCount = (int) dirs.size(); int kernelsLoaded = 0; for (int d = 0; d < dirCount; ++d) { const std::string buildFile = dirs[d] + kc::buildFile; if (!sys::fileExists(buildFile)) { continue; } json info = json::read(buildFile)["info"]; if ((std::string) info["device/hash"] != devHash) { continue; } ++kernelsLoaded; const std::string sourceFilename = dirs[d] + kc::parsedSourceFile; json &kInfo = info["kernel"]; hash_t hash = hash_t::fromString(kInfo["hash"]); jsonArray metadataArray = kInfo["metadata"].array(); occa::properties kernelProps = kInfo["props"]; // Ignore how the kernel was setup, turn off verbose kernelProps["verbose"] = false; const int kernels = metadataArray.size(); for (int k = 0; k < kernels; ++k) { buildKernel(sourceFilename, hash, kernelProps, lang::kernelMetadata::fromJson(metadataArray[k])); } } // Print loaded info if (properties().get("verbose", false) && kernelsLoaded) { std::cout << "Loaded " << kernelsLoaded; if (library.size()) { std::cout << " ["<< library << "]"; } else { std::cout << " cached"; } std::cout << ((kernelsLoaded == 1) ? " kernel\n" : " kernels\n"); } #endif }
bool Engine::start() { bool retval {false}; m_media = std::make_unique<Media>(); if (m_media->init()) { buildKernel(); if (buildModules()) { m_kernel->start(); retval = true; } } return retval; }
bool OsdClKernelDispatcher::ClKernel::Compile(cl_context clContext, int numVertexElements, int numVaryingElements) { cl_int ciErrNum; _numVertexElements = numVertexElements; _numVaryingElements = numVaryingElements; char constantDefine[256]; snprintf(constantDefine, 256, "#define NUM_VERTEX_ELEMENTS %d\n" "#define NUM_VARYING_ELEMENTS %d\n", numVertexElements, numVaryingElements); const char *sources[] = { constantDefine, clSource }; _clProgram = clCreateProgramWithSource(clContext, 2, sources, 0, &ciErrNum); CL_CHECK_ERROR(ciErrNum, "clCreateProgramWithSource\n"); ciErrNum = clBuildProgram(_clProgram, 0, NULL, NULL, NULL, NULL); if (ciErrNum != CL_SUCCESS) { OSD_ERROR("ERROR in clBuildProgram %d\n", ciErrNum); char cBuildLog[10240]; clGetProgramBuildInfo(_clProgram, _clDevice, CL_PROGRAM_BUILD_LOG, sizeof(cBuildLog), cBuildLog, NULL); OSD_ERROR(cBuildLog); return false; } // ------- _clBilinearEdge = buildKernel(_clProgram, "computeBilinearEdge"); _clBilinearVertex = buildKernel(_clProgram, "computeBilinearVertex"); _clCatmarkFace = buildKernel(_clProgram, "computeFace"); _clCatmarkEdge = buildKernel(_clProgram, "computeEdge"); _clCatmarkVertexA = buildKernel(_clProgram, "computeVertexA"); _clCatmarkVertexB = buildKernel(_clProgram, "computeVertexB"); _clLoopEdge = buildKernel(_clProgram, "computeEdge"); _clLoopVertexA = buildKernel(_clProgram, "computeVertexA"); _clLoopVertexB = buildKernel(_clProgram, "computeLoopVertexB"); return true; }
VolumeMaxCLProcessor::VolumeMaxCLProcessor() : Processor() , ProcessorKernelOwner(this) , inport_("inVolume") , outport_("outVolume") , volumeRegionSize_("region", "Region size", 8, 1, 100) , workGroupSize_("wgsize", "Work group size", ivec3(8), ivec3(0), ivec3(256)) , useGLSharing_("glsharing", "Use OpenGL sharing", true) , supportsVolumeWrite_(false) , tmpVolume_(nullptr) , kernel_(nullptr) { addPort(inport_); addPort(outport_); addProperty(volumeRegionSize_); addProperty(workGroupSize_); addProperty(useGLSharing_); buildKernel(); }
///apply filter effect to the PixelBuffer* buffer passed into this function void FBlur::applyFilter(PixelBuffer* imageBuffer){ // if kernel is already initialized, do not need initialize it again. kernel = buildKernel(std::round(getFloatParameter())); // printKernel(); if(getName() == "FEdgeDetection"){ imageBuffer -> convertToLuminance(); } int width = imageBuffer -> getWidth(); int height = imageBuffer -> getHeight(); //create a new pixel buffer for storing the convolution result. PixelBuffer* newImageBuffer = new PixelBuffer(width, height, imageBuffer -> getBackgroundColor()); for(int i = 0; i < width; i++){ for(int j = 0; j < height; j++){ float newRed = 0; float newGreen = 0; float newBlue = 0; for(size_t filterI = 0; filterI < kernel.size(); filterI++){ for(size_t filterJ = 0; filterJ < kernel[filterI].size(); filterJ++){ //The location imageI and imageJ is calculated so that //for the center element of the filter it'll be i, j int imageI = (i - kernel.size()/2 + filterI + width) % width; int imageJ = (j - kernel[filterI].size()/2 + filterJ + height) % height; ColorData currPixel = imageBuffer -> getPixel(imageI, imageJ); newRed += currPixel.getRed() * kernel[filterI][filterJ]; newGreen += currPixel.getGreen()*kernel[filterI][filterJ]; newBlue += currPixel.getBlue()*kernel[filterI][filterJ]; } } ColorData newPixel = ColorData(newRed, newGreen, newBlue); newPixel = newPixel.clampedColor(); newImageBuffer -> setPixel(i, j, newPixel); } } newImageBuffer -> copyPixelBuffer(newImageBuffer, imageBuffer); delete newImageBuffer; }
JNIEXPORT jint JNICALL Java_pt_floraon_ecospace_nativeFunctions_computeKernelDensities(JNIEnv *env, jclass obj, jstring filename, jstring outfilename, jintArray variables,jint freqthresh,jfloat sigmapercent,jboolean downweight) { // NOTE: "variables" must be in increasing order! 0 is latitude, 1 is longitude, 2... are the other climatic variables const char *pfilename=(*env)->GetStringUTFChars(env, filename , NULL ); const char *poutfilename=(*env)->GetStringUTFChars(env, outfilename , NULL ); FILE *varsfile,*densfile,*freqfile; int nvars,i,j,k,*freqs,ntaxafiltered=0,*mapIDs,*outIDs,d1,d2,d1from,d1to,d2from,d2to,d3from,d3to,d1kern,d2kern,sidesq; int lastID=-1,d2p,kernelhalfside,kernelside,kernelsidesq; register int d3p,d3; register float *d3kern; jsize nrecs; jint ntaxa,*pIDs; VARIABLE *vararray; VARIABLEHEADER *varheader; int nvarstouse=(int)(*env)->GetArrayLength(env,variables); // how many vars will be used for kernel density jint *pvariables=(*env)->GetIntArrayElements(env, variables, 0); float sigma; float *kernel,*tmpdens; unsigned long *weight; DENSITY *densities; bool anythingtosave=false,skiprec; size_t dummy; varsfile=fopen(STANDARDVARIABLEFILE(pfilename),"r"); dummy=fread(&nrecs,sizeof(jsize),1,varsfile); dummy=fread(&ntaxa,sizeof(jint),1,varsfile); dummy=fread(&nvars,sizeof(int),1,varsfile); // fseek(varsfile,nvars*sizeof(tmp.filename)+sizeof(long)*ntaxa,SEEK_CUR); fseek(varsfile,sizeof(long)*ntaxa,SEEK_CUR); // skip index { // redirect stdout to file int fd; fpos_t pos; fflush(stdout); fgetpos(stdout, &pos); fd = dup(fileno(stdout)); FILE *dummy=freopen("logfile.txt", "a", stdout); } vararray=malloc(nvarstouse*nrecs*sizeof(VARIABLE)); varheader=malloc(nvarstouse*sizeof(VARIABLEHEADER)); pIDs=malloc(nrecs*sizeof(jint)); dummy=fread(pIDs,sizeof(jint),nrecs,varsfile); fseek(varsfile,2*sizeof(jfloat)*nrecs,SEEK_CUR); // skip original coordinates weight=malloc(sizeof(long)*nrecs); dummy=fread(weight,sizeof(long),nrecs,varsfile); for(i=0;i<nvarstouse;i++) { fseek(varsfile,(sizeof(VARIABLEHEADER) + sizeof(VARIABLE)*nrecs)*(pvariables[i]-(i==0 ? 0 : (pvariables[i-1]+1))),SEEK_CUR); dummy=fread(&varheader[i],sizeof(VARIABLEHEADER),1,varsfile); printf("Variable %d: min %f max %f\n",pvariables[i],varheader[i].min,varheader[i].max); dummy=fread(&vararray[i*nrecs],sizeof(VARIABLE),nrecs,varsfile); } // count the # of records of each taxon freqs=calloc(ntaxa,sizeof(int)); for(i=0;i<nrecs;i++) freqs[pIDs[i]]++; // write out frequencies in a text file (for java) freqfile=fopen(FREQANALYSISFILE(pfilename),"w"); for(i=0;i<ntaxa;i++) fprintf(freqfile,"%d\n",freqs[i]); fclose(freqfile); // count the # of taxa after filtering out rarest for(i=0;i<ntaxa;i++) if(freqs[i] >= freqthresh) ntaxafiltered++; // create a mapping of IDs: because some IDs were removed, make them sequential without holes (remember that memory is the limiting factor here!) mapIDs=malloc(ntaxa*sizeof(int)); for(i=0;i<ntaxa;i++) mapIDs[i]=-1; for(i=0,j=0;i<nrecs;i++) { if(freqs[pIDs[i]] >= freqthresh) { if(mapIDs[pIDs[i]]==-1) { mapIDs[pIDs[i]]=j; j++; } } } //for(i=0;i<ntaxa;i++) printf("%d ",mapIDs[i]); // compute the resolution of the multidimensional space so that not too much memory is occupied side=(int)pow((float)(MAXMEMORYPERBATCH)/(float)ntaxafiltered,(float)1/nvarstouse); if(side>MAXSIDE) side=MAXSIDE; //side=40; sidesq=side*side; sigma=side*sigmapercent; arraysize=(int)pow(side,nvarstouse); printf("Using a grid with a side of %d cells, in %d variables (dimensions).\n",side,nvarstouse); printf("Reading %d variables of %d records of %d taxa (after filtering out those with less than %d occurrences)...\n",nvars,nrecs,ntaxafiltered,freqthresh); // build the kernel for this case kernel=buildKernel(side,sigma,nvarstouse,&kernelhalfside,&kernelside,&kernelsidesq); // compute densities // allocate N multidimensional arrays (multidimensional grids to compute kernel densities in each cell) densities=malloc(ntaxafiltered*sizeof(DENSITY)); outIDs=calloc(ntaxafiltered,sizeof(int)); for(i=0;i<ntaxafiltered;i++) { densities[i].density=malloc(arraysize); memset(densities[i].density,0,arraysize); } printf("Computing kernel densities");fflush(stdout); // scale the variables to the size of the grid for(i=0;i<nrecs;i++) { for(k=0;k<nvarstouse;k++) { if(vararray[i+nrecs*k]!=RASTERNODATA) vararray[i+nrecs*k]=(vararray[i+nrecs*k]*side)/10000; } } #pragma omp parallel private(i,k,skiprec,tmpdens,anythingtosave,d1from,d1to,d1,d1kern,d2,d2from,d2to,d2p,d2kern,d3,d3from,d3to,d3p,d3kern) { tmpdens=malloc(arraysize*sizeof(float)); memset(tmpdens,0,arraysize*sizeof(float)); #pragma omp for for(j=0;j<ntaxa;j++) { // NOTE: this loop doesn't need the records to be sorted, that's why it takes much longer // TODO: we might have an index here, i.e. for each taxon, a list of tthe respective records, but it's so fast that maybe it's not a big issue, we're talking about a few thousands of taxa only. if(freqs[j]<freqthresh) continue; anythingtosave=false; for(i=0;i<nrecs;i++) { // iterate through all records in search for this taxon ACK!! give me an index if(pIDs[i]!=j) continue; for(k=0,skiprec=false;k<nvarstouse;k++) { // check if any one of the variables is NA. if it is, skip this record if(vararray[i+nrecs*k] == RASTERNODATA) { skiprec=true; break; } } if(skiprec) continue; // skip NAs // now yeah, create density surface by summing the kernels record by record // since it is not computationally feasible more than 3 dimensions, just make the optimized code for each case... switch(nvarstouse) { case 1: d1from=(vararray[i]-kernelhalfside)<0 ? 0 : (vararray[i]-kernelhalfside); d1to=(vararray[i]+kernelhalfside+1>side ? side : vararray[i]+kernelhalfside+1); for(d1=d1from,d1kern=vararray[i]-kernelhalfside<0 ? kernelhalfside-vararray[i] : 0;d1<d1to;d1++,d1kern++) { tmpdens[d1]+=kernel[d1kern] * (downweight ? ((float)weight[i]/MULTIPLIER) : 1); } anythingtosave=true; break; case 2: d1from=(vararray[i]-kernelhalfside)<0 ? 0 : (vararray[i]-kernelhalfside); d1to=(vararray[i]+kernelhalfside+1>side ? side : vararray[i]+kernelhalfside+1); d2from=(vararray[i+nrecs]-kernelhalfside)<0 ? 0 : (vararray[i+nrecs]-kernelhalfside); d2to=(vararray[i+nrecs]+kernelhalfside+1>side ? side : vararray[i+nrecs]+kernelhalfside+1); for(d1=d1from,d1kern=vararray[i]-kernelhalfside<0 ? kernelhalfside-vararray[i] : 0;d1<d1to;d1++,d1kern++) { for(d2=d2from,d2p=d1+d2from*side,d2kern=d1kern+((vararray[i+nrecs]-kernelhalfside)<0 ? (kernelhalfside-vararray[i+nrecs])*kernelside : 0);d2<d2to;d2++,d2p+=side,d2kern+=kernelside) { tmpdens[d2p]+=kernel[d2kern] * (downweight ? ((float)weight[i]/MULTIPLIER) : 1); } } anythingtosave=true; break; case 3: d1from=(vararray[i]-kernelhalfside)<0 ? 0 : (vararray[i]-kernelhalfside); d1to=(vararray[i]+kernelhalfside+1>side ? side : vararray[i]+kernelhalfside+1); d2from=(vararray[i+nrecs]-kernelhalfside)<0 ? 0 : (vararray[i+nrecs]-kernelhalfside); d2to=(vararray[i+nrecs]+kernelhalfside+1>side ? side : vararray[i+nrecs]+kernelhalfside+1); d3from=(vararray[i+nrecs*2]-kernelhalfside)<0 ? 0 : (vararray[i+nrecs*2]-kernelhalfside); d3to=(vararray[i+nrecs*2]+kernelhalfside+1>side ? side : vararray[i+nrecs*2]+kernelhalfside+1); for(d1=d1from,d1kern=vararray[i]-kernelhalfside<0 ? kernelhalfside-vararray[i] : 0; d1<d1to; d1++,d1kern++) { for(d2=d2from,d2p=d1+d2from*side,d2kern=d1kern+((vararray[i+nrecs]-kernelhalfside)<0 ? (kernelhalfside-vararray[i+nrecs])*kernelside : 0) ;d2<d2to ;d2++,d2p+=side,d2kern+=kernelside) { for(d3=d3from,d3p=d2p+d3from*sidesq ,d3kern=&kernel[d2kern+((vararray[i+nrecs*2]-kernelhalfside)<0 ? (kernelhalfside-vararray[i+nrecs*2])*kernelsidesq : 0)] ;d3<d3to ;d3++,d3p+=sidesq,d3kern+=kernelsidesq) { tmpdens[d3p]+=*d3kern * (downweight ? ((float)weight[i]/MULTIPLIER) : 1); //kernel[d3kern]; } } } anythingtosave=true; break; } } // end record loop if(anythingtosave) { saveKernelDensity(tmpdens,freqs[j],&densities[mapIDs[j]]); // save kernel density of previous taxon outIDs[mapIDs[j]]=j; } else { saveKernelDensity(NULL, freqs[j], &densities[mapIDs[j]]); // save kernel density of previous taxon outIDs[mapIDs[j]]=j; // outIDs[mapIDs[j]]=-1; } anythingtosave = false; memset(tmpdens,0,arraysize*sizeof(float)); printf("."); fflush(stdout); } // end taxon loop } /* THIS is the working code. Above still developing. for(i=0;i<nrecs;i++) { // iterate through all records IMPORTANT: records must be sorted by taxon ID if(freqs[pIDs[i]]>=freqthresh) { if(pIDs[i]!=lastID) { // this record is already a different species if(anythingtosave) { saveKernelDensity(tmpdens,freqs[lastID],&densities[mapIDs[lastID]]); // save kernel density of previous taxon outIDs[mapIDs[lastID]]=lastID; } else outIDs[mapIDs[lastID]]=-1; anythingtosave=false; memset(tmpdens,0,arraysize*sizeof(float)); lastID=pIDs[i]; printf("."); fflush(stdout); } // scale the variables to the size of the grid for(j=0,skiprec=false;j<nvarstouse;j++) { if(vararray[i+nrecs*j]==RASTERNODATA) { skiprec=true; continue; } else vararray[i+nrecs*j]=(vararray[i+nrecs*j]*side)/10000; } if(skiprec) continue; // skip NAs // now yeah, create density surface by summing the kernels record by record // since it is not computationally feasible more than 3 dimensions, just make the optimized code for each case... switch(nvarstouse) { case 1: d1from=(vararray[i]-kernelhalfside)<0 ? 0 : (vararray[i]-kernelhalfside); d1to=(vararray[i]+kernelhalfside+1>side ? side : vararray[i]+kernelhalfside+1); for(d1=d1from,d1kern=vararray[i]-kernelhalfside<0 ? kernelhalfside-vararray[i] : 0;d1<d1to;d1++,d1kern++) { tmpdens[d1]+=kernel[d1kern] * (downweight ? ((float)weight[i]/MULTIPLIER) : 1); } anythingtosave=true; break; case 2: d1from=(vararray[i]-kernelhalfside)<0 ? 0 : (vararray[i]-kernelhalfside); d1to=(vararray[i]+kernelhalfside+1>side ? side : vararray[i]+kernelhalfside+1); d2from=(vararray[i+nrecs]-kernelhalfside)<0 ? 0 : (vararray[i+nrecs]-kernelhalfside); d2to=(vararray[i+nrecs]+kernelhalfside+1>side ? side : vararray[i+nrecs]+kernelhalfside+1); for(d1=d1from,d1kern=vararray[i]-kernelhalfside<0 ? kernelhalfside-vararray[i] : 0;d1<d1to;d1++,d1kern++) { for(d2=d2from,d2p=d1+d2from*side,d2kern=d1kern+((vararray[i+nrecs]-kernelhalfside)<0 ? (kernelhalfside-vararray[i+nrecs])*kernelside : 0);d2<d2to;d2++,d2p+=side,d2kern+=kernelside) { tmpdens[d2p]+=kernel[d2kern] * (downweight ? ((float)weight[i]/MULTIPLIER) : 1); } } anythingtosave=true; break; case 3: d1from=(vararray[i]-kernelhalfside)<0 ? 0 : (vararray[i]-kernelhalfside); d1to=(vararray[i]+kernelhalfside+1>side ? side : vararray[i]+kernelhalfside+1); d2from=(vararray[i+nrecs]-kernelhalfside)<0 ? 0 : (vararray[i+nrecs]-kernelhalfside); d2to=(vararray[i+nrecs]+kernelhalfside+1>side ? side : vararray[i+nrecs]+kernelhalfside+1); d3from=(vararray[i+nrecs*2]-kernelhalfside)<0 ? 0 : (vararray[i+nrecs*2]-kernelhalfside); d3to=(vararray[i+nrecs*2]+kernelhalfside+1>side ? side : vararray[i+nrecs*2]+kernelhalfside+1); for(d1=d1from,d1kern=vararray[i]-kernelhalfside<0 ? kernelhalfside-vararray[i] : 0; d1<d1to; d1++,d1kern++) { for(d2=d2from,d2p=d1+d2from*side,d2kern=d1kern+((vararray[i+nrecs]-kernelhalfside)<0 ? (kernelhalfside-vararray[i+nrecs])*kernelside : 0) ;d2<d2to ;d2++,d2p+=side,d2kern+=kernelside) { for(d3=d3from,d3p=d2p+d3from*sidesq ,d3kern=&kernel[d2kern+((vararray[i+nrecs*2]-kernelhalfside)<0 ? (kernelhalfside-vararray[i+nrecs*2])*kernelsidesq : 0)] ;d3<d3to ;d3++,d3p+=sidesq,d3kern+=kernelsidesq) { tmpdens[d3p]+=*d3kern * (downweight ? ((float)weight[i]/MULTIPLIER) : 1); //kernel[d3kern]; } } } anythingtosave=true; break; } } } saveKernelDensity(tmpdens,freqs[lastID],&densities[mapIDs[lastID]]); // save kernel density of the last taxon outIDs[mapIDs[lastID]]=lastID; */ // scale all densities to the absolute maximum /* unsigned long maxmax=0; float factor; for(i=0;i<ntaxafiltered;i++) if(maxmax<densities[i].max) maxmax=densities[i].max; for(i=0;i<ntaxafiltered;i++) { factor=(float)densities[i].max/maxmax; for(j=0;j<arraysize;j++) densities[i].density[j]=(unsigned char)((float)densities[i].density[j]*factor); } */ // now write to output file printf("\nWriting file...\n"); densfile=fopen(DENSITYFILE(pfilename,poutfilename),"w"); fwrite(&ntaxafiltered,sizeof(int),1,densfile); // how many densities in file fwrite(outIDs,sizeof(int),ntaxafiltered,densfile); // the real taxon IDs of each density fwrite(&side,sizeof(int),1,densfile); // the size of the grid fwrite(&nvarstouse,sizeof(int),1,densfile); // the number of variables for(k=0;k<ntaxafiltered;k++) { // now the densities! fwrite(&densities[k],sizeof(DENSITY),1,densfile); // of course, the pointer will be meaningless fwrite(densities[k].density,arraysize,1,densfile); } fclose(densfile); #ifdef VERBOSE for(k=3;k<4;k++) { //ntaxafiltered printf("************* Taxon Nrecs: %d ************\n",densities[k].nrecords); switch(nvarstouse) { case 2: for(d1=0;d1<side;d1++) { for(d2=0;d2<side;d2++) { printf("%3d",densities[k].density[d1+d2*side]); } printf("\n"); } break; case 3: for(d1=0;d1<5;d1++) { for(d2=0;d2<side;d2++) { for(d3=0;d3<side;d3++) { printf("%3d",densities[k].density[d1+d2*side+d3*sidesq]); } printf("\n"); } printf("*****************\n"); } break; } printf("********SUM: %f NRECS: %d*********\n",sum,densities[k].nrecords); } for(k=0;k<ntaxafiltered;k++) { for(d1=0,sum=0;d1<arraysize;d1++) sum+=(float)densities[k].density[d1]*densities[k].max/255.0; printf("SUM: %f NRECS: %d\n",sum,densities[k].nrecords); } #endif fclose(varsfile); free(pIDs); free(freqs); free(vararray); free(varheader); free(kernel); free(mapIDs); free(tmpdens); for(i=0;i<ntaxafiltered;i++) free(densities[i].density); free(densities); free(outIDs); (*env)->ReleaseStringUTFChars(env,filename,pfilename); (*env)->ReleaseStringUTFChars(env,outfilename,poutfilename); return 1; }
void PtexSeparableFilter::eval(float* result, int firstChan, int nChannels, int faceid, float u, float v, float uw1, float vw1, float uw2, float vw2, float width, float blur) { // init if (!_tx || nChannels <= 0) return; if (faceid < 0 || faceid >= _tx->numFaces()) return; _ntxchan = _tx->numChannels(); _dt = _tx->dataType(); _firstChanOffset = firstChan*DataSize(_dt); _nchan = PtexUtils::min(nChannels, _ntxchan-firstChan); // get face info const FaceInfo& f = _tx->getFaceInfo(faceid); // if neighborhood is constant, just return constant value of face if (f.isNeighborhoodConstant()) { PtexPtr<PtexFaceData> data ( _tx->getData(faceid, 0) ); if (data) { char* d = (char*) data->getData() + _firstChanOffset; Ptex::ConvertToFloat(result, d, _dt, _nchan); } return; } // find filter width as bounding box of vectors w1 and w2 float uw = fabs(uw1) + fabs(uw2), vw = fabs(vw1) + fabs(vw2); // handle border modes switch (_uMode) { case m_clamp: u = PtexUtils::clamp(u, 0.0f, 1.0f); break; case m_periodic: u = u-floor(u); break; case m_black: break; // do nothing } switch (_vMode) { case m_clamp: v = PtexUtils::clamp(v, 0.0f, 1.0f); break; case m_periodic: v = v-floor(v); case m_black: break; // do nothing } // build kernel PtexSeparableKernel k; if (f.isSubface()) { // for a subface, build the kernel as if it were on a main face and then downres uw = uw * width + blur * 2; vw = vw * width + blur * 2; buildKernel(k, u*.5, v*.5, uw*.5, vw*.5, f.res); if (k.res.ulog2 == 0) k.upresU(); if (k.res.vlog2 == 0) k.upresV(); k.res.ulog2--; k.res.vlog2--; } else { uw = uw * width + blur; vw = vw * width + blur; buildKernel(k, u, v, uw, vw, f.res); } k.stripZeros(); // check kernel (debug only) assert(k.uw > 0 && k.vw > 0); assert(k.uw <= PtexSeparableKernel::kmax && k.vw <= PtexSeparableKernel::kmax); _weight = k.weight(); // allocate temporary double-precision result _result = (double*) alloca(sizeof(double)*_nchan); memset(_result, 0, sizeof(double)*_nchan); // apply to faces splitAndApply(k, faceid, f); // normalize (both for data type and cumulative kernel weight applied) // and output result double scale = 1.0 / (_weight * OneValue(_dt)); for (int i = 0; i < _nchan; i++) result[i] = float(_result[i] * scale); // clear temp result _result = 0; }
CLKernel *EasyCL::buildKernel(string kernelfilepath, string kernelname) { return buildKernel(kernelfilepath, kernelname, ""); }
void VolumeMaxCLProcessor::initialize() { Processor::initialize(); buildKernel(); }
// Serial ray casting unsigned char* raycast_serial(unsigned char* data, unsigned char* region){ unsigned char* image = (unsigned char*)malloc(sizeof(unsigned char)*IMAGE_DIM*IMAGE_DIM); // Camera/eye position, and direction of viewing. These can be changed to look // at the volume from different angles. float3 camera = {.x=1000,.y=1000,.z=1000}; float3 forward = {.x=-1, .y=-1, .z=-1}; float3 z_axis = {.x=0, .y=0, .z = 1}; // Finding vectors aligned with the axis of the image float3 right = cross(forward, z_axis); float3 up = cross(right, forward); // Creating unity lenght vectors forward = normalize(forward); right = normalize(right); up = normalize(up); float fov = 3.14/4; float pixel_width = tan(fov/2.0)/(IMAGE_DIM/2); float step_size = 0.5; // For each pixel for(int y = -(IMAGE_DIM/2); y < (IMAGE_DIM/2); y++){ for(int x = -(IMAGE_DIM/2); x < (IMAGE_DIM/2); x++){ // Find the ray for this pixel float3 screen_center = add(camera, forward); float3 ray = add(add(screen_center, scale(right, x*pixel_width)), scale(up, y*pixel_width)); ray = add(ray, scale(camera, -1)); ray = normalize(ray); float3 pos = camera; // Move along the ray, we stop if the color becomes completely white, // or we've done 5000 iterations (5000 is a bit arbitrary, it needs // to be big enough to let rays pass through the entire volume) int i = 0; float color = 0; while(color < 255 && i < 5000){ i++; pos = add(pos, scale(ray, step_size)); // Update position int r = value_at(pos, region); // Check if we're in the region color += value_at(pos, data)*(0.01 + r) ; // Update the color based on data value, and if we're in the region } // Write final color to image image[(y+(IMAGE_DIM/2)) * IMAGE_DIM + (x+(IMAGE_DIM/2))] = color > 255 ? 255 : color; } } return image; } // Check if two values are similar, threshold can be changed. int similar(unsigned char* data, int3 a, int3 b){ unsigned char va = data[a.z * DATA_DIM*DATA_DIM + a.y*DATA_DIM + a.x]; unsigned char vb = data[b.z * DATA_DIM*DATA_DIM + b.y*DATA_DIM + b.x]; int i = abs(va-vb) < 1; return i; } // Serial region growing, same algorithm as in assignment 2 unsigned char* grow_region_serial(unsigned char* data){ unsigned char* region = (unsigned char*)calloc(sizeof(unsigned char), DATA_DIM*DATA_DIM*DATA_DIM); stack_t* stack = new_stack(); int3 seed = {.x=50, .y=300, .z=300}; push(stack, seed); region[seed.z *DATA_DIM*DATA_DIM + seed.y*DATA_DIM + seed.x] = 1; int dx[6] = {-1,1,0,0,0,0}; int dy[6] = {0,0,-1,1,0,0}; int dz[6] = {0,0,0,0,-1,1}; while(stack->size > 0){ int3 pixel = pop(stack); for(int n = 0; n < 6; n++){ int3 candidate = pixel; candidate.x += dx[n]; candidate.y += dy[n]; candidate.z += dz[n]; if(!inside_int(candidate)){ continue; } if(region[candidate.z * DATA_DIM*DATA_DIM + candidate.y*DATA_DIM + candidate.x]){ continue; } if(similar(data, pixel, candidate)){ push(stack, candidate); region[candidate.z * DATA_DIM*DATA_DIM + candidate.y*DATA_DIM + candidate.x] = 1; } } } return region; } unsigned char* grow_region_gpu(unsigned char* data){ cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_kernel kernel; cl_int err; char *source; int i; clGetPlatformIDs(1, &platform, NULL); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); printPlatformInfo(platform); queue = clCreateCommandQueue(context, device, 0, &err); kernel = buildKernel("region.cl", "region", NULL, context, device); //Host variables unsigned char* host_region = (unsigned char*)calloc(sizeof(unsigned char), DATA_SIZE); int host_unfinished; cl_mem device_region = clCreateBuffer(context, CL_MEM_READ_WRITE, DATA_SIZE * sizeof(cl_uchar) ,NULL,&err); cl_mem device_data = clCreateBuffer(context, CL_MEM_READ_ONLY, DATA_SIZE * sizeof(cl_uchar), NULL,&err); cl_mem device_unfinished = clCreateBuffer(context, CL_MEM_READ_WRITE, sizeof(cl_int), NULL,&err); clError("Error allocating memory", err); //plant seed int3 seed = {.x=50, .y=300, .z=300}; host_region[index(seed.z, seed.y, seed.x)] = 2; //Copy data to the device clEnqueueWriteBuffer(queue, device_data , CL_FALSE, 0, DATA_SIZE * sizeof(cl_uchar), data , 0, NULL, NULL); clEnqueueWriteBuffer(queue, device_region, CL_FALSE, 0, DATA_SIZE * sizeof(cl_uchar), host_region, 0, NULL, NULL); //Calculate block and grid sizes size_t global[] = { 512, 512, 512 }; size_t local[] = { 8, 8, 8 }; //Run kernel untill completion do{ host_unfinished = 0; clEnqueueWriteBuffer(queue, device_unfinished, CL_FALSE, 0, sizeof(cl_int), &host_unfinished , 0, NULL, NULL); clFinish(queue); err = clSetKernelArg(kernel, 0, sizeof(device_data), (void*)&device_data); err = clSetKernelArg(kernel, 1, sizeof(device_region), (void*)&device_region); err = clSetKernelArg(kernel, 2, sizeof(device_unfinished), (void*)&device_unfinished); clError("Error setting arguments", err); //Run the kernel clEnqueueNDRangeKernel(queue, kernel, 3, NULL, &global, &local, 0, NULL, NULL); clFinish(queue); clError("Error running kernel", err); err = clEnqueueReadBuffer(queue, device_unfinished, CL_TRUE, 0, sizeof(cl_int), &host_unfinished, 0, NULL, NULL); clFinish(queue); clError("Error reading buffer 1", err); }while(host_unfinished); //Copy result to host err = clEnqueueReadBuffer(queue, device_region, CL_TRUE, 0, DATA_SIZE * sizeof(cl_uchar), host_region, 0, NULL, NULL); clFinish(queue); clError("Error reading buffer 2", err); return host_region; } unsigned char* raycast_gpu(unsigned char* data, unsigned char* region){ cl_platform_id platform; cl_device_id device; cl_context context; cl_command_queue queue; cl_kernel kernel; cl_int err; char *source; int i; clGetPlatformIDs(1, &platform, NULL); clGetDeviceIDs(platform, CL_DEVICE_TYPE_GPU, 1, &device, NULL); context = clCreateContext(NULL, 1, &device, NULL, NULL, &err); printPlatformInfo(platform); printDeviceInfo(device); queue = clCreateCommandQueue(context, device, 0, &err); kernel = buildKernel("raycast.cl", "raycast", NULL, context, device); cl_mem device_region = clCreateBuffer(context, CL_MEM_READ_ONLY, DATA_SIZE * sizeof(cl_uchar) ,NULL,&err); cl_mem device_data = clCreateBuffer(context, CL_MEM_READ_ONLY, DATA_SIZE * sizeof(cl_uchar), NULL,&err); cl_mem device_image = clCreateBuffer(context, CL_MEM_READ_WRITE, IMAGE_SIZE * sizeof(cl_uchar),NULL,&err); clError("Error allocating memory", err); //Copy data to the device clEnqueueWriteBuffer(queue, device_data , CL_FALSE, 0, DATA_SIZE * sizeof(cl_uchar), data , 0, NULL, NULL); clEnqueueWriteBuffer(queue, device_region, CL_FALSE, 0, DATA_SIZE * sizeof(cl_uchar), region, 0, NULL, NULL); int grid_size = IMAGE_DIM; int block_size = IMAGE_DIM; //Set up kernel arguments err = clSetKernelArg(kernel, 0, sizeof(device_data), (void*)&device_data); err = clSetKernelArg(kernel, 1, sizeof(device_region), (void*)&device_region); err = clSetKernelArg(kernel, 2, sizeof(device_image), (void*)&device_image); clError("Error setting arguments", err); //Run the kernel const size_t globalws[2] = {IMAGE_DIM, IMAGE_DIM}; const size_t localws[2] = {8, 8}; clEnqueueNDRangeKernel(queue, kernel, 2, NULL, &globalws, &localws, 0, NULL, NULL); clFinish(queue); //Allocate memory for the result unsigned char* host_image = (unsigned char*)malloc(IMAGE_SIZE_BYTES); //Copy result from device err = clEnqueueReadBuffer(queue, device_image, CL_TRUE, 0, IMAGE_SIZE * sizeof(cl_uchar), host_image, 0, NULL, NULL); clFinish(queue); //Free device memory return host_image; } int main(int argc, char** argv){ unsigned char* data = create_data(); unsigned char* region = grow_region_gpu(data); unsigned char* image = raycast_gpu(data, region); write_bmp(image, IMAGE_DIM, IMAGE_DIM); }