void mean_first(Param out, Param in, Param inWeight) {
    uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
    threads_x      = std::min(threads_x, THREADS_PER_GROUP);
    uint threads_y = THREADS_PER_GROUP / threads_x;

    uint groups_x = divup(in.info.dims[0], threads_x * REPEAT);
    uint groups_y = divup(in.info.dims[1], threads_y);

    Param tmpOut = out;
    Param noWeight;
    noWeight.info.offset = 0;
    for (int k = 0; k < 4; ++k) {
        noWeight.info.dims[k]    = 0;
        noWeight.info.strides[k] = 0;
    }
    // The value does not matter since it will not be used; it just needs to
    // be a valid buffer.
    noWeight.data = inWeight.data;

    Param tmpWeight = noWeight;

    if (groups_x > 1) {
        tmpOut.data    = bufferAlloc(groups_x * in.info.dims[1] * in.info.dims[2] *
                                     in.info.dims[3] * sizeof(To));
        tmpWeight.data = bufferAlloc(groups_x * in.info.dims[1] * in.info.dims[2] *
                                     in.info.dims[3] * sizeof(Tw));

        tmpOut.info.dims[0] = groups_x;
        for (int k = 1; k < 4; k++) tmpOut.info.strides[k] *= groups_x;
        tmpWeight.info = tmpOut.info;
    }

    mean_first_launcher<Ti, Tw, To>(tmpOut, tmpWeight, in, inWeight,
                                    threads_x, groups_x, groups_y);

    if (groups_x > 1) {
        // No weight array is needed when writing out the final output.
        mean_first_launcher<Ti, Tw, To>(out, noWeight, tmpOut, tmpWeight,
                                        threads_x, 1, groups_y);
        bufferFree(tmpOut.data);
        bufferFree(tmpWeight.data);
    }
}
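/*
 * Illustrative CPU sketch (hypothetical, not library code): the two-pass
 * reduction above works because a (mean, weight) pair produced by each group
 * can be combined exactly. The names below are invented for illustration;
 * only the arithmetic mirrors what the second mean_first_launcher pass does.
 */
struct PartialMean {
    double mean;
    double weight;
};

static PartialMean combinePartialMeans(PartialMean a, PartialMean b) {
    PartialMean r;
    r.weight = a.weight + b.weight;
    if (r.weight == 0) { r.mean = 0; return r; }
    // Weighted average of the two partial means; this reduces to a plain
    // mean of means when all weights are 1.
    r.mean = (a.mean * a.weight + b.mean * b.weight) / r.weight;
    return r;
}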
bool rtspRequestSetContent(RTSPRequest *rtspRequest, uint8_t *content, size_t contentSize, char *contentType) {
    char contentSizeString[MAX_NUMBER_STRING_SIZE];

    /* Add header field for content type */
    if(!rtspRequestAddHeaderField(rtspRequest, "Content-Type", contentType)) {
        return false;
    }

    /* Add header field for content size */
    sprintf(contentSizeString, "%lu", (unsigned long)contentSize);
    if(!rtspRequestAddHeaderField(rtspRequest, "Content-Length", contentSizeString)) {
        return false;
    }

    /* Free any existing buffer */
    if(rtspRequest->contentBuffer != NULL) {
        if(!bufferFree(&rtspRequest->contentBuffer)) {
            return false;
        }
    }

    /* Allocate buffer */
    if(!bufferAllocate(&rtspRequest->contentBuffer, contentSize, "RTSP request content buffer")) {
        return false;
    }

    /* Copy buffer */
    memcpy(rtspRequest->contentBuffer, content, contentSize);
    rtspRequest->contentBufferSize = contentSize;

    return true;
}
static void scan_dim(Param &out, const Param &in, int dim) {
    uint threads_y = std::min(THREADS_Y, nextpow2(out.info.dims[dim]));
    uint threads_x = THREADS_X;

    uint groups_all[] = {divup((uint)out.info.dims[0], threads_x),
                         (uint)out.info.dims[1],
                         (uint)out.info.dims[2],
                         (uint)out.info.dims[3]};

    groups_all[dim] = divup(out.info.dims[dim], threads_y * REPEAT);

    if (groups_all[dim] == 1) {
        scan_dim_launcher<Ti, To, op, inclusive_scan>(out, out, in, dim, true,
                                                      threads_y, groups_all);
    } else {
        Param tmp = out;

        tmp.info.dims[dim]  = groups_all[dim];
        tmp.info.strides[0] = 1;
        for (int k = 1; k < 4; k++) {
            tmp.info.strides[k] = tmp.info.strides[k - 1] * tmp.info.dims[k - 1];
        }

        int tmp_elements = tmp.info.strides[3] * tmp.info.dims[3];
        // freed at the end of this branch
        tmp.data = bufferAlloc(tmp_elements * sizeof(To));

        scan_dim_launcher<Ti, To, op, inclusive_scan>(out, tmp, in, dim, false,
                                                      threads_y, groups_all);

        int gdim        = groups_all[dim];
        groups_all[dim] = 1;

        if (op == af_notzero_t) {
            scan_dim_launcher<To, To, af_add_t, true>(tmp, tmp, tmp, dim, true,
                                                      threads_y, groups_all);
        } else {
            scan_dim_launcher<To, To, op, true>(tmp, tmp, tmp, dim, true,
                                                threads_y, groups_all);
        }

        groups_all[dim] = gdim;

        bcast_dim_launcher<To, To, op, inclusive_scan>(out, tmp, dim, true,
                                                       threads_y, groups_all);
        bufferFree(tmp.data);
    }
}
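/*
 * Illustrative CPU sketch (hypothetical names, not library code) of the
 * two-pass scheme above: scan each block independently, scan the per-block
 * totals, then broadcast-add each block's predecessor total back in. This is
 * what the scan_dim_launcher / bcast_dim_launcher pair implements on device.
 */
#include <algorithm>
#include <vector>

static void blocked_inclusive_scan(std::vector<int> &v, size_t block) {
    std::vector<int> totals;
    for (size_t b = 0; b < v.size(); b += block) {   // pass 1: local scans
        int acc = 0;
        for (size_t i = b; i < std::min(v.size(), b + block); ++i)
            v[i] = acc += v[i];
        totals.push_back(acc);
    }
    for (size_t t = 1; t < totals.size(); ++t)       // scan the block totals
        totals[t] += totals[t - 1];
    for (size_t i = block; i < v.size(); ++i)        // pass 2: broadcast add
        v[i] += totals[i / block - 1];
}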
bool protocolCleanup(int pSocket, fd_set *pSocketSet, int pMaxSocket, int pBufferSize) {
    UNUSED(pSocket);
    UNUSED(pSocketSet);
    UNUSED(pMaxSocket);
    UNUSED(pBufferSize);

    return bufferFree(&gMessageBuffer);
}
/**
 * Called when the peer dies.
 */
static void peerProxyKill(PeerProxy* peerProxy, bool errnoIsSet) {
    if (errnoIsSet) {
        ALOGI("Peer %d died. errno: %s", peerProxy->credentials.pid, strerror(errno));
    } else {
        ALOGI("Peer %d died.", peerProxy->credentials.pid);
    }

    // If we lost the master, we're up a creek. We can't let this happen.
    if (peerProxy->master) {
        LOG_ALWAYS_FATAL("Lost connection to master.");
    }

    Peer* localPeer = peerProxy->peer;
    pid_t pid = peerProxy->credentials.pid;

    peerLock(localPeer);

    // Remember for a while that the peer died.
    localPeer->deadPeers[localPeer->deadPeerCursor] = peerProxy->credentials.pid;
    localPeer->deadPeerCursor++;
    if (localPeer->deadPeerCursor == PEER_HISTORY) {
        localPeer->deadPeerCursor = 0;
    }

    // Remove from peer map.
    hashmapRemove(localPeer->peerProxies, &pid);

    // External threads can no longer get to this peer proxy, so we don't
    // need the lock anymore.
    peerUnlock(localPeer);

    // Remove the fd from the selector.
    if (peerProxy->fd != NULL) {
        peerProxy->fd->remove = true;
    }

    // Clear outgoing packet queue.
    while (peerProxyNextPacket(peerProxy)) {}

    bufferFree(peerProxy->inputBuffer);

    // This only applies to the master.
    if (peerProxy->connections != NULL) {
        // We can't leave these other maps pointing to freed memory.
        hashmapForEach(peerProxy->connections, &peerProxyRemoveConnection, peerProxy);
        hashmapFree(peerProxy->connections);
    }

    // Invoke death listener.
    localPeer->onDeath(pid);

    // Free the peer proxy itself.
    free(peerProxy);
}
void convolve2(Param out, const Param signal, const Param filter) {
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> convProgs;
        static std::map<int, Kernel*>  convKernels;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device]() {
            const size_t C0_SIZE = (THREADS_X + 2 * (fLen - 1)) * THREADS_Y;
            const size_t C1_SIZE = (THREADS_Y + 2 * (fLen - 1)) * THREADS_X;
            size_t locSize = (conv_dim == 0 ? C0_SIZE : C1_SIZE);

            std::ostringstream options;
            options << " -D T="              << dtype_traits<T>::getName()
                    << " -D accType="        << dtype_traits<accType>::getName()
                    << " -D CONV_DIM="       << conv_dim
                    << " -D EXPAND="         << expand
                    << " -D FLEN="           << fLen
                    << " -D LOCAL_MEM_SIZE=" << locSize;
            if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }
            Program prog;
            buildProgram(prog, convolve_separable_cl, convolve_separable_cl_len, options.str());
            convProgs[device]   = new Program(prog);
            convKernels[device] = new Kernel(*convProgs[device], "convolve");
        });

        auto convOp = make_kernel<Buffer, KParam, Buffer, KParam,
                                  Buffer, int, int>(*convKernels[device]);

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(out.info.dims[0], THREADS_X);
        int blk_y = divup(out.info.dims[1], THREADS_Y);

        NDRange global(blk_x * signal.info.dims[2] * THREADS_X,
                       blk_y * signal.info.dims[3] * THREADS_Y);

        cl::Buffer *mBuff = bufferAlloc(fLen * sizeof(accType));
        // FIXME: if the filter array is strided, a direct copy might cause issues
        getQueue().enqueueCopyBuffer(*filter.data, *mBuff, 0, 0, fLen * sizeof(accType));

        convOp(EnqueueArgs(getQueue(), global, local),
               *out.data, out.info, *signal.data, signal.info, *mBuff, blk_x, blk_y);

        bufferFree(mBuff);
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
bool rtspRequestFree(RTSPRequest **rtspRequest) {
    bool result;

    /* Close all opened/allocated resources. Continue if a failure occurs,
       but remember the failure for the final result. */
    result = true;
    if(*rtspRequest != NULL) {
        if(!bufferFree(&(*rtspRequest)->headerBuffer)) {
            result = false;
        }
        if(!bufferFree(&(*rtspRequest)->contentBuffer)) {
            result = false;
        }
        if(!bufferFree(rtspRequest)) {
            result = false;
        }
    }

    return result;
}
Array<T> index(const Array<T>& in, const af_index_t idxrs[]) {
    kernel::IndexKernelParam_t p;
    std::vector<af_seq> seqs(4, af_span);
    // create seq vector to retrieve output dimensions, offsets & strides
    for (dim_t x = 0; x < 4; ++x) {
        if (idxrs[x].isSeq) {
            seqs[x] = idxrs[x].idx.seq;
        }
    }

    // retrieve dimensions, strides and offsets
    dim4 iDims = in.dims();
    dim4 dDims = in.getDataDims();
    dim4 oDims = toDims  (seqs, iDims);
    dim4 iOffs = toOffset(seqs, dDims);
    dim4 iStrds= toStride(seqs, dDims);

    for (dim_t i = 0; i < 4; ++i) {
        p.isSeq[i] = idxrs[i].isSeq;
        p.offs[i]  = iOffs[i];
        p.strds[i] = iStrds[i];
    }

    Buffer* bPtrs[4];

    std::vector< Array<uint> > idxArrs(4, createEmptyArray<uint>(dim4()));
    // loop through the indexers to read the af_array indices
    for (dim_t x = 0; x < 4; ++x) {
        // set index pointers where applicable
        if (!p.isSeq[x]) {
            idxArrs[x] = castArray<uint>(idxrs[x].idx.arr);
            bPtrs[x]   = idxArrs[x].get();
            // set the output array's dimension along x
            oDims[x] = idxArrs[x].elements();
        } else {
            // allocate a 1-element buffer to keep OpenCL from failing
            bPtrs[x] = bufferAlloc(sizeof(uint));
        }
    }

    Array<T> out = createEmptyArray<T>(oDims);

    if (oDims.elements() == 0) {
        // free the placeholder buffers before the early return
        for (dim_t x = 0; x < 4; ++x) {
            if (p.isSeq[x]) bufferFree(bPtrs[x]);
        }
        return out;
    }

    kernel::index<T>(out, in, p, bPtrs);

    for (dim_t x = 0; x < 4; ++x) {
        if (p.isSeq[x]) bufferFree(bPtrs[x]);
    }

    return out;
}
/*
 * Destructor
 */
BufferManager::~BufferManager() {
    bufferFree(&bufferGPS);
    bufferFree(&bufferLinearAcceleration);
    bufferFree(&bufferAngularAcceleration);
    bufferFree(&bufferHumidity);
    bufferFree(&bufferTemperature);
    bufferFree(&bufferLight);
    bufferFree(&bufferAlert);
}
void morph3d(Param out, const Param in, const Param mask) {
    std::string refName = std::string("morph3d_") +
                          std::string(dtype_traits<T>::getName()) +
                          std::to_string(isDilation) + std::to_string(SeLength);

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::string options = generateOptionsString<T, isDilation, SeLength>();

        const char* ker_strs[] = {morph_cl};
        const int   ker_lens[] = {morph_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options);

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "morph3d");

        addKernelToCache(device, refName, entry);
    }

    auto morphOp = KernelFunctor<Buffer, KParam, Buffer, KParam,
                                 Buffer, cl::LocalSpaceArg, int>(*entry.ker);

    NDRange local(CUBE_X, CUBE_Y, CUBE_Z);

    int blk_x = divup(in.info.dims[0], CUBE_X);
    int blk_y = divup(in.info.dims[1], CUBE_Y);
    int blk_z = divup(in.info.dims[2], CUBE_Z);

    // launch batch * blk_x blocks along x dimension
    NDRange global(blk_x * CUBE_X * in.info.dims[3],
                   blk_y * CUBE_Y,
                   blk_z * CUBE_Z);

    // copy mask/filter to constant memory
    cl_int se_size    = sizeof(T) * SeLength * SeLength * SeLength;
    cl::Buffer *mBuff = bufferAlloc(se_size);
    getQueue().enqueueCopyBuffer(*mask.data, *mBuff, 0, 0, se_size);

    // calculate shared memory size
    const int padding = (SeLength % 2 == 0 ? (SeLength - 1) : (2 * (SeLength / 2)));
    const int locLen  = CUBE_X + padding + 1;
    const int locArea = locLen * (CUBE_Y + padding);
    const int locSize = locArea * (CUBE_Z + padding);

    morphOp(EnqueueArgs(getQueue(), global, local),
            *out.data, out.info, *in.data, in.info,
            *mBuff, cl::Local(locSize * sizeof(T)), blk_x);

    bufferFree(mBuff);
    CL_DEBUG_FINISH(getQueue());
}
unsigned nonMaximal(cl::Buffer* x_out, cl::Buffer* y_out, cl::Buffer* resp_out,
                    const unsigned idim0, const unsigned idim1,
                    const cl::Buffer* resp_in, const unsigned edge,
                    const unsigned max_corners) {
    unsigned corners_found = 0;

    std::string refName = std::string("non_maximal_") +
                          std::string(dtype_traits<T>::getName());

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, refName);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName() << " -D NONMAX";
        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value)
            options << " -D USE_DOUBLE";

        const char* ker_strs[] = {susan_cl};
        const int   ker_lens[] = {susan_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());

        entry.prog = new Program(prog);
        entry.ker  = new Kernel(*entry.prog, "non_maximal");

        addKernelToCache(device, refName, entry);
    }

    cl::Buffer* d_corners_found = bufferAlloc(sizeof(unsigned));
    getQueue().enqueueWriteBuffer(*d_corners_found, CL_TRUE, 0,
                                  sizeof(unsigned), &corners_found);

    auto nonMaximalOp = KernelFunctor<Buffer, Buffer, Buffer, Buffer,
                                      unsigned, unsigned, Buffer,
                                      unsigned, unsigned>(*entry.ker);

    NDRange local(SUSAN_THREADS_X, SUSAN_THREADS_Y);
    NDRange global(divup(idim0 - 2 * edge, local[0]) * local[0],
                   divup(idim1 - 2 * edge, local[1]) * local[1]);

    nonMaximalOp(EnqueueArgs(getQueue(), global, local),
                 *x_out, *y_out, *resp_out, *d_corners_found,
                 idim0, idim1, *resp_in, edge, max_corners);

    getQueue().enqueueReadBuffer(*d_corners_found, CL_TRUE, 0,
                                 sizeof(unsigned), &corners_found);

    bufferFree(d_corners_found);

    return corners_found;
}
bool rtspRequestAddHeaderField(RTSPRequest *rtspRequest, const char *fieldName, const char *fieldValue) {
    size_t fieldNameLength;
    size_t fieldValueLength;
    int charsWritten;

    /* Allocate initial buffer if required */
    if(rtspRequest->headerBuffer == NULL) {
        rtspRequest->maxHeaderBufferSize = HEADER_BUFFER_INITIAL_SIZE;
        if(!bufferAllocate(&rtspRequest->headerBuffer, rtspRequest->maxHeaderBufferSize, "RTSP request header buffer")) {
            return false;
        }
        rtspRequest->headerBuffer[0] = '\0';
        rtspRequest->headerBufferSize = 1; /* The '\0' byte */
    }

    /* Decide if enough space is available in the buffer and add space if
       necessary. Add 4 bytes for ": " and "\r\n". */
    fieldNameLength = strlen(fieldName);
    fieldValueLength = strlen(fieldValue);
    if(!bufferMakeRoom(&rtspRequest->headerBuffer, &rtspRequest->maxHeaderBufferSize, rtspRequest->headerBufferSize,
                       fieldNameLength + fieldValueLength + 4, HEADER_BUFFER_INCREMENT_SIZE)) {
        logWrite(LOG_LEVEL_ERROR, LOG_COMPONENT_NAME, "Cannot allocate memory to add field \"%s\" to RTSP Request header.", fieldName);
        bufferFree(&rtspRequest->headerBuffer);
        rtspRequest->headerBuffer = NULL;
        rtspRequest->headerBufferSize = 0;
        return false;
    }

    /* Add field name and value to buffer */
    /* Offset (headerBufferSize - 1) to overwrite the existing '\0' byte. A new
       '\0' byte will be added at the end. */
    charsWritten = snprintf((char *)rtspRequest->headerBuffer + rtspRequest->headerBufferSize - 1,
                            rtspRequest->maxHeaderBufferSize - (rtspRequest->headerBufferSize - 1),
                            "%s: %s\r\n", fieldName, fieldValue);
    if(charsWritten < 0 || (size_t)charsWritten != fieldNameLength + fieldValueLength + 4) {
        logWrite(LOG_LEVEL_ERROR, LOG_COMPONENT_NAME, "Cannot add field \"%s\" to RTSP Request header.", fieldName);
        return false;
    }

    /* Update buffer data */
    rtspRequest->headerBufferSize += charsWritten;

    return true;
}
static void computerTransmitTaskLoop(void *parameters) {
    struct dataQueueEntry entry;

    while(1) {
        // Wait for available data
        xQueueReceive(computerOutputQueue, &entry, portMAX_DELAY);

        sendByteSerial(START_BYTE);

        unsigned char csum = 0;
        sendByteSerial(entry.length);
        csum += entry.length;

        int i;
        for (i = 0; i < entry.length; i++) {
            sendByteSerial(entry.buffer->data[i]);
            csum += entry.buffer->data[i];
        }
        sendByteSerial(255 - csum);

        bufferFree(entry.buffer);
    }
}
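/*
 * Hypothetical receiver-side sketch (not part of this codebase) of the frame
 * emitted above: START_BYTE, a length byte, the payload, then a checksum byte
 * chosen so that length + payload + checksum sums to 255 modulo 256.
 * readByteSerial() is an assumed counterpart to sendByteSerial().
 */
static int receiveFrame(unsigned char *payload, int maxLen) {
    while (readByteSerial() != START_BYTE) {}   /* resynchronize on start byte */

    unsigned char csum = 0;
    unsigned char length = readByteSerial();
    csum += length;
    if (length > maxLen) return -1;             /* frame too large for caller */

    int i;
    for (i = 0; i < length; i++) {
        payload[i] = readByteSerial();
        csum += payload[i];
    }

    /* The sender transmitted 255 - csum, so the running sum must land on 255. */
    csum += readByteSerial();
    return (csum == 255) ? length : -1;
}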
static void where(Param &out, Param &in) {
    uint threads_x = nextpow2(std::max(32u, (uint)in.info.dims[0]));
    threads_x      = std::min(threads_x, THREADS_PER_GROUP);
    uint threads_y = THREADS_PER_GROUP / threads_x;

    uint groups_x = divup(in.info.dims[0], threads_x * REPEAT);
    uint groups_y = divup(in.info.dims[1], threads_y);

    Param rtmp;
    Param otmp;
    rtmp.info.dims[0]    = groups_x;
    otmp.info.dims[0]    = in.info.dims[0];
    rtmp.info.strides[0] = 1;
    otmp.info.strides[0] = 1;
    rtmp.info.offset     = 0;
    otmp.info.offset     = 0;

    for (int k = 1; k < 4; k++) {
        rtmp.info.dims[k]    = in.info.dims[k];
        rtmp.info.strides[k] = rtmp.info.strides[k - 1] * rtmp.info.dims[k - 1];
        otmp.info.dims[k]    = in.info.dims[k];
        otmp.info.strides[k] = otmp.info.strides[k - 1] * otmp.info.dims[k - 1];
    }

    int rtmp_elements = rtmp.info.strides[3] * rtmp.info.dims[3];
    rtmp.data         = bufferAlloc(rtmp_elements * sizeof(uint));

    int otmp_elements = otmp.info.strides[3] * otmp.info.dims[3];
    otmp.data         = bufferAlloc(otmp_elements * sizeof(uint));

    scan_first_launcher<T, uint, af_notzero_t>(otmp, rtmp, in, false,
                                               groups_x, groups_y, threads_x);

    // Linearize the dimensions and perform scan
    Param ltmp = rtmp;
    ltmp.info.offset  = 0;
    ltmp.info.dims[0] = rtmp_elements;
    for (int k = 1; k < 4; k++) {
        ltmp.info.dims[k]    = 1;
        ltmp.info.strides[k] = rtmp_elements;
    }

    scan_first<uint, uint, af_add_t>(ltmp, ltmp);

    // Get output size and allocate output
    uint total;
    getQueue().enqueueReadBuffer(*rtmp.data, CL_TRUE,
                                 sizeof(uint) * (rtmp_elements - 1),
                                 sizeof(uint), &total);

    out.data            = bufferAlloc(total * sizeof(uint));
    out.info.dims[0]    = total;
    out.info.strides[0] = 1;
    for (int k = 1; k < 4; k++) {
        out.info.dims[k]    = 1;
        out.info.strides[k] = total;
    }

    if (total > 0) get_out_idx<T>(out.data, otmp, rtmp, in,
                                  threads_x, groups_x, groups_y);

    bufferFree(rtmp.data);
    bufferFree(otmp.data);
}
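/*
 * Illustrative CPU sketch (hypothetical names) of the scan-based compaction
 * that where() performs on the device: a prefix sum over the nonzero flags
 * gives each surviving element its output slot, and the last scan value is
 * the total number of matches.
 */
#include <vector>

static std::vector<unsigned> whereReference(const std::vector<int> &in) {
    std::vector<unsigned> scan(in.size());
    unsigned running = 0;
    for (size_t i = 0; i < in.size(); ++i) {
        running += (in[i] != 0);   // count nonzero entries
        scan[i] = running;         // inclusive scan of the flags
    }
    std::vector<unsigned> idx(running);
    for (size_t i = 0; i < in.size(); ++i) {
        // scan[i] - 1 is the output slot of the i-th surviving element
        if (in[i] != 0) idx[scan[i] - 1] = (unsigned)i;
    }
    return idx;
}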
/** Frees a packet of bytes. */
static void outgoingPacketFreeBytes(OutgoingPacket* packet) {
    ALOGD("Freeing outgoing packet.");
    bufferFree(packet->bytes);
    free(packet);
}
void csrmm_nt(Param out, const Param &values, const Param &rowIdx,
              const Param &colIdx, const Param &rhs,
              const T alpha, const T beta) {
    bool use_alpha = (alpha != scalar<T>(1.0));
    bool use_beta  = (beta != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    bool use_greedy = false;

    std::string ref_name = std::string("csrmm_nt_") +
                           std::string(dtype_traits<T>::getName()) +
                           std::string("_") + std::to_string(use_alpha) +
                           std::string("_") + std::to_string(use_beta) +
                           std::string("_") + std::to_string(use_greedy);

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D USE_ALPHA=" << use_alpha;
        options << " -D USE_BETA=" << use_beta;
        options << " -D USE_GREEDY=" << use_greedy;
        options << " -D THREADS_PER_GROUP=" << THREADS_PER_GROUP;

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }
        if (std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value) {
            options << " -D IS_CPLX=1";
        } else {
            options << " -D IS_CPLX=0";
        }

        const char *ker_strs[] = {csrmm_cl};
        const int   ker_lens[] = {csrmm_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());

        entry.prog   = new Program(prog);
        entry.ker    = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmm_nt");
        // FIXME: Change this after adding another kernel
        entry.ker[1] = Kernel(*entry.prog, "csrmm_nt");

        addKernelToCache(device, ref_name, entry);
    }

    auto csrmm_nt_kernel = entry.ker[0];
    auto csrmm_nt_func   = KernelFunctor<Buffer, Buffer, Buffer, Buffer,
                                         int, int, Buffer, KParam,
                                         T, T, Buffer>(csrmm_nt_kernel);

    NDRange local(THREADS_PER_GROUP, 1);

    int M = rowIdx.info.dims[0] - 1;
    int N = rhs.info.dims[0];

    int groups_x = divup(N, local[0]);
    int groups_y = divup(M, REPEAT);
    groups_y     = std::min(groups_y, MAX_CSRMM_GROUPS);
    NDRange global(local[0] * groups_x, local[1] * groups_y);

    std::vector<int> count(groups_x);
    cl::Buffer *counter = bufferAlloc(count.size() * sizeof(int));
    getQueue().enqueueWriteBuffer(*counter, CL_TRUE, 0,
                                  count.size() * sizeof(int),
                                  (void *)count.data());

    csrmm_nt_func(EnqueueArgs(getQueue(), global, local),
                  *out.data, *values.data, *rowIdx.data, *colIdx.data,
                  M, N, *rhs.data, rhs.info, alpha, beta, *counter);

    bufferFree(counter);
}
void fast(unsigned* out_feat, Param &x_out, Param &y_out, Param &score_out,
          Param in, const float thr, const float feature_ratio,
          const unsigned edge) {
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> fastProgs;
        static std::map<int, Kernel*>  lfKernel;
        static std::map<int, Kernel*>  nmKernel;
        static std::map<int, Kernel*>  gfKernel;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device]() {
            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D ARC_LENGTH=" << arc_length
                    << " -D NONMAX=" << static_cast<unsigned>(nonmax);
            if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }
            cl::Program prog;
            buildProgram(prog, fast_cl, fast_cl_len, options.str());
            fastProgs[device] = new Program(prog);
            lfKernel[device] = new Kernel(*fastProgs[device], "locate_features");
            nmKernel[device] = new Kernel(*fastProgs[device], "non_max_counts");
            gfKernel[device] = new Kernel(*fastProgs[device], "get_features");
        });

        const unsigned max_feat =
            ceil(in.info.dims[0] * in.info.dims[1] * feature_ratio);

        // Matrix containing scores for detected features; scores are stored in
        // the same coordinates as the features, so its dimensions equal in's.
        cl::Buffer *d_score =
            bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(float));
        std::vector<float> score_init(in.info.dims[0] * in.info.dims[1], (float)0);
        getQueue().enqueueWriteBuffer(*d_score, CL_TRUE, 0,
                                      in.info.dims[0] * in.info.dims[1] * sizeof(float),
                                      &score_init[0]);

        cl::Buffer *d_flags = d_score;
        if (nonmax) {
            d_flags = bufferAlloc(in.info.dims[0] * in.info.dims[1] * sizeof(T));
        }

        const int blk_x = divup(in.info.dims[0] - edge * 2, FAST_THREADS_X);
        const int blk_y = divup(in.info.dims[1] - edge * 2, FAST_THREADS_Y);

        // Locate features kernel sizes
        const NDRange local(FAST_THREADS_X, FAST_THREADS_Y);
        const NDRange global(blk_x * FAST_THREADS_X, blk_y * FAST_THREADS_Y);

        auto lfOp = make_kernel<Buffer, KParam, Buffer, const float,
                                const unsigned, LocalSpaceArg>(*lfKernel[device]);

        lfOp(EnqueueArgs(getQueue(), global, local),
             *in.data, in.info, *d_score, thr, edge,
             cl::Local((FAST_THREADS_X + 6) * (FAST_THREADS_Y + 6) * sizeof(T)));
        CL_DEBUG_FINISH(getQueue());

        const int blk_nonmax_x = divup(in.info.dims[0], 64);
        const int blk_nonmax_y = divup(in.info.dims[1], 64);

        // Nonmax kernel sizes
        const NDRange local_nonmax(FAST_THREADS_NONMAX_X, FAST_THREADS_NONMAX_Y);
        const NDRange global_nonmax(blk_nonmax_x * FAST_THREADS_NONMAX_X,
                                    blk_nonmax_y * FAST_THREADS_NONMAX_Y);

        unsigned count_init = 0;
        cl::Buffer *d_total = bufferAlloc(sizeof(unsigned));
        getQueue().enqueueWriteBuffer(*d_total, CL_TRUE, 0,
                                      sizeof(unsigned), &count_init);

        size_t blocks_sz = blk_nonmax_x * FAST_THREADS_NONMAX_X *
                           blk_nonmax_y * FAST_THREADS_NONMAX_Y * sizeof(unsigned);
        cl::Buffer *d_counts  = bufferAlloc(blocks_sz);
        cl::Buffer *d_offsets = bufferAlloc(blocks_sz);

        auto nmOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer,
                                KParam, const unsigned>(*nmKernel[device]);

        nmOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax),
             *d_counts, *d_offsets, *d_total, *d_flags, *d_score, in.info, edge);
        CL_DEBUG_FINISH(getQueue());

        unsigned total;
        getQueue().enqueueReadBuffer(*d_total, CL_TRUE, 0, sizeof(unsigned), &total);
        total = total < max_feat ? total : max_feat;

        if (total > 0) {
            size_t out_sz  = total * sizeof(float);
            x_out.data     = bufferAlloc(out_sz);
            y_out.data     = bufferAlloc(out_sz);
            score_out.data = bufferAlloc(out_sz);

            auto gfOp = make_kernel<Buffer, Buffer, Buffer, Buffer, Buffer,
                                    Buffer, KParam, const unsigned,
                                    const unsigned>(*gfKernel[device]);

            gfOp(EnqueueArgs(getQueue(), global_nonmax, local_nonmax),
                 *x_out.data, *y_out.data, *score_out.data,
                 *d_flags, *d_counts, *d_offsets, in.info, total, edge);
            CL_DEBUG_FINISH(getQueue());
        }

        *out_feat = total;

        x_out.info.dims[0]        = total;
        x_out.info.strides[0]     = 1;
        y_out.info.dims[0]        = total;
        y_out.info.strides[0]     = 1;
        score_out.info.dims[0]    = total;
        score_out.info.strides[0] = 1;

        for (int k = 1; k < 4; k++) {
            x_out.info.dims[k]        = 1;
            x_out.info.strides[k]     = total;
            y_out.info.dims[k]        = 1;
            y_out.info.strides[k]     = total;
            score_out.info.dims[k]    = 1;
            score_out.info.strides[k] = total;
        }

        bufferFree(d_score);
        if (nonmax) bufferFree(d_flags);
        bufferFree(d_total);
        bufferFree(d_counts);
        bufferFree(d_offsets);
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
void memFree(T *ptr) { return bufferFree((cl::Buffer *)ptr); }
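/*
 * Illustrative counterpart (assumed, not shown in this collection): an
 * allocator like the sketch below would hand out the cl::Buffer* obtained
 * from bufferAlloc as an opaque T*, which is why memFree above can simply
 * cast back before calling bufferFree.
 */
template<typename T>
T *memAlloc(const size_t &elements) {
    // The returned pointer is a handle to a cl::Buffer, not a host pointer;
    // it must only be released through memFree.
    return (T *)bufferAlloc(elements * sizeof(T));
}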
void morph(Param out, const Param in, const Param mask) {
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static std::map<int, Program*> morProgs;
        static std::map<int, Kernel*>  morKernels;

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device]() {
            ToNumStr<T> toNumStr;
            T init = isDilation ? Binary<T, af_max_t>().init()
                                : Binary<T, af_min_t>().init();

            std::ostringstream options;
            options << " -D T="          << dtype_traits<T>::getName()
                    << " -D isDilation=" << isDilation
                    << " -D init="       << toNumStr(init)
                    << " -D windLen="    << windLen;
            if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }
            Program prog;
            buildProgram(prog, morph_cl, morph_cl_len, options.str());
            morProgs[device]   = new Program(prog);
            morKernels[device] = new Kernel(*morProgs[device], "morph");
        });

        auto morphOp = KernelFunctor<Buffer, KParam, Buffer, KParam,
                                     Buffer, cl::LocalSpaceArg,
                                     int, int>(*morKernels[device]);

        NDRange local(THREADS_X, THREADS_Y);

        int blk_x = divup(in.info.dims[0], THREADS_X);
        int blk_y = divup(in.info.dims[1], THREADS_Y);

        // launch batch * blk_x blocks along x dimension
        NDRange global(blk_x * THREADS_X * in.info.dims[2],
                       blk_y * THREADS_Y * in.info.dims[3]);

        // copy mask/filter to constant memory
        cl_int se_size    = sizeof(T) * windLen * windLen;
        cl::Buffer *mBuff = bufferAlloc(se_size);
        getQueue().enqueueCopyBuffer(*mask.data, *mBuff, 0, 0, se_size);

        // calculate shared memory size
        const int halo    = windLen / 2;
        const int padding = 2 * halo;
        const int locLen  = THREADS_X + padding + 1;
        const int locSize = locLen * (THREADS_Y + padding);

        morphOp(EnqueueArgs(getQueue(), global, local),
                *out.data, out.info, *in.data, in.info,
                *mBuff, cl::Local(locSize * sizeof(T)), blk_x, blk_y);

        bufferFree(mBuff);
        CL_DEBUG_FINISH(getQueue());
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
void nearest_neighbour(Param idx, Param dist, Param query, Param train,
                       const dim_t dist_dim, const unsigned n_dist) {
    try {
        const unsigned feat_len = query.info.dims[dist_dim];
        const To max_dist       = maxval<To>();

        // Determine the maximum feat_len capable of using shared memory (faster)
        cl_ulong avail_lmem = getDevice().getInfo<CL_DEVICE_LOCAL_MEM_SIZE>();
        size_t lmem_predef  = 2 * THREADS * sizeof(unsigned) + feat_len * sizeof(T);
        size_t ltrain_sz    = THREADS * feat_len * sizeof(T);
        bool use_lmem       = (avail_lmem >= (lmem_predef + ltrain_sz));
        size_t lmem_sz      = (use_lmem) ? lmem_predef + ltrain_sz : lmem_predef;

        unsigned unroll_len = nextpow2(feat_len);
        if (unroll_len != feat_len) unroll_len = 0;

        std::string ref_name = std::string("knn_") +
                               std::to_string(dist_type) + std::string("_") +
                               std::to_string(use_lmem) + std::string("_") +
                               std::string(dtype_traits<T>::getName()) +
                               std::string("_") + std::to_string(unroll_len);

        int device = getActiveDeviceId();
        kc_t::iterator cache_idx = kernelCaches[device].find(ref_name);

        kc_entry_t entry;

        if (cache_idx == kernelCaches[device].end()) {
            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D To=" << dtype_traits<To>::getName()
                    << " -D THREADS=" << THREADS
                    << " -D FEAT_LEN=" << unroll_len;

            switch(dist_type) {
                case AF_SAD: options << " -D DISTOP=_sad_"; break;
                case AF_SSD: options << " -D DISTOP=_ssd_"; break;
                case AF_SHD: options << " -D DISTOP=_shd_ -D __SHD__"; break;
                default: break;
            }

            if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }

            if (use_lmem) options << " -D USE_LOCAL_MEM";

            cl::Program prog;
            buildProgram(prog, nearest_neighbour_cl, nearest_neighbour_cl_len, options.str());

            entry.prog   = new Program(prog);
            entry.ker    = new Kernel[3];
            entry.ker[0] = Kernel(*entry.prog, "nearest_neighbour_unroll");
            entry.ker[1] = Kernel(*entry.prog, "nearest_neighbour");
            entry.ker[2] = Kernel(*entry.prog, "select_matches");

            kernelCaches[device][ref_name] = entry;
        } else {
            entry = cache_idx->second;
        }

        const dim_t sample_dim = (dist_dim == 0) ? 1 : 0;

        const unsigned nquery = query.info.dims[sample_dim];
        const unsigned ntrain = train.info.dims[sample_dim];

        unsigned nblk = divup(ntrain, THREADS);
        const NDRange local(THREADS, 1);
        const NDRange global(nblk * THREADS, 1);

        cl::Buffer *d_blk_idx  = bufferAlloc(nblk * nquery * sizeof(unsigned));
        cl::Buffer *d_blk_dist = bufferAlloc(nblk * nquery * sizeof(To));

        // For each query vector, find the training vector with the smallest
        // distance per work-group
        if (unroll_len > 0) {
            auto huOp = KernelFunctor<Buffer, Buffer, Buffer, KParam, Buffer, KParam,
                                      const To, LocalSpaceArg>(entry.ker[0]);

            huOp(EnqueueArgs(getQueue(), global, local),
                 *d_blk_idx, *d_blk_dist,
                 *query.data, query.info, *train.data, train.info,
                 max_dist, cl::Local(lmem_sz));
        } else {
            auto hmOp = KernelFunctor<Buffer, Buffer, Buffer, KParam, Buffer, KParam,
                                      const To, const unsigned, LocalSpaceArg>(entry.ker[1]);

            hmOp(EnqueueArgs(getQueue(), global, local),
                 *d_blk_idx, *d_blk_dist,
                 *query.data, query.info, *train.data, train.info,
                 max_dist, feat_len, cl::Local(lmem_sz));
        }
        CL_DEBUG_FINISH(getQueue());

        const NDRange local_sm(32, 8);
        const NDRange global_sm(divup(nquery, 32) * 32, 8);

        // Reduce the smallest distances from each work-group and store the
        // final best match
        auto smOp = KernelFunctor<Buffer, Buffer, Buffer, Buffer,
                                  const unsigned, const unsigned, const To>(entry.ker[2]);

        smOp(EnqueueArgs(getQueue(), global_sm, local_sm),
             *idx.data, *dist.data, *d_blk_idx, *d_blk_dist,
             nquery, nblk, max_dist);
        CL_DEBUG_FINISH(getQueue());

        bufferFree(d_blk_idx);
        bufferFree(d_blk_dist);
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
/* configure supplicant & update driver according to new mode of operation */
void wsc_supplicant_associate(void* h_wpa_s, void* h_ssid, u32 WscMode) {
    int i;
    bufferObj probeReqBuf;
    struct wpa_supplicant *wpa_s = h_wpa_s;
    struct wpa_ssid *ssid = h_ssid;

    /* If the current state is WSC_STATE_SUCCESS, this is the second
       association in the WPS sequence and we are not supposed to start the
       EAP-WSC mechanism. */
    if(WscSupplicantConfig.smState == WSC_STATE_SUCCESS) {
        WscSupplicantConfig.smState = WSC_STATE_IDLE;
        return;
    }

    /* If the current state is not WSC_STATE_IDLE, this is an interruption of
       the current WPS session. */
    if(WscSupplicantConfig.smState != WSC_STATE_IDLE) {
        if(WscSupplicantConfig.WscMode == WSC_MODE_PBC) {
            /* If the supplicant is in the middle of a Simple Config handshake
               in PBC mode, cancel the registration to the 2 min. walk-time
               push-button timeout */
            eloop_cancel_timeout(wsc_supplicant_PushButtonWalktimeTimeout, NULL, NULL);
        }
        /* update driver */
        wpa_drv_set_wsc_mode(h_wpa_s, WSC_MODE_OFF, NULL, 0);
    }

    wpa_printf(MSG_INFO,"wsc_supplicant: Entered wsc_supplicant_associate");

    /* init params */
    WscSupplicantConfig.version = DEF_CONFIG_VERSION;
    WscSupplicantConfig.configMethods = DEF_CONFIG_CONFIG_METHODS;
    {
        char temp[10];
        char *p = DEF_CONFIG_UUID;

        temp[0] = '0';
        temp[1] = 'x';
        /* move past the '0x' on the first pass */
        for (i = 0; i <= 15; i++) {
            p += 2;
            strncpy(&temp[2], p, 2);
            WscSupplicantConfig.uuidE[i] = (u8) (strtoul(temp, NULL, 16));
        }
    }
    WscSupplicantConfig.primaryDeviceType.category_id = DEF_CONFIG_PRIMARY_DEV_CATEGORY;
    WscSupplicantConfig.primaryDeviceType.oui = DEF_CONFIG_PRIMARY_DEV_OUI;
    WscSupplicantConfig.primaryDeviceType.sub_category_id = DEF_CONFIG_PRIMARY_DEV_SUB_CATEGORY;
    WscSupplicantConfig.rfBand = DEF_CONFIG_RF_BAND;
    WscSupplicantConfig.devicePasswordId = WSC_DEVICEPWDID_DEFAULT; /* Default (PIN) */
    WscSupplicantConfig.authenticationTypeFlags = DEF_CONFIG_AUTH_TYPE_FLAGS;
    memcpy(WscSupplicantConfig.macAddress, wpa_s->own_addr, ETH_ALEN);
    WscSupplicantConfig.encryptionTypeFlags = DEF_CONFIG_ENCR_TYPE_FLAGS;
    WscSupplicantConfig.connectionTypeFlags = DEF_CONFIG_CONN_TYPE_FLAGS;
    WscSupplicantConfig.state = 0;
    WscSupplicantConfig.assocState = 0;
    WscSupplicantConfig.configError = 0;
    sprintf(WscSupplicantConfig.manufacturer, "%s", DEF_CONFIG_MANUFACTURER);
    sprintf(WscSupplicantConfig.modelName, "%s", DEF_CONFIG_MODEL_NAME);
    sprintf(WscSupplicantConfig.modelNumber, "%s", DEF_CONFIG_MODEL_NUMBER);
    sprintf(WscSupplicantConfig.serialNumber, "%s", DEF_CONFIG_SERIAL_NUMBER);
    sprintf(WscSupplicantConfig.deviceName, "%s", DEF_CONFIG_DEVICE_NAME);
    WscSupplicantConfig.osVersion = DEF_CONFIG_OS_VERSION;
    WscSupplicantConfig.ssid = ssid;
    WscSupplicantConfig.wpa_s = wpa_s;

    switch (WscMode) {
    case WSC_MODE_PIN:
        if(!ssid->wsc_pin) {
            char c_devPwd[32];
            u8 devPwd[10];
            u32 val;
            u32 checksum;

            RAND_bytes(devPwd, LONG_PIN_LEN);
            sprintf(c_devPwd, "%08u", *(u32 *)devPwd);
            /* Compute the checksum */
            c_devPwd[7] = '\0';
            val = strtoul(c_devPwd, NULL, 10);
            checksum = wsc_supplicant_ComputeChecksum(val);
            val = val * 10 + checksum;
            sprintf((char *)(WscSupplicantConfig.password), "%d", val);
            WscSupplicantConfig.password[LONG_PIN_LEN] = '\0';

            wpa_printf(MSG_INFO, "Random PIN: %c-%c-%c-%c-%c-%c-%c-%c\n",
                       WscSupplicantConfig.password[0], WscSupplicantConfig.password[1],
                       WscSupplicantConfig.password[2], WscSupplicantConfig.password[3],
                       WscSupplicantConfig.password[4], WscSupplicantConfig.password[5],
                       WscSupplicantConfig.password[6], WscSupplicantConfig.password[7]);
        } else {
            strcpy((char *)(WscSupplicantConfig.password), ssid->wsc_pin);
        }
        break;

    case WSC_MODE_PBC:
        for (i = 0; i < LONG_PIN_LEN; i++) {
            WscSupplicantConfig.password[i] = '0';
        }
        WscSupplicantConfig.password[LONG_PIN_LEN] = '\0';
        WscSupplicantConfig.configMethods |= WSC_CONFMET_PBC;
        WscSupplicantConfig.devicePasswordId = WSC_DEVICEPWDID_PUSH_BTN;
        /* Register for the 2 min. walk-time push-button timeout */
        eloop_register_timeout(120, 0, wsc_supplicant_PushButtonWalktimeTimeout, NULL, NULL);
        break;

    default:
        wpa_printf(MSG_ERROR,"wsc_supplicant: wsc_supplicant_associate: ERROR: Incompatible Simple Config Mode received in scStartEnrollee: (%d)", WscMode);
        WscSupplicantConfig.smState = WSC_STATE_IDLE;
    }

    bufferCreateChunk(&probeReqBuf);
    wsc_supplicant_BuildProbeRequest(&WscSupplicantConfig, &probeReqBuf);
    wpa_drv_set_wsc_mode(h_wpa_s, WscMode, bufferGetBuf(&probeReqBuf), bufferLength(&probeReqBuf));
    bufferFree(&probeReqBuf);

    ssid->key_mgmt = WPA_KEY_MGMT_IEEE8021X;
    ssid->auth_alg = WPA_AUTH_ALG_OPEN;
    ssid->proto = WPA_PROTO_WPA;
    ssid->pairwise_cipher = WPA_CIPHER_TKIP;
    ssid->group_cipher = WPA_CIPHER_TKIP;
    ssid->identity = (u8 *) strdup(ENROLLEE_ID_STRING);
    ssid->identity_len = strlen(ENROLLEE_ID_STRING);
    WscSupplicantConfig.WscMode = WscMode;
    WscSupplicantConfig.smState = WSC_STATE_ASSOC;
}
int audioBufferOpen(int frequency, int stereo, int volume) {
    struct ringBuffer audioBuffer;
    int inFd,outFd,ctlFd,cnt,pid;
    int inputFinished=FALSE;
    int percentFull;
    fd_set inFdSet,outFdSet;
    fd_set *outFdPtr;
    struct timeval timeout;
    int filedes[2];
    int controldes[2];

    if (pipe(filedes) || pipe(controldes)) {
        perror("pipe");
        exit(-1);
    }

    if ((pid=fork())!=0) { /* if we are the parent */
        control_fd=controldes[1];
        close(filedes[0]);
        buffer_fd=filedes[1];
        close(controldes[0]);
        return(pid);       /* return the pid */
    }

    /* we are the child */
    close(filedes[1]);
    inFd=filedes[0];
    close(controldes[1]);
    ctlFd=controldes[0];
    audioOpen(frequency,stereo,volume);
    outFd=getAudioFd();
    initBuffer(&audioBuffer);

    while(1) {
        timeout.tv_sec=0;
        timeout.tv_usec=0;
        FD_ZERO(&inFdSet);
        FD_ZERO(&outFdSet);
        FD_SET(ctlFd,&inFdSet);
        FD_SET(outFd,&outFdSet);

        if (bufferSize(&audioBuffer)<AUSIZ) { /* is the buffer too empty */
            outFdPtr = NULL;                  /* yes, don't try to write */
            if (inputFinished)                /* no more input, buffer exhausted -> exit */
                break;
        } else
            outFdPtr=&outFdSet;               /* no, select on write */

        /* check we have at least AUSIZ bytes left (don't want <1k bits) */
        if ((bufferFree(&audioBuffer)>=AUSIZ) && !inputFinished)
            FD_SET(inFd,&inFdSet);

        /* The following selects() are basically all that is left of the
           system-dependent code outside the audioIO_* files. These selects
           really need to be moved into the audioIO_*.c files and replaced with
           a function like audioIOReady(inFd, &checkIn, &checkAudio, wait)
           which checks the status of the input or audio output if checkIn or
           checkAudio are set, and returns with checkIn or checkAudio set to
           TRUE or FALSE depending on whether or not data is available. If wait
           is FALSE the function should return immediately; if wait is TRUE the
           process should BLOCK until the required condition is met.

           NB: The process MUST relinquish the CPU during this check or it will
           gobble up all the available CPU, which rather defeats the purpose of
           the buffer. This is tricky for platforms that don't have file
           descriptors (and select) to do the job. In that case a buffer
           implemented using threads should work. The way things are set up
           now, a threaded version shouldn't be too hard to implement. When I
           get some time... */

        /* check if we can read or write */
        if (select(MAX3(inFd,outFd,ctlFd)+1,&inFdSet,outFdPtr,NULL,NULL) > -1) {
            if (outFdPtr && FD_ISSET(outFd,outFdPtr)) { /* need to write */
                int bytesToEnd = AUDIO_BUFFER_SIZE - audioBuffer.outPos;
                percentFull=100*bufferSize(&audioBuffer)/AUDIO_BUFFER_SIZE;
                if (AUSIZ>bytesToEnd) {
                    cnt = audioWrite(audioBuffer.bufferPtr + audioBuffer.outPos, bytesToEnd);
                    cnt += audioWrite(audioBuffer.bufferPtr, AUSIZ - bytesToEnd);
                    audioBuffer.outPos = AUSIZ - bytesToEnd;
                } else {
                    cnt = audioWrite(audioBuffer.bufferPtr + audioBuffer.outPos, AUSIZ);
                    audioBuffer.outPos += AUSIZ;
                }
            }
            if (FD_ISSET(inFd,&inFdSet)) { /* need to read */
                cnt = read(inFd, audioBuffer.bufferPtr + audioBuffer.inPos,
                           MIN(AUSIZ, AUDIO_BUFFER_SIZE - audioBuffer.inPos));
                if (cnt >= 0) {
                    audioBuffer.inPos = (audioBuffer.inPos + cnt) % AUDIO_BUFFER_SIZE;
                    if (cnt==0)
                        inputFinished=TRUE;
                } else
                    _exit(-1);
            }
            if (FD_ISSET(ctlFd,&inFdSet)) {
                int dummy;
                cnt = read(ctlFd, &dummy, sizeof dummy);
                if (cnt >= 0) {
                    audioBuffer.inPos = audioBuffer.outPos = 0;
                    audioFlush();
                } else
                    _exit(-1);
            }
        } else
            _exit(-1);
    }

    close(inFd);
    audioClose();
    exit(0);
    return 0; /* just to get rid of warnings */
}
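/*
 * Note that bufferFree() here reports free space in a ring buffer rather than
 * releasing memory, unlike the deallocating bufferFree() calls elsewhere in
 * this collection. A plausible sketch of the helpers assumed above (the real
 * initBuffer/bufferSize/bufferFree are not shown in this excerpt):
 */
int bufferSize(struct ringBuffer *b) {
    /* bytes currently stored; inPos may have wrapped around past outPos */
    return (b->inPos - b->outPos + AUDIO_BUFFER_SIZE) % AUDIO_BUFFER_SIZE;
}

int bufferFree(struct ringBuffer *b) {
    /* keep one slot unused so a full buffer is distinguishable from empty */
    return AUDIO_BUFFER_SIZE - bufferSize(b) - 1;
}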
void csrmv(Param out, const Param &values, const Param &rowIdx,
           const Param &colIdx, const Param &rhs,
           const T alpha, const T beta) {
    bool use_alpha = (alpha != scalar<T>(1.0));
    bool use_beta  = (beta != scalar<T>(0.0));

    // Using greedy indexing is causing performance issues on many platforms
    // FIXME: Figure out why
    bool use_greedy = false;

    // FIXME: Find a better number based on average non zeros per row
    int threads = 64;

    std::string ref_name = std::string("csrmv_") +
                           std::string(dtype_traits<T>::getName()) +
                           std::string("_") + std::to_string(use_alpha) +
                           std::string("_") + std::to_string(use_beta) +
                           std::string("_") + std::to_string(use_greedy) +
                           std::string("_") + std::to_string(threads);

    int device       = getActiveDeviceId();
    kc_entry_t entry = kernelCache(device, ref_name);

    if (entry.prog == 0 && entry.ker == 0) {
        std::ostringstream options;
        options << " -D T=" << dtype_traits<T>::getName();
        options << " -D USE_ALPHA=" << use_alpha;
        options << " -D USE_BETA=" << use_beta;
        options << " -D USE_GREEDY=" << use_greedy;
        options << " -D THREADS=" << threads;

        if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
            options << " -D USE_DOUBLE";
        }
        if (std::is_same<T, cfloat>::value || std::is_same<T, cdouble>::value) {
            options << " -D IS_CPLX=1";
        } else {
            options << " -D IS_CPLX=0";
        }

        const char *ker_strs[] = {csrmv_cl};
        const int   ker_lens[] = {csrmv_cl_len};

        Program prog;
        buildProgram(prog, 1, ker_strs, ker_lens, options.str());

        entry.prog   = new Program(prog);
        entry.ker    = new Kernel[2];
        entry.ker[0] = Kernel(*entry.prog, "csrmv_thread");
        entry.ker[1] = Kernel(*entry.prog, "csrmv_block");

        addKernelToCache(device, ref_name, entry);
    }

    int count = 0;
    cl::Buffer *counter = bufferAlloc(sizeof(int));
    getQueue().enqueueWriteBuffer(*counter, CL_TRUE, 0, sizeof(int), (void *)&count);

    // TODO: Figure out the proper way to choose either csrmv_thread or csrmv_block
    bool is_csrmv_block = true;
    auto csrmv_kernel   = is_csrmv_block ? entry.ker[1] : entry.ker[0];
    auto csrmv_func     = KernelFunctor<Buffer, Buffer, Buffer, Buffer,
                                        int, Buffer, KParam, T, T, Buffer>(csrmv_kernel);

    NDRange local(is_csrmv_block ? threads : THREADS_PER_GROUP, 1);

    int M = rowIdx.info.dims[0] - 1;

    int groups_x = is_csrmv_block ? divup(M, REPEAT) : divup(M, REPEAT * local[0]);
    groups_x     = std::min(groups_x, MAX_CSRMV_GROUPS);
    NDRange global(local[0] * groups_x, 1);

    csrmv_func(EnqueueArgs(getQueue(), global, local),
               *out.data, *values.data, *rowIdx.data, *colIdx.data,
               M, *rhs.data, rhs.info, alpha, beta, *counter);
    CL_DEBUG_FINISH(getQueue());
    bufferFree(counter);
}
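/*
 * For reference, the CSR layout consumed by csrmv/csrmm_nt above (values,
 * rowIdx, colIdx), shown for a small hypothetical 3x4 matrix. rowIdx holds
 * M+1 offsets into values/colIdx, which is why the kernels compute
 * M = rowIdx.info.dims[0] - 1.
 *
 *     | 5 0 0 2 |        values = {5, 2, 3, 1, 4}
 *     | 0 3 0 0 |        colIdx = {0, 3, 1, 1, 2}
 *     | 0 1 4 0 |        rowIdx = {0, 2, 3, 5}
 *
 * A hypothetical host-side reference of what the kernels compute,
 * y = alpha * A * x + beta * y:
 */
static void csrmvReference(int M, const float *values, const int *rowIdx,
                           const int *colIdx, const float *x,
                           float alpha, float beta, float *y) {
    for (int r = 0; r < M; ++r) {
        float acc = 0.f;
        // rowIdx[r]..rowIdx[r+1] spans the nonzeros of row r
        for (int j = rowIdx[r]; j < rowIdx[r + 1]; ++j)
            acc += values[j] * x[colIdx[j]];
        y[r] = alpha * acc + beta * y[r];
    }
}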
void orb(unsigned* out_feat, Param& x_out, Param& y_out, Param& score_out,
         Param& ori_out, Param& size_out, Param& desc_out,
         Param image, const float fast_thr, const unsigned max_feat,
         const float scl_fctr, const unsigned levels, const bool blur_img) {
    try {
        static std::once_flag compileFlags[DeviceManager::MAX_DEVICES];
        static Program orbProgs[DeviceManager::MAX_DEVICES];
        static Kernel  hrKernel[DeviceManager::MAX_DEVICES];
        static Kernel  kfKernel[DeviceManager::MAX_DEVICES];
        static Kernel  caKernel[DeviceManager::MAX_DEVICES];
        static Kernel  eoKernel[DeviceManager::MAX_DEVICES];

        int device = getActiveDeviceId();

        std::call_once(compileFlags[device], [device]() {
            std::ostringstream options;
            options << " -D T=" << dtype_traits<T>::getName()
                    << " -D BLOCK_SIZE=" << ORB_THREADS_X;
            if (std::is_same<T, double>::value || std::is_same<T, cdouble>::value) {
                options << " -D USE_DOUBLE";
            }
            buildProgram(orbProgs[device], orb_cl, orb_cl_len, options.str());
            hrKernel[device] = Kernel(orbProgs[device], "harris_response");
            kfKernel[device] = Kernel(orbProgs[device], "keep_features");
            caKernel[device] = Kernel(orbProgs[device], "centroid_angle");
            eoKernel[device] = Kernel(orbProgs[device], "extract_orb");
        });

        unsigned patch_size = REF_PAT_SIZE;

        unsigned min_side   = std::min(image.info.dims[0], image.info.dims[1]);
        unsigned max_levels = 0;
        float scl_sum = 0.f;
        for (unsigned i = 0; i < levels; i++) {
            min_side /= scl_fctr;

            // Minimum image side for a descriptor to be computed
            if (min_side < patch_size || max_levels == levels) break;

            max_levels++;
            scl_sum += 1.f / (float)pow(scl_fctr, (float)i);
        }

        std::vector<cl::Buffer*> d_x_pyr(max_levels);
        std::vector<cl::Buffer*> d_y_pyr(max_levels);
        std::vector<cl::Buffer*> d_score_pyr(max_levels);
        std::vector<cl::Buffer*> d_ori_pyr(max_levels);
        std::vector<cl::Buffer*> d_size_pyr(max_levels);
        std::vector<cl::Buffer*> d_desc_pyr(max_levels);
        std::vector<unsigned> feat_pyr(max_levels);
        unsigned total_feat = 0;

        // Compute number of features to keep for each level
        std::vector<unsigned> lvl_best(max_levels);
        unsigned feat_sum = 0;
        for (unsigned i = 0; i < max_levels - 1; i++) {
            float lvl_scl = (float)pow(scl_fctr, (float)i);
            lvl_best[i]   = ceil((max_feat / scl_sum) / lvl_scl);
            feat_sum     += lvl_best[i];
        }
        lvl_best[max_levels - 1] = max_feat - feat_sum;

        // Maintain a reference to the previous level image
        Param prev_img;
        Param lvl_img;

        const unsigned gauss_len = 9;
        T* h_gauss = nullptr;
        Param gauss_filter;
        gauss_filter.data = nullptr;

        for (unsigned i = 0; i < max_levels; i++) {
            const float lvl_scl = (float)pow(scl_fctr, (float)i);

            if (i == 0) {
                // First level is used in its original size
                lvl_img  = image;
                prev_img = image;
            } else if (i > 0) {
                // Resize previous level image to current level dimensions
                lvl_img.info.dims[0]    = round(image.info.dims[0] / lvl_scl);
                lvl_img.info.dims[1]    = round(image.info.dims[1] / lvl_scl);
                lvl_img.info.strides[0] = 1;
                lvl_img.info.strides[1] = lvl_img.info.dims[0];
                for (int k = 2; k < 4; k++) {
                    lvl_img.info.dims[k]    = 1;
                    lvl_img.info.strides[k] = lvl_img.info.dims[k - 1] *
                                              lvl_img.info.strides[k - 1];
                }
                lvl_img.info.offset = 0;
                lvl_img.data = bufferAlloc(lvl_img.info.dims[3] *
                                           lvl_img.info.strides[3] * sizeof(T));

                resize<T, AF_INTERP_BILINEAR>(lvl_img, prev_img);

                if (i > 1) bufferFree(prev_img.data);
                prev_img = lvl_img;
            }

            unsigned lvl_feat = 0;
            Param d_x_feat, d_y_feat, d_score_feat;

            // Round feature size to the nearest odd integer
            float size = 2.f * floor(patch_size / 2.f) + 1.f;

            // Avoid keeping features that might be too wide and might not fit
            // on the image; sqrt(2.f) is the radius when the angle is 45
            // degrees and represents the widest case possible
            unsigned edge = ceil(size * sqrt(2.f) / 2.f);

            // Detect FAST features
            fast<T, 9, true>(&lvl_feat, d_x_feat, d_y_feat, d_score_feat,
                             lvl_img, fast_thr, 0.15f, edge);

            if (lvl_feat == 0) {
                feat_pyr[i] = 0;
                if (i > 0 && i == max_levels - 1) bufferFree(lvl_img.data);
                continue;
            }

            bufferFree(d_score_feat.data);

            unsigned usable_feat = 0;
            cl::Buffer* d_usable_feat = bufferAlloc(sizeof(unsigned));
            getQueue().enqueueWriteBuffer(*d_usable_feat, CL_TRUE, 0,
                                          sizeof(unsigned), &usable_feat);

            cl::Buffer* d_x_harris     = bufferAlloc(lvl_feat * sizeof(float));
            cl::Buffer* d_y_harris     = bufferAlloc(lvl_feat * sizeof(float));
            cl::Buffer* d_score_harris = bufferAlloc(lvl_feat * sizeof(float));

            // Calculate Harris responses
            // Good block_size >= 7 (must be an odd number)
            const dim_type blk_x = divup(lvl_feat, ORB_THREADS_X);
            const NDRange local(ORB_THREADS_X, ORB_THREADS_Y);
            const NDRange global(blk_x * ORB_THREADS_X, ORB_THREADS_Y);

            unsigned block_size = 7;
            float k_thr = 0.04f;

            auto hrOp = make_kernel<Buffer, Buffer, Buffer,
                                    Buffer, Buffer, const unsigned, Buffer,
                                    Buffer, KParam,
                                    const unsigned, const float,
                                    const unsigned>(hrKernel[device]);

            hrOp(EnqueueArgs(getQueue(), global, local),
                 *d_x_harris, *d_y_harris, *d_score_harris,
                 *d_x_feat.data, *d_y_feat.data, lvl_feat, *d_usable_feat,
                 *lvl_img.data, lvl_img.info,
                 block_size, k_thr, patch_size);
            CL_DEBUG_FINISH(getQueue());

            getQueue().enqueueReadBuffer(*d_usable_feat, CL_TRUE, 0,
                                         sizeof(unsigned), &usable_feat);

            bufferFree(d_x_feat.data);
            bufferFree(d_y_feat.data);
            bufferFree(d_usable_feat);

            if (usable_feat == 0) {
                feat_pyr[i] = 0;
                bufferFree(d_x_harris);
                bufferFree(d_y_harris);
                bufferFree(d_score_harris);
                if (i > 0 && i == max_levels - 1) bufferFree(lvl_img.data);
                continue;
            }

            // Sort features according to Harris responses
            Param d_harris_sorted;
            Param d_harris_idx;

            d_harris_sorted.info.dims[0]    = usable_feat;
            d_harris_idx.info.dims[0]       = usable_feat;
            d_harris_sorted.info.strides[0] = 1;
            d_harris_idx.info.strides[0]    = 1;

            for (int k = 1; k < 4; k++) {
                d_harris_sorted.info.dims[k]    = 1;
                d_harris_idx.info.dims[k]       = 1;
                d_harris_sorted.info.strides[k] = d_harris_sorted.info.dims[k - 1] *
                                                  d_harris_sorted.info.strides[k - 1];
                d_harris_idx.info.strides[k]    = d_harris_idx.info.dims[k - 1] *
                                                  d_harris_idx.info.strides[k - 1];
            }

            d_harris_sorted.info.offset = 0;
            d_harris_idx.info.offset    = 0;
            d_harris_sorted.data = d_score_harris;
            d_harris_idx.data    = bufferAlloc((d_harris_idx.info.dims[0]) * sizeof(unsigned));

            sort0_index<float, false>(d_harris_sorted, d_harris_idx);

            cl::Buffer* d_x_lvl     = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_y_lvl     = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_score_lvl = bufferAlloc(usable_feat * sizeof(float));

            usable_feat = min(usable_feat, lvl_best[i]);

            // Keep only the features with the highest Harris responses
            const dim_type keep_blk = divup(usable_feat, ORB_THREADS);
            const NDRange local_keep(ORB_THREADS, 1);
            const NDRange global_keep(keep_blk * ORB_THREADS, 1);

            auto kfOp = make_kernel<Buffer, Buffer, Buffer,
                                    Buffer, Buffer, Buffer, Buffer,
                                    const unsigned>(kfKernel[device]);

            kfOp(EnqueueArgs(getQueue(), global_keep, local_keep),
                 *d_x_lvl, *d_y_lvl, *d_score_lvl,
                 *d_x_harris, *d_y_harris,
                 *d_harris_sorted.data, *d_harris_idx.data,
                 usable_feat);
            CL_DEBUG_FINISH(getQueue());

            bufferFree(d_x_harris);
            bufferFree(d_y_harris);
            bufferFree(d_harris_sorted.data);
            bufferFree(d_harris_idx.data);

            cl::Buffer* d_ori_lvl  = bufferAlloc(usable_feat * sizeof(float));
            cl::Buffer* d_size_lvl = bufferAlloc(usable_feat * sizeof(float));

            // Compute orientation of features
            const dim_type centroid_blk_x = divup(usable_feat, ORB_THREADS_X);
            const NDRange local_centroid(ORB_THREADS_X, ORB_THREADS_Y);
            const NDRange global_centroid(centroid_blk_x * ORB_THREADS_X, ORB_THREADS_Y);

            auto caOp = make_kernel<Buffer, Buffer, Buffer,
                                    const unsigned, Buffer, KParam,
                                    const unsigned>(caKernel[device]);

            caOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                 *d_x_lvl, *d_y_lvl, *d_ori_lvl, usable_feat,
                 *lvl_img.data, lvl_img.info, patch_size);
            CL_DEBUG_FINISH(getQueue());

            Param lvl_filt;
            Param lvl_tmp;

            if (blur_img) {
                lvl_filt = lvl_img;
                lvl_tmp  = lvl_img;

                lvl_filt.data = bufferAlloc(lvl_filt.info.dims[0] *
                                            lvl_filt.info.dims[1] * sizeof(T));
                lvl_tmp.data  = bufferAlloc(lvl_tmp.info.dims[0] *
                                            lvl_tmp.info.dims[1] * sizeof(T));

                // Calculate a separable Gaussian kernel
                if (h_gauss == nullptr) {
                    h_gauss = new T[gauss_len];
                    gaussian1D(h_gauss, gauss_len, 2.f);
                    gauss_filter.info.dims[0]    = gauss_len;
                    gauss_filter.info.strides[0] = 1;

                    for (int k = 1; k < 4; k++) {
                        gauss_filter.info.dims[k]    = 1;
                        gauss_filter.info.strides[k] = gauss_filter.info.dims[k - 1] *
                                                       gauss_filter.info.strides[k - 1];
                    }

                    dim_type gauss_elem = gauss_filter.info.strides[3] *
                                          gauss_filter.info.dims[3];
                    gauss_filter.data = bufferAlloc(gauss_elem * sizeof(T));
                    getQueue().enqueueWriteBuffer(*gauss_filter.data, CL_TRUE, 0,
                                                  gauss_elem * sizeof(T), h_gauss);
                }

                // Filter level image with Gaussian kernel to reduce noise sensitivity
                convolve2<T, convAccT, 0, false, gauss_len>(lvl_tmp, lvl_img, gauss_filter);
                convolve2<T, convAccT, 1, false, gauss_len>(lvl_filt, lvl_tmp, gauss_filter);

                bufferFree(lvl_tmp.data);
            }

            // Compute ORB descriptors
            cl::Buffer* d_desc_lvl = bufferAlloc(usable_feat * 8 * sizeof(unsigned));
            unsigned* h_desc_lvl = new unsigned[usable_feat * 8];
            for (int j = 0; j < (int)usable_feat * 8; j++) h_desc_lvl[j] = 0;
            getQueue().enqueueWriteBuffer(*d_desc_lvl, CL_TRUE, 0,
                                          usable_feat * 8 * sizeof(unsigned), h_desc_lvl);
            delete[] h_desc_lvl;

            auto eoOp = make_kernel<Buffer, const unsigned,
                                    Buffer, Buffer, Buffer, Buffer,
                                    Buffer, KParam,
                                    const float, const unsigned>(eoKernel[device]);

            if (blur_img) {
                eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                     *d_desc_lvl, usable_feat,
                     *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl,
                     *lvl_filt.data, lvl_filt.info,
                     lvl_scl, patch_size);
                CL_DEBUG_FINISH(getQueue());

                bufferFree(lvl_filt.data);
            } else {
                eoOp(EnqueueArgs(getQueue(), global_centroid, local_centroid),
                     *d_desc_lvl, usable_feat,
                     *d_x_lvl, *d_y_lvl, *d_ori_lvl, *d_size_lvl,
                     *lvl_img.data, lvl_img.info,
                     lvl_scl, patch_size);
                CL_DEBUG_FINISH(getQueue());
            }

            // Store results to pyramids
            total_feat    += usable_feat;
            feat_pyr[i]    = usable_feat;
            d_x_pyr[i]     = d_x_lvl;
            d_y_pyr[i]     = d_y_lvl;
            d_score_pyr[i] = d_score_lvl;
            d_ori_pyr[i]   = d_ori_lvl;
            d_size_pyr[i]  = d_size_lvl;
            d_desc_pyr[i]  = d_desc_lvl;

            if (i > 0 && i == max_levels - 1) bufferFree(lvl_img.data);
        }

        if (gauss_filter.data != nullptr) bufferFree(gauss_filter.data);
        if (h_gauss != nullptr) delete[] h_gauss;

        // If no features are found, set the feature count to 0 and return
        if (total_feat == 0) {
            *out_feat = 0;
            return;
        }

        // Allocate output memory
        x_out.info.dims[0]        = total_feat;
        x_out.info.strides[0]     = 1;
        y_out.info.dims[0]        = total_feat;
        y_out.info.strides[0]     = 1;
        score_out.info.dims[0]    = total_feat;
        score_out.info.strides[0] = 1;
        ori_out.info.dims[0]      = total_feat;
        ori_out.info.strides[0]   = 1;
        size_out.info.dims[0]     = total_feat;
        size_out.info.strides[0]  = 1;

        desc_out.info.dims[0]    = 8;
        desc_out.info.strides[0] = 1;
        desc_out.info.dims[1]    = total_feat;
        desc_out.info.strides[1] = desc_out.info.dims[0];

        for (int k = 1; k < 4; k++) {
            x_out.info.dims[k]        = 1;
            x_out.info.strides[k]     = x_out.info.dims[k - 1] * x_out.info.strides[k - 1];
            y_out.info.dims[k]        = 1;
            y_out.info.strides[k]     = y_out.info.dims[k - 1] * y_out.info.strides[k - 1];
            score_out.info.dims[k]    = 1;
            score_out.info.strides[k] = score_out.info.dims[k - 1] * score_out.info.strides[k - 1];
            ori_out.info.dims[k]      = 1;
            ori_out.info.strides[k]   = ori_out.info.dims[k - 1] * ori_out.info.strides[k - 1];
            size_out.info.dims[k]     = 1;
            size_out.info.strides[k]  = size_out.info.dims[k - 1] * size_out.info.strides[k - 1];
            if (k > 1) {
                desc_out.info.dims[k]    = 1;
                desc_out.info.strides[k] = desc_out.info.dims[k - 1] * desc_out.info.strides[k - 1];
            }
        }

        if (total_feat > 0) {
            size_t out_sz  = total_feat * sizeof(float);
            x_out.data     = bufferAlloc(out_sz);
            y_out.data     = bufferAlloc(out_sz);
            score_out.data = bufferAlloc(out_sz);
            ori_out.data   = bufferAlloc(out_sz);
            size_out.data  = bufferAlloc(out_sz);

            size_t desc_sz = total_feat * 8 * sizeof(unsigned);
            desc_out.data  = bufferAlloc(desc_sz);
        }

        unsigned offset = 0;
        for (unsigned i = 0; i < max_levels; i++) {
            if (feat_pyr[i] == 0) continue;

            getQueue().enqueueCopyBuffer(*d_x_pyr[i], *x_out.data, 0,
                                         offset * sizeof(float),
                                         feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_y_pyr[i], *y_out.data, 0,
                                         offset * sizeof(float),
                                         feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_score_pyr[i], *score_out.data, 0,
                                         offset * sizeof(float),
                                         feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_ori_pyr[i], *ori_out.data, 0,
                                         offset * sizeof(float),
                                         feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_size_pyr[i], *size_out.data, 0,
                                         offset * sizeof(float),
                                         feat_pyr[i] * sizeof(float));
            getQueue().enqueueCopyBuffer(*d_desc_pyr[i], *desc_out.data, 0,
                                         offset * 8 * sizeof(unsigned),
                                         feat_pyr[i] * 8 * sizeof(unsigned));

            // Advance the write offset by the number of features copied for
            // this level, so levels that produced no features (and were
            // skipped above) cannot desynchronize the output positions
            offset += feat_pyr[i];

            bufferFree(d_x_pyr[i]);
            bufferFree(d_y_pyr[i]);
            bufferFree(d_score_pyr[i]);
            bufferFree(d_ori_pyr[i]);
            bufferFree(d_size_pyr[i]);
            bufferFree(d_desc_pyr[i]);
        }

        // Sets number of output features
        *out_feat = total_feat;
    } catch (cl::Error err) {
        CL_TO_AF_ERROR(err);
        throw;
    }
}
bool rtspRequestSend(RTSPRequest *rtspRequest, char *url, NetworkConnection *networkConnection) {
    uint8_t *requestBuffer;
    size_t maxRequestBufferSize;
    int charsWritten;

    /* Create buffer for full request (the optimizer will fold all the
       individual constants) */
    maxRequestBufferSize = 12               /* "%s %s RTSP/1.0\r\n" printable characters */
        + MAX_COMMAND_STRING_SIZE           /* command (first "%s" above) */
        + MAX_URL_STRING_SIZE               /* url (second "%s" above) */
        + rtspRequest->headerBufferSize - 1 /* header (excluding the terminating '\0') */
        + 2                                 /* CR/LF */
        + rtspRequest->contentBufferSize;   /* content */
    if(!bufferAllocate(&requestBuffer, maxRequestBufferSize, "RTSP request buffer")) {
        return false;
    }

    /* Write command */
    charsWritten = snprintf((char *)requestBuffer, maxRequestBufferSize, "%s %s RTSP/1.0\r\n",
                            METHOD_NAMES[rtspRequest->requestMethod],
                            rtspRequest->requestMethod == RTSP_METHOD_OPTIONS ? "*" : url);
    if(charsWritten < 0) {
        logWrite(LOG_LEVEL_ERROR, LOG_COMPONENT_NAME, "Cannot write command to request buffer.");
        bufferFree(&requestBuffer);
        return false;
    }

    /* Validate that the buffer is still big enough (see the calculation above) */
    if(charsWritten + rtspRequest->headerBufferSize - 1 + 2 + rtspRequest->contentBufferSize > maxRequestBufferSize) {
        logWrite(LOG_LEVEL_ERROR, LOG_COMPONENT_NAME, "Request buffer is not big enough to hold header and content.");
        bufferFree(&requestBuffer);
        return false;
    }

    /* Write header fields */
    if(rtspRequest->headerBuffer != NULL) {
        memcpy(requestBuffer + charsWritten, rtspRequest->headerBuffer,
               rtspRequest->headerBufferSize - 1); /* No need for '\0' so -1 */
        charsWritten += rtspRequest->headerBufferSize - 1;
    }

    /* Write header/content separator (length validation done above) */
    requestBuffer[charsWritten] = '\r';
    charsWritten++;
    requestBuffer[charsWritten] = '\n';
    charsWritten++;

    /* Write content */
    if(rtspRequest->contentBuffer != NULL) {
        memcpy(requestBuffer + charsWritten, rtspRequest->contentBuffer, rtspRequest->contentBufferSize);
        charsWritten += rtspRequest->contentBufferSize;
    }

    /* Send out request */
    if(!networkSendMessage(networkConnection, requestBuffer, charsWritten)) {
        bufferFree(&requestBuffer);
        return false;
    }

    /* Log the request that was sent */
    logWrite(LOG_LEVEL_DEBUG, LOG_COMPONENT_NAME, "Sent out RTSP request:\n%.*s", charsWritten, requestBuffer);

    /* Free up resources */
    if(!bufferFree(&requestBuffer)) {
        return false;
    }

    return true;
}
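/*
 * Hypothetical usage sketch tying the RTSP helpers above together. The
 * rtspRequestCreate() constructor and the RTSP_METHOD_ANNOUNCE constant are
 * assumed for illustration; only the AddHeaderField/SetContent/Send/Free
 * functions are shown in this collection.
 */
bool announceSdp(NetworkConnection *conn, char *url, uint8_t *sdp, size_t sdpLen) {
    RTSPRequest *req;
    bool ok;

    if(!rtspRequestCreate(&req, RTSP_METHOD_ANNOUNCE)) { /* assumed ctor */
        return false;
    }

    ok = rtspRequestAddHeaderField(req, "CSeq", "1")
      && rtspRequestSetContent(req, sdp, sdpLen, "application/sdp")
      && rtspRequestSend(req, url, conn);

    /* rtspRequestFree releases the header and content buffers and the
       request object itself */
    if(!rtspRequestFree(&req)) {
        ok = false;
    }
    return ok;
}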