void
vec4_generator::generate_gs_set_write_offset(struct brw_reg dst,
                                             struct brw_reg src0,
                                             struct brw_reg src1)
{
   /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
    * Header: M0.3):
    *
    *     Slot 0 Offset. This field, after adding to the Global Offset field
    *     in the message descriptor, specifies the offset (in 256-bit units)
    *     from the start of the URB entry, as referenced by URB Handle 0, at
    *     which the data will be accessed.
    *
    * Similar text describes DWORD M0.4, which is the slot 1 offset.
    *
    * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
    * of the register for geometry shader invocations 0 and 1) by the
    * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
    *
    * We can do this with the following EU instruction:
    *
    *     mul(2) dst.3<1>UD src0<8;2,4>UD src1   { Align1 WE_all }
    */
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);
   brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
           src1);
   brw_set_access_mode(p, BRW_ALIGN_16);
   brw_pop_insn_state(p);
}
void
vec4_generator::generate_gs_set_vertex_count(struct brw_reg dst,
                                             struct brw_reg src)
{
   brw_push_insn_state(p);
   brw_set_access_mode(p, BRW_ALIGN_1);
   brw_set_mask_control(p, BRW_MASK_DISABLE);

   /* If we think of the src and dst registers as composed of 8 DWORDs each,
    * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
    * them to WORDs, and then pack them into DWORD 2 of dst.
    *
    * It's easier to get the EU to do this if we think of the src and dst
    * registers as composed of 16 WORDs each; then, we want to pick up the
    * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5
    * of dst.
    *
    * We can do that with the following EU instruction:
    *
    *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
    */
   brw_MOV(p,
           suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
           stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
   brw_set_access_mode(p, BRW_ALIGN_16);
   brw_pop_insn_state(p);
}
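/* Not part of the driver: a minimal host-side sketch (illustrative names
 * only) of how Gen EU Align1 region syntax <VertStride; Width, HorizStride>
 * maps execution channel i to register element
 * suboffset + (i / Width) * VertStride + (i % Width) * HorizStride,
 * used here to sanity-check the regions in the two helpers above. A
 * destination region only has a horizontal stride, which is modeled below
 * with Width = exec_size and VertStride = 0.
 */
#include <cstdio>

static void print_region(const char* label, int suboffset, int vstride,
                         int width, int hstride, int exec_size)
{
   printf("%s:", label);
   for (int i = 0; i < exec_size; ++i)
      printf(" %d", suboffset + (i / width) * vstride + (i % width) * hstride);
   printf("\n");
}

int main()
{
   print_region("src0<8;2,4>:UD", 0, 8, 2, 4, 2); /* elements 0 and 4 */
   print_region("dst.3<1>:UD   ", 3, 0, 2, 1, 2); /* elements 3 and 4 */
   print_region("src<8;1,0>:UW ", 0, 8, 1, 0, 2); /* elements 0 and 8 */
   return 0;
}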
static void emit_pixel_xy(struct brw_compile *p,
                          const struct brw_reg *dst,
                          GLuint mask,
                          const struct brw_reg *arg0)
{
   struct brw_reg r1 = brw_vec1_grf(1, 0);
   struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   /* Calculate pixel centers by adding 1 or 0 to each of the
    * micro-tile coordinates passed in r1.
    */
   if (mask & WRITEMASK_X) {
      brw_ADD(p,
              vec16(retype(dst[0], BRW_REGISTER_TYPE_UW)),
              stride(suboffset(r1_uw, 4), 2, 4, 0),
              brw_imm_v(0x10101010));
   }

   if (mask & WRITEMASK_Y) {
      brw_ADD(p,
              vec16(retype(dst[1], BRW_REGISTER_TYPE_UW)),
              stride(suboffset(r1_uw, 5), 2, 4, 0),
              brw_imm_v(0x11001100));
   }

   brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
}
void stencilBlitzStencilVersion(BenchmarkExt<int>& bench)
{
    bench.beginImplementation("Blitz++ Stencil");

    while (!bench.doneImplementationBenchmark())
    {
        int N = bench.getParameter();
        cout << "Blitz++ Stencil: N = " << N << endl;
        cout.flush();

        long iters = bench.getIterations();

        Array<double,3> A(N,N,N), B(N,N,N);
        initializeRandomDouble(A.data(), N*N*N, A.stride(thirdDim));
        initializeRandomDouble(B.data(), N*N*N, B.stride(thirdDim));

        TinyVector<int,2> size = N-2;
        generateFastTraversalOrder(size);

        double c = 1/7.;

        bench.start();
        for (long i=0; i < iters; ++i)
        {
            Range I(1,N-2), J(1,N-2), K(1,N-2);  // interior ranges (unused by the stencil-object version)
            applyStencil(test1stencil(), A, B);
            applyStencil(test1stencil(), B, A);
        }
        bench.stop();
    }

    bench.endImplementation();
}
static void emit_pixel_xy(struct brw_wm_compile *c,
                          struct prog_instruction *inst)
{
   struct brw_reg r1 = brw_vec1_grf(1, 0);
   struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);
   struct brw_reg dst0, dst1;
   struct brw_compile *p = &c->func;
   GLuint mask = inst->DstReg.WriteMask;

   dst0 = get_dst_reg(c, inst, 0, 1);
   dst1 = get_dst_reg(c, inst, 1, 1);

   /* Calculate pixel centers by adding 1 or 0 to each of the
    * micro-tile coordinates passed in r1.
    */
   if (mask & WRITEMASK_X) {
      brw_ADD(p,
              vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
              stride(suboffset(r1_uw, 4), 2, 4, 0),
              brw_imm_v(0x10101010));
   }

   if (mask & WRITEMASK_Y) {
      brw_ADD(p,
              vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
              stride(suboffset(r1_uw, 5), 2, 4, 0),
              brw_imm_v(0x11001100));
   }
}
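/* Not from the driver: a small sketch decoding a Gen "V" (packed vector)
 * immediate into its eight signed 4-bit elements, to show why 0x10101010
 * yields the X offsets {0,1,0,1,...} and 0x11001100 the Y offsets
 * {0,0,1,1,...} that turn the subspan origin in r1 into per-pixel
 * coordinates above.
 */
#include <cstdio>
#include <cstdint>

static void decode_imm_v(uint32_t v)
{
   printf("0x%08x ->", v);
   for (int i = 0; i < 8; ++i) {
      int elem = (v >> (4 * i)) & 0xf;   /* low nibble is element 0 */
      if (elem & 0x8)
         elem -= 16;                     /* sign-extend the 4-bit value */
      printf(" %d", elem);
   }
   printf("\n");
}

int main()
{
   decode_imm_v(0x10101010); /* X: 0 1 0 1 0 1 0 1 */
   decode_imm_v(0x11001100); /* Y: 0 0 1 1 0 0 1 1 */
   return 0;
}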
static unsigned long size_yuv420(int w, int h)
{
    unsigned yPitch = stride(w);

    /* Y plane is h * yPitch bytes; U and V are each (h/2) * (yPitch/2),
     * i.e. together h * (yPitch >> 1). */
    return h * (yPitch + (yPitch >> 1));
}
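/* Quick sanity check (not from the source): for 4:2:0 the total must come
 * out to 1.5 * pitch * height. stride_16 below is a hypothetical stand-in
 * for stride(), rounding the width up to a 16-byte boundary.
 */
#include <cassert>

static unsigned stride_16(int w) { return (unsigned)((w + 15) & ~15); }

static unsigned long size_yuv420_check(int w, int h)
{
    unsigned yPitch = stride_16(w);
    return h * (yPitch + (yPitch >> 1));
}

int main()
{
    /* 640x480: pitch 640, 640*480 luma bytes, plus 50% for chroma. */
    assert(size_yuv420_check(640, 480) == 640 * 480 * 3 / 2);
    return 0;
}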
void Data::print() const {
    printf("%p + %d, (%d", mData, offset(), size(0));
    for (int i = 1; i < highestDim(*this); ++i) {
        printf(",%d", size(i));
    }
    printf("):%+d, %s %s\n",
           stride(), typeToString(type()).c_str(), toToken().c_str());
}
void const* ms::GLPixelBuffer::as_argb_8888()
{
    if (pixels_need_y_flip)
    {
        auto const stride_val = stride().as_uint32_t();
        auto const height = size_.height.as_uint32_t();

        std::vector<char> tmp(stride_val);

        for (unsigned int i = 0; i < height / 2; i++)
        {
            /* Store line i */
            tmp.assign(&pixels[i * stride_val], &pixels[(i + 1) * stride_val]);

            /* Copy line height - i - 1 to line i */
            copy_and_convert_pixel_line(&pixels[(height - i - 1) * stride_val],
                                        &pixels[i * stride_val]);

            /* Copy stored line (i) to height - i - 1 */
            copy_and_convert_pixel_line(tmp.data(),
                                        &pixels[(height - i - 1) * stride_val]);
        }

        /* Process middle line if there is one */
        if (height % 2 == 1)
        {
            copy_and_convert_pixel_line(&pixels[(height / 2) * stride_val],
                                        &pixels[(height / 2) * stride_val]);
        }

        pixels_need_y_flip = false;
    }

    return pixels.data();
}
Image& Image::flip()
{
    // 1. Parameter check
    {
        if (isEmpty())
        {
            return *this;
        }
    }

    // 2. Processing
    {
        const int32 h = m_height, s = stride();

        Array<Color> line(m_width);
        Color* lineU = m_data.data();
        Color* lineB = lineU + m_width * (h - 1);

        for (int32 y = 0; y < h / 2; ++y)
        {
            ::memcpy(line.data(), lineU, s);
            ::memcpy(lineU, lineB, s);
            ::memcpy(lineB, line.data(), s);
            lineU += m_width;
            lineB -= m_width;
        }
    }

    return *this;
}
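/* Both flips above follow the same pattern: swap row i with row h - i - 1
 * through a one-row scratch buffer, walking only half the image. A minimal
 * generic sketch (illustrative, not from either codebase):
 */
#include <cstring>
#include <vector>

void flip_rows_in_place(unsigned char* pixels, size_t row_bytes, size_t height)
{
    std::vector<unsigned char> tmp(row_bytes);
    for (size_t y = 0; y < height / 2; ++y) {
        unsigned char* top = pixels + y * row_bytes;
        unsigned char* bottom = pixels + (height - y - 1) * row_bytes;
        std::memcpy(tmp.data(), top, row_bytes);    // save top row
        std::memcpy(top, bottom, row_bytes);        // bottom -> top
        std::memcpy(bottom, tmp.data(), row_bytes); // saved top -> bottom
    }
    /* A pure swap can skip an odd middle row; GLPixelBuffer still touches
     * it because it converts the pixel format while flipping. */
}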
bool DenseMatrixBase::robustSolve(double* rhs, double* const workspace) const
{
    cadet_assert(_rows == _cols);

    // For LAPACK the matrix looks like it's transposed. We, thus,
    // work with the transposed (i.e., the original) matrix.
    // From LAPACK's point of view, we want to solve A^T * x = y with the
    // factorization A = L * Q. The solution is given by x = L^{-T} * Q * y.
    char side[] = "L";
    char transQ[] = "N";

    // Since LAPACK uses column-major storage and we use row-major,
    // we actually have constructed the transposed matrix.
    lapackInt_t n = _rows;
    lapackInt_t m = _cols;
    lapackInt_t nrhs = 1;
    lapackInt_t lda = stride();
    lapackInt_t flag = 0;

    // Calculate z = Q * y
    LapackMultiplyFactorizedQ(side, transQ, &m, &nrhs, &m, _data, &lda,
                              workspace, rhs, &n, workspace + _rows, &n, &flag);
    if (flag != 0)
        return false;

    // Calculate x = L^{-T} * Q * y = L^{-T} * z
    char transL[] = "T";
    LapackSolveTriangular(side, transL, transQ, &m, &nrhs, _data, &lda,
                          rhs, &n, &flag);
    return flag == 0;
}
VectorRecord VectorRecord::slice(
            Nullable<int64_t> nLow,
            Nullable<int64_t> nHigh,
            Nullable<int64_t> nStride
            ) const
{
    if (!mDataPtr)
        return VectorRecord();

    if (nStride && *nStride == 0)
        return VectorRecord();

    IntegerSequence sequenceToUse =
        IntegerSequence(size(), offset(), stride()).slice(nLow, nHigh, nStride);

    if (!sequenceToUse.size())
        return VectorRecord();

    return VectorRecord(
        mDataPtr,
        sequenceToUse.size(),
        sequenceToUse.offset(),
        sequenceToUse.stride()
        );
}
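/* The slice algebra that IntegerSequence presumably encapsulates: applying
 * (low, high, step) to a view described by (size, offset, stride) composes
 * as below. A standalone sketch with hypothetical plain-int parameters
 * (no Nullable handling, step > 0, bounds already clamped):
 */
#include <cassert>

struct Seq { long size, offset, stride; };

Seq slice(Seq s, long low, long high, long step)
{
    Seq r;
    r.offset = s.offset + low * s.stride;       // first selected element
    r.stride = s.stride * step;                 // steps multiply
    r.size   = (high - low + step - 1) / step;  // ceil((high - low) / step)
    return r;
}

int main()
{
    Seq base{100, 0, 1};
    Seq odd = slice(base, 1, 100, 2);   // elements 1, 3, 5, ..., 99
    assert(odd.size == 50 && odd.offset == 1 && odd.stride == 2);
    Seq third = slice(odd, 0, 50, 3);   // every third odd element
    assert(third.offset == 1 && third.stride == 6);
    return 0;
}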
status_t LayerBitmap::getInfo(surface_info_t* info) const
{
    if (mSurface.data == 0) {
        memset(info, 0, sizeof(surface_info_t));
        info->bits_offset = NO_MEMORY;
        return NO_MEMORY;
    }
    info->w           = uint16_t(width());
    info->h           = uint16_t(height());
    info->stride      = uint16_t(stride());
    info->bpr         = uint16_t(stride() * bytesPerPixel(pixelFormat()));
    info->format      = uint8_t(pixelFormat());
    info->flags       = surface_info_t::eBufferDirty;
    info->bits_offset = ssize_t(mOffset);
    return NO_ERROR;
}
bool VectorRecord::allValuesAreLoaded() const
{
    if (!dataPtr() || !dataPtr()->pagedAndPageletTreeValueCount())
        return true;

    IntegerSequence curSlice(size(), offset(), stride());

    IntegerSequence restrictedSlice =
        curSlice.intersect(IntegerSequence(pagedAndPageletTreeValueCount()));

    if (restrictedSlice.size() == 0)
        return true;

    Nullable<long> slotIndex;

    Fora::Interpreter::ExecutionContext* context =
        Fora::Interpreter::ExecutionContext::currentExecutionContext();

    if (context)
        slotIndex = context->getCurrentBigvecSlotIndex();
    else
        slotIndex = 0;

    lassert(slotIndex);

    if (!dataPtr()->bigvecHandleForSlot(*slotIndex))
        return false;

    bool tr = dataPtr()->bigvecHandleForSlot(*slotIndex)->allValuesAreLoadedBetween(
        restrictedSlice.smallestValue(),
        restrictedSlice.largestValue() + 1
        );

    return tr;
}
void DenseMatrixBase::submatrixMultiplyVector(const double* const x, unsigned int startRow, unsigned int startCol,
    unsigned int numRows, unsigned int numCols, double alpha, double beta, double* const y) const
{
    cadet_assert(_rows > startRow);
    cadet_assert(_cols > startCol);
    cadet_assert(_rows >= startRow + numRows);
    cadet_assert(_cols >= startCol + numCols);

    // Since LAPACK uses column-major storage and we use row-major,
    // we actually have constructed the transposed matrix. Thus,
    // rows and columns interchange.
    lapackInt_t m = numCols;
    lapackInt_t n = numRows;
    lapackInt_t lda = stride();
    lapackInt_t inc = 1; // Stride in vectors (here, elements are contiguous without gaps)

    // For LAPACK the matrix looks like it's transposed. We, thus,
    // multiply with the transposed matrix, which in the end uses the original matrix.
    char trans[] = "T";

    // Pointer to first entry of submatrix
    double* const data = const_cast<double*>(_data) + startRow * lda + startCol;

    // LAPACK computes y <- alpha * A * x + beta * y
    LapackMultiplyDense(trans, &m, &n, &alpha, data, &lda, const_cast<double*>(x), &inc,
                        &beta, const_cast<double*>(y), &inc);
}
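/* Why the "T" trick used throughout these wrappers works (illustrative,
 * self-contained sketch): a row-major buffer A with leading dimension ld,
 * read column-major, is exactly A^T. Asking a column-major BLAS/LAPACK for
 * y = B^T * x, where B is that column-major view, therefore computes
 * y = A * x. gemv_colmajor below is a hand-rolled stand-in for the BLAS
 * call, with alpha/beta omitted.
 */
#include <cassert>

// Column-major y = op(B) * x, mimicking the BLAS convention.
void gemv_colmajor(char trans, int m, int n, const double* B, int ldb,
                   const double* x, double* y)
{
    if (trans == 'N') {                 // y (m entries) = B * x
        for (int i = 0; i < m; ++i) {
            y[i] = 0.0;
            for (int j = 0; j < n; ++j)
                y[i] += B[i + j * ldb] * x[j];   // B(i,j), column-major
        }
    } else {                            // 'T': y (n entries) = B^T * x
        for (int j = 0; j < n; ++j) {
            y[j] = 0.0;
            for (int i = 0; i < m; ++i)
                y[j] += B[i + j * ldb] * x[i];
        }
    }
}

int main()
{
    // Row-major A (2x3), leading dimension 3.
    double A[6] = {1, 2, 3,
                   4, 5, 6};
    double x[3] = {1, 1, 1};
    double y[2];

    // Interpreted column-major, the buffer is A^T (3x2), so
    // y = (A^T)^T * x = A * x, matching the trans[] = "T" call above.
    gemv_colmajor('T', /*m=*/3, /*n=*/2, A, /*ldb=*/3, x, y);
    assert(y[0] == 6.0 && y[1] == 15.0);  // row sums of A
    return 0;
}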
static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
{
    int ftid = flattenedThreadId();
    int sft = stride();   /* total number of threads in the block */

    /* First, fold everything beyond one element per thread. */
    if (sft < n)
    {
        for (unsigned int i = sft + ftid; i < n; i += sft)
            data[ftid] = op(data[ftid], data[i]);

        __syncthreads();
        n = sft;
    }

    /* Then tree-reduce, pairing element i with element n - i - 1. */
    while (n > 1)
    {
        unsigned int half = n / 2;

        if (ftid < half)
            data[ftid] = op(data[ftid], data[n - ftid - 1]);

        __syncthreads();
        n = n - half;
    }
}
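/* A host-side sketch of the same reduction schedule (plain C++, not CUDA),
 * useful for convincing yourself that the "pair i with n - i - 1" scheme
 * leaves the result in data[0] for any n, including odd sizes where the
 * middle element is carried forward untouched.
 */
#include <cassert>

int main()
{
    int data[7] = {1, 2, 3, 4, 5, 6, 7};
    unsigned int n = 7;

    while (n > 1) {
        unsigned int half = n / 2;
        // Run all "threads" of one step before shrinking n; this mirrors
        // the __syncthreads() barrier between steps in the device version.
        for (unsigned int i = 0; i < half; ++i)
            data[i] += data[n - i - 1];
        n -= half;
    }
    assert(data[0] == 28);  // 1 + 2 + ... + 7
    return 0;
}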
int DenseMatrixBase::optimalLeastSquaresWorkspace() const
{
    cadet_assert(_rows >= _cols);

    // For LAPACK the matrix looks like it's transposed. We, thus,
    // solve the transposed equation, which uses the original matrix.
    char trans[] = "T";

    // Since LAPACK uses column-major storage and we use row-major,
    // we actually have constructed the transposed matrix.
    lapackInt_t n = _rows;
    lapackInt_t m = _cols;
    lapackInt_t nrhs = 1;
    lapackInt_t lda = stride();
    lapackInt_t lwork = -1; // Workspace query: LAPACK returns the optimal size in work
    lapackInt_t flag = 0;
    double work = 0.0;

    LapackDenseLeastSquares(trans, &m, &n, &nrhs, const_cast<double*>(_data), &lda,
                            nullptr, &n, &work, &lwork, &flag);

    if (flag != 0)
        return -1;

    return static_cast<int>(work);
}
namespace Rfit {

  constexpr uint32_t maxNumberOfTracks() { return 5 * 1024; }
  constexpr uint32_t stride() { return maxNumberOfTracks(); }

  // hits
  template<int N>
  using Matrix3xNd = Eigen::Matrix<double, 3, N>;
  template<int N>
  using Map3xNd = Eigen::Map<Matrix3xNd<N>, 0,
                             Eigen::Stride<3 * stride(), stride()> >;

  // errors
  template<int N>
  using Matrix6xNf = Eigen::Matrix<float, 6, N>;
  template<int N>
  using Map6xNf = Eigen::Map<Matrix6xNf<N>, 0,
                             Eigen::Stride<6 * stride(), stride()> >;

  // fast fit
  using Map4d = Eigen::Map<Vector4d, 0, Eigen::InnerStride<stride()> >;
}
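/* How these maps are meant to be used (a sketch, with hypothetical buffer
 * and index names): the Stride<3*stride(), stride()> template parameters are
 * compile-time constants, so a map is constructed from a bare pointer. With
 * element (r,c) living at ptr[c * 3*stride() + r * stride()], consecutive
 * tracks occupy consecutive doubles, i.e. a structure-of-arrays layout where
 * track itrack's 3xN hit matrix starts at base + itrack.
 */
#include <cstdint>
#include <Eigen/Core>

void fill_hits(double* hits_soa /* 3 * 4 * Rfit::stride() doubles */,
               uint32_t itrack)
{
    Rfit::Map3xNd<4> hits(hits_soa + itrack);
    hits(0, 0) = 1.0;  // writes hits_soa[itrack]
    hits(2, 3) = 2.0;  // writes hits_soa[itrack + 3*3*stride() + 2*stride()]
}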
void MatrixView::set_mem_to(const double val)
{
    // Column-major storage: element (r, c) lives at data_[r + c * stride()].
    for (size_t c = 0; c < cols_mem_; ++c) {
        size_t offset = c * stride();
        for (size_t r = 0; r < rows_mem_; ++r)
            data_[r + offset] = val;
    }
}
static __device__ __forceinline__ void fill(It beg, It end, const T& value)
{
    int STRIDE = stride();
    It t = beg + flattenedThreadId();

    for (; t < end; t += STRIDE)
        *t = value;
}
TypedFora::Abi::ForaValueArraySlice VectorRecord::sliceForOffset(int64_t index) const
{
    lassert(mDataPtr);

    return mDataPtr->sliceForOffset(index * mStride + mOffset).compose(
        RangeToIntegerSequence(0, size(), offset(), stride())
        );
}
static std::vector<int64_t> computeLinearStride(const Tensor & tensor) {
  // computes the stride as if tensor were contiguous
  auto sizes = tensor.sizes();
  std::vector<int64_t> stride(tensor.dim());
  stride[tensor.dim() - 1] = 1;
  std::partial_sum(sizes.rbegin(), sizes.rend() - 1, stride.rbegin() + 1,
                   std::multiplies<int64_t>());
  return stride;
}
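/* A worked instance of the partial_sum trick (standalone sketch): the last
 * stride is 1, and each earlier stride is the running product of the sizes
 * behind it. For sizes {2, 3, 4} this produces strides {12, 4, 1}.
 */
#include <cassert>
#include <cstdint>
#include <functional>
#include <numeric>
#include <vector>

int main()
{
    std::vector<int64_t> sizes = {2, 3, 4};
    std::vector<int64_t> stride(sizes.size());

    stride.back() = 1;
    // Reversed sizes are {4, 3, 2}; the running products {4, 12} land in
    // stride[1] and stride[0] via the reverse iterators.
    std::partial_sum(sizes.rbegin(), sizes.rend() - 1, stride.rbegin() + 1,
                     std::multiplies<int64_t>());

    assert((stride == std::vector<int64_t>{12, 4, 1}));
    return 0;
}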
// set array pointers in VAO
void set_pointer(size_t i, GLuint index)
{
    size_t estride = stride();
    size_t offset = 0;

    // Sum the component counts of the preceding attributes to find this
    // attribute's byte offset within the interleaved vertex.
    for (size_t n = 0; n < i * 2; n += 2)
        offset += _format[n + 1] * sizeof(float);

    glVertexAttribPointer(index, _format[i * 2 + 1], GL_FLOAT, GL_FALSE,
                          estride * sizeof(float), (const void*)offset);
    glEnableVertexAttribArray(index);
}
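/* What the offset/stride arithmetic above produces for a concrete
 * interleaved layout (sketch; _format is assumed to hold
 * (semantic, float-count) pairs and stride() the total float count per
 * vertex): with attributes of 3, 2 and 3 floats, the vertex is 8 floats
 * (32 bytes) and the per-attribute byte offsets are 0, 12 and 20.
 */
#include <cassert>
#include <cstddef>

int main()
{
    const size_t format[] = {/*POSITION*/ 0, 3, /*TEXCOORD*/ 1, 2, /*NORMAL*/ 2, 3};
    const size_t nattrs = 3;

    size_t stride_floats = 0;
    for (size_t n = 0; n < nattrs * 2; n += 2)
        stride_floats += format[n + 1];
    assert(stride_floats * sizeof(float) == 32);

    const size_t expected[] = {0, 12, 20};
    for (size_t i = 0; i < nattrs; ++i) {
        size_t offset = 0;
        for (size_t n = 0; n < i * 2; n += 2)
            offset += format[n + 1] * sizeof(float);
        assert(offset == expected[i]);
    }
    return 0;
}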
MatrixView& MatrixView::operator*=(const double val)
{
    for (size_t c = 0; c < cols_; ++c) {
        size_t offset = c * stride();
        for (size_t r = 0; r < rows_; ++r)
            data_[r + offset] *= val;
    }
    return *this;
}
void DenseMatrixBase::scaleColumns(double const* scalingFactors, unsigned int numCols)
{
    const unsigned int ld = stride();

    // Each column j is divided by its factor, i.e. the factors act as
    // column equilibration divisors.
    for (unsigned int i = 0; i < _rows; ++i)
    {
        for (unsigned int j = 0; j < numCols; ++j)
            _data[i * ld + j] /= scalingFactors[j];
    }
}
void DenseMatrixBase::submatrixSetAll(double val, unsigned int startRow, unsigned int startCol,
    unsigned int numRows, unsigned int numCols)
{
    cadet_assert(_rows > startRow);
    cadet_assert(_cols > startCol);
    cadet_assert(_rows >= startRow + numRows);
    cadet_assert(_cols >= startCol + numCols);

    const unsigned int ld = stride();
    double* const ptrDest = _data + startRow * ld + startCol;

    for (unsigned int i = 0; i < numRows; ++i)
    {
        for (unsigned int j = 0; j < numCols; ++j)
            ptrDest[i * ld + j] = val;
    }
}
static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
{
    int STRIDE = stride();
    int tid = flattenedThreadId();
    value += tid;

    /* Strided iota: thread tid writes value+tid, value+tid+STRIDE, ... */
    for (OutIt t = beg + tid; t < end; t += STRIDE, value += STRIDE)
        *t = value;
}
void DenseMatrixBase::scaleRows(double const* scalingFactors, unsigned int numRows)
{
    const unsigned int ld = stride();

    // Each row i is divided by its factor, i.e. the factors act as row
    // equilibration divisors.
    for (unsigned int i = 0; i < numRows; ++i)
    {
        for (unsigned int j = 0; j < _cols; ++j)
            _data[i * ld + j] /= scalingFactors[i];
    }
}
static __device__ __forceinline__ void transform(InIt beg, InIt end, OutIt out, UnOp op)
{
    int STRIDE = stride();
    InIt t = beg + flattenedThreadId();
    OutIt o = out + (t - beg);

    for (; t < end; t += STRIDE, o += STRIDE)
        *o = op(*t);
}
static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
{
    int STRIDE = stride();
    InIt t = beg + flattenedThreadId();
    OutIt o = out + (t - beg);

    for (; t < end; t += STRIDE, o += STRIDE)
        *o = *t;
}
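/* fill, yota, transform and copy above all use the same block-stride loop:
 * thread tid handles elements tid, tid + STRIDE, tid + 2*STRIDE, ... A
 * host-side sketch (plain C++) verifying that this schedule covers every
 * element exactly once even when the element count is not a multiple of the
 * thread count:
 */
#include <cassert>
#include <vector>

int main()
{
    const int STRIDE = 8;    // stand-in for the block's thread count
    const int n = 21;        // deliberately not a multiple of STRIDE
    std::vector<int> touched(n, 0);

    for (int tid = 0; tid < STRIDE; ++tid)       // "threads"
        for (int i = tid; i < n; i += STRIDE)    // each thread's strided walk
            ++touched[i];

    for (int i = 0; i < n; ++i)
        assert(touched[i] == 1);                 // full, non-overlapping cover
    return 0;
}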
void VolumeModel::bindAttributeArrays(QOpenGLShaderProgram * program) const
{
    QMutexLocker locker(&modelMutex);

    program->enableAttributeArray(attributeArrays["vertex"]);
    program->setAttributeBuffer(attributeArrays["vertex"], GL_FLOAT,
                                0, 3, stride());

    program->enableAttributeArray(attributeArrays["tex"]);
    program->setAttributeBuffer(attributeArrays["tex"], GL_FLOAT,
                                sizeof(GLfloat) * 3, 3, stride());
}