Beispiel #1
0
void
vec4_generator::generate_gs_set_write_offset(struct brw_reg dst,
        struct brw_reg src0,
        struct brw_reg src1)
{
    /* From p22 of volume 4 part 2 of the Ivy Bridge PRM (2.4.3.1 Message
     * Header: M0.3):
     *
     *     Slot 0 Offset. This field, after adding to the Global Offset field
     *     in the message descriptor, specifies the offset (in 256-bit units)
     *     from the start of the URB entry, as referenced by URB Handle 0, at
     *     which the data will be accessed.
     *
     * Similar text describes DWORD M0.4, which is slot 1 offset.
     *
     * Therefore, we want to multiply DWORDs 0 and 4 of src0 (the x components
     * of the register for geometry shader invocations 0 and 1) by the
     * immediate value in src1, and store the result in DWORDs 3 and 4 of dst.
     *
     * We can do this with the following EU instruction:
     *
     *     mul(2) dst.3<1>UD src0<8;2,4>UD src1   { Align1 WE_all }
     */
    brw_push_insn_state(p);
    brw_set_access_mode(p, BRW_ALIGN_1);
    brw_set_mask_control(p, BRW_MASK_DISABLE);
    brw_MUL(p, suboffset(stride(dst, 2, 2, 1), 3), stride(src0, 8, 2, 4),
            src1);
    brw_set_access_mode(p, BRW_ALIGN_16);
    brw_pop_insn_state(p);
}
Beispiel #2
0
void
vec4_generator::generate_gs_set_vertex_count(struct brw_reg dst,
        struct brw_reg src)
{
    brw_push_insn_state(p);
    brw_set_access_mode(p, BRW_ALIGN_1);
    brw_set_mask_control(p, BRW_MASK_DISABLE);

    /* If we think of the src and dst registers as composed of 8 DWORDs each,
     * we want to pick up the contents of DWORDs 0 and 4 from src, truncate
     * them to WORDs, and then pack them into DWORD 2 of dst.
     *
     * It's easier to get the EU to do this if we think of the src and dst
     * registers as composed of 16 WORDS each; then, we want to pick up the
     * contents of WORDs 0 and 8 from src, and pack them into WORDs 4 and 5 of
     * dst.
     *
     * We can do that by the following EU instruction:
     *
     *     mov (2) dst.4<1>:uw src<8;1,0>:uw   { Align1, Q1, NoMask }
     */
    brw_MOV(p, suboffset(stride(retype(dst, BRW_REGISTER_TYPE_UW), 2, 2, 1), 4),
            stride(retype(src, BRW_REGISTER_TYPE_UW), 8, 1, 0));
    brw_set_access_mode(p, BRW_ALIGN_16);
    brw_pop_insn_state(p);
}
/**
 * Emit the ADDs that compute pixel X/Y values from the micro-tile
 * coordinates passed in r1.
 *
 * \param p     instruction emitter
 * \param dst   dst[0] receives X and dst[1] receives Y
 * \param mask  write mask; only WRITEMASK_X and WRITEMASK_Y are examined
 * \param arg0  not used by this function
 *
 * Compression control is set to NONE while the instructions are emitted and
 * to COMPRESSED afterwards.
 */
static void emit_pixel_xy(struct brw_compile *p,
			  const struct brw_reg *dst,
			  GLuint mask,
			  const struct brw_reg *arg0)
{
   /* UW view of r1. */
   const struct brw_reg tile_uw = retype(brw_vec1_grf(1, 0),
					 BRW_REGISTER_TYPE_UW);

   brw_set_compression_control(p, BRW_COMPRESSION_NONE);

   /* Pixel centers are obtained by adding 1 or 0 to each of the
    * micro-tile coordinates.
    */
   if (mask & WRITEMASK_X) {
      const struct brw_reg x_out = vec16(retype(dst[0], BRW_REGISTER_TYPE_UW));
      const struct brw_reg x_in = stride(suboffset(tile_uw, 4), 2, 4, 0);

      brw_ADD(p, x_out, x_in, brw_imm_v(0x10101010));
   }

   if (mask & WRITEMASK_Y) {
      const struct brw_reg y_out = vec16(retype(dst[1], BRW_REGISTER_TYPE_UW));
      const struct brw_reg y_in = stride(suboffset(tile_uw, 5), 2, 4, 0);

      brw_ADD(p, y_out, y_in, brw_imm_v(0x11001100));
   }

   brw_set_compression_control(p, BRW_COMPRESSION_COMPRESSED);
}
Beispiel #4
0
/**
 * Benchmark driver for the Blitz++ stencil implementation.
 *
 * For every parameter N supplied by the benchmark harness, two N x N x N
 * arrays are filled with random doubles and test1stencil is applied
 * alternately A->B and B->A for the requested number of iterations.  Only
 * the stencil applications sit between bench.start() and bench.stop().
 *
 * @param bench  benchmark harness that supplies parameters and iteration
 *               counts and records the timings
 */
void stencilBlitzStencilVersion(BenchmarkExt<int>& bench)
{
    bench.beginImplementation("Blitz++ Stencil");

    while (!bench.doneImplementationBenchmark())
    {
        const int N = bench.getParameter();

        cout << "Blitz++ Stencil: N = " << N << endl;
        cout.flush();

        const long iters = bench.getIterations();

        Array<double,3> A(N,N,N), B(N,N,N);
        initializeRandomDouble(A.data(), N*N*N, A.stride(thirdDim));
        initializeRandomDouble(B.data(), N*N*N, B.stride(thirdDim));
        TinyVector<int,2> size = N-2;
        generateFastTraversalOrder(size);

        // The timed region contains nothing but the stencil applications;
        // the previously unused Range objects and scalar constant were
        // removed so they no longer add work to the measurement.
        bench.start();
        for (long i=0; i < iters; ++i)
        {
            applyStencil(test1stencil(),A,B);
            applyStencil(test1stencil(),B,A);
        }
        bench.stop();
    }

    bench.endImplementation();
}
static void emit_pixel_xy(struct brw_wm_compile *c,
		struct prog_instruction *inst)
{
    struct brw_reg r1 = brw_vec1_grf(1, 0);
    struct brw_reg r1_uw = retype(r1, BRW_REGISTER_TYPE_UW);

    struct brw_reg dst0, dst1;
    struct brw_compile *p = &c->func;
    GLuint mask = inst->DstReg.WriteMask;

    dst0 = get_dst_reg(c, inst, 0, 1);
    dst1 = get_dst_reg(c, inst, 1, 1);
    /* Calculate pixel centers by adding 1 or 0 to each of the
     * micro-tile coordinates passed in r1.
     */
    if (mask & WRITEMASK_X) {
	brw_ADD(p,
		vec8(retype(dst0, BRW_REGISTER_TYPE_UW)),
		stride(suboffset(r1_uw, 4), 2, 4, 0),
		brw_imm_v(0x10101010));
    }

    if (mask & WRITEMASK_Y) {
	brw_ADD(p,
		vec8(retype(dst1, BRW_REGISTER_TYPE_UW)),
		stride(suboffset(r1_uw, 5), 2, 4, 0),
		brw_imm_v(0x11001100));
    }

}
/*
 * Returns h * (pitch + pitch / 2), where pitch is stride(w).
 *
 * The product is evaluated in unsigned int arithmetic and only then widened
 * to the unsigned long return type.
 * NOTE(review): judging by the name this is the byte size of a YUV 4:2:0
 * image of w x h pixels -- confirm against stride().
 */
static unsigned long
size_yuv420(int w, int h)
{
    const unsigned full_pitch = stride(w);
    const unsigned half_pitch = full_pitch >> 1;

    return h * (full_pitch + half_pitch);
}
Beispiel #7
0
/// Print a one-line summary of this object to stdout: the data pointer and
/// offset, the sizes of dimension 0 and of every dimension index below
/// highestDim(*this), then the stride, the type name and the token string.
void Data::print() const{
	printf("%p + %d, (%d", mData, offset(), size(0));

	// Remaining dimension sizes, comma separated.
	int dim = 1;
	while(dim < highestDim(*this)){
		printf(",%d", size(dim));
		++dim;
	}

	printf("):%+d, %s %s\n", stride(), typeToString(type()).c_str(), toToken().c_str());
}
/**
 * Return the pixel data, first flipping it vertically and running every
 * line through copy_and_convert_pixel_line() if that is still pending.
 *
 * The flip happens in place: lines i and (height - i - 1) are swapped via a
 * one-line scratch buffer, and for an odd height the middle line is
 * converted onto itself.  Afterwards pixels_need_y_flip is cleared, so
 * later calls return the buffer without touching it again.
 */
void const* ms::GLPixelBuffer::as_argb_8888()
{
    if (!pixels_need_y_flip)
        return pixels.data();

    auto const stride_val = stride().as_uint32_t();
    auto const height = size_.height.as_uint32_t();

    /* Scratch space for one line */
    std::vector<char> saved_line(stride_val);

    for (unsigned int row = 0; row < height / 2; row++)
    {
        auto* const upper = &pixels[row * stride_val];
        auto* const lower = &pixels[(height - row - 1) * stride_val];

        /* Keep a copy of the upper line before it is overwritten */
        saved_line.assign(upper, upper + stride_val);

        /* Lower line goes to the upper position ... */
        copy_and_convert_pixel_line(lower, upper);

        /* ... and the saved upper line goes to the lower position */
        copy_and_convert_pixel_line(saved_line.data(), lower);
    }

    /* An odd height leaves one middle line that stays in place */
    if (height % 2 == 1)
    {
        auto* const middle = &pixels[(height / 2) * stride_val];
        copy_and_convert_pixel_line(middle, middle);
    }

    pixels_need_y_flip = false;

    return pixels.data();
}
Beispiel #9
0
	/// Flips the image vertically in place by swapping rows from the top
	/// and the bottom towards the middle. An empty image is left untouched.
	/// @return reference to this image
	Image& Image::flip()
	{
		// 1. Parameter check
		if (isEmpty())
		{
			return *this;
		}

		// 2. Processing
		const int32 rows = m_height;
		const int32 bytesPerRow = stride();

		// Scratch buffer holding one row while it is being swapped
		Array<Color> scratch(m_width);

		Color* upper = m_data.data();
		Color* lower = upper + m_width * (rows - 1);

		for (int32 y = 0; y < rows / 2; ++y, upper += m_width, lower -= m_width)
		{
			::memcpy(scratch.data(), upper, bytesPerRow);
			::memcpy(upper, lower, bytesPerRow);
			::memcpy(lower, scratch.data(), bytesPerRow);
		}

		return *this;
	}
Beispiel #10
0
bool DenseMatrixBase::robustSolve(double* rhs, double* const workspace) const
{
	cadet_assert(_rows == _cols);

	// For LAPACK the matrix looks like it's transposed. We, thus,
	// work with the transposed (i.e., the original) matrix.
	// From LAPACK's point of view, we want to solve A^T * x = y with the factorization A = L * Q.
	// The solution is given by x = L^{-T} * Q * y.

	char side[] = "L";
	char transQ[] = "N";

	// Since LAPACK uses column-major storage and we use row-major,
	// we actually have constructed the transposed matrix.
	lapackInt_t n = _rows;
	lapackInt_t m = _cols;
	lapackInt_t nrhs = 1;
	lapackInt_t lda = stride();
	lapackInt_t flag = 0;

	// Calculate z = Q * y
	LapackMultiplyFactorizedQ(side, transQ, &m, &nrhs, &m, _data, &lda, workspace, rhs, &n, workspace + _rows, &n, &flag);
	if (flag != 0)
		return false;

	// Calculate x = L^{-T} * Q * y = L^{-T} * z
	char transL[] = "T";
	LapackSolveTriangular(side, transL, transQ, &m, &nrhs, _data, &lda, rhs, &n, &flag);
	return flag == 0;
}
Beispiel #11
0
// Returns a VectorRecord that views the same data restricted to the slice
// described by nLow / nHigh / nStride. An empty VectorRecord is returned
// when there is no data, when an explicit stride of zero is given, or when
// the resulting slice contains no elements.
VectorRecord VectorRecord::slice(
					Nullable<int64_t> nLow,
					Nullable<int64_t> nHigh,
					Nullable<int64_t> nStride
					)  const
	{
	if (!mDataPtr || (nStride && *nStride == 0))
		return VectorRecord();

	// Current view expressed as an integer sequence, then narrowed
	IntegerSequence currentView(size(), offset(), stride());
	IntegerSequence narrowed = currentView.slice(nLow, nHigh, nStride);

	if (narrowed.size() == 0)
		return VectorRecord();

	return VectorRecord(
		mDataPtr,
		narrowed.size(),
		narrowed.offset(),
		narrowed.stride()
		);
	}
Beispiel #12
0
/**
 * Fill *info with a description of this bitmap's surface.
 *
 * When the surface has no data, *info is zeroed, its bits_offset is set to
 * NO_MEMORY and NO_MEMORY is returned. Otherwise the dimensions, stride,
 * bytes per row (stride times bytes per pixel), pixel format, the
 * eBufferDirty flag and the bits offset are written, and NO_ERROR is
 * returned. Width, height, stride and bytes per row are cast to 16 bits
 * and the format to 8 bits.
 */
status_t LayerBitmap::getInfo(surface_info_t* info) const
{
    const bool hasStorage = (mSurface.data != 0);
    if (!hasStorage) {
        memset(info, 0, sizeof(*info));
        info->bits_offset = NO_MEMORY;
        return NO_MEMORY;
    }

    // Geometry
    info->w      = uint16_t(width());
    info->h      = uint16_t(height());
    info->stride = uint16_t(stride());
    info->bpr    = uint16_t(stride() * bytesPerPixel(pixelFormat()));

    // Format, state and location
    info->format      = uint8_t(pixelFormat());
    info->flags       = surface_info_t::eBufferDirty;
    info->bits_offset = ssize_t(mOffset);

    return NO_ERROR;
}
Beispiel #13
0
bool VectorRecord::allValuesAreLoaded() const
	{
	if (!dataPtr() || !dataPtr()->pagedAndPageletTreeValueCount())
		return true;

	IntegerSequence curSlice(size(), offset(), stride());

	IntegerSequence restrictedSlice = curSlice.intersect(IntegerSequence(pagedAndPageletTreeValueCount()));

	if (restrictedSlice.size() == 0)
		return true;

	Nullable<long> slotIndex;
	Fora::Interpreter::ExecutionContext* context = Fora::Interpreter::ExecutionContext::currentExecutionContext();

	if (context)
		slotIndex = context->getCurrentBigvecSlotIndex();
	else
		slotIndex = 0;

	lassert(slotIndex);

	if (!dataPtr()->bigvecHandleForSlot(*slotIndex))
		return false;

	bool tr = dataPtr()->
		bigvecHandleForSlot(*slotIndex)->allValuesAreLoadedBetween(
			restrictedSlice.smallestValue(),
			restrictedSlice.largestValue() + 1
			);

	return tr;
	}
Beispiel #14
0
void DenseMatrixBase::submatrixMultiplyVector(const double* const x, unsigned int startRow, unsigned int startCol, 
			unsigned int numRows, unsigned int numCols, double alpha, double beta, double* const y) const
{
	cadet_assert(_rows > startRow);
	cadet_assert(_cols > startCol);
	cadet_assert(_rows >= startRow + numRows);
	cadet_assert(_cols >= startCol + numCols);

	// Since LAPACK uses column-major storage and we use row-major,
	// we actually have constructed the transposed matrix. Thus,
	// rows and columns interchange.
	lapackInt_t m = numCols;
	lapackInt_t n = numRows;
	lapackInt_t lda = stride();
	lapackInt_t inc = 1; // Stride in vectors (here, elements are continuous without intermediate space)

	// For LAPACK the matrix looks like it's transposed. We, thus,
	// multiply with the transposed matrix, which in the end uses the original matrix.
	char trans[] = "T";

	// Pointer to first entry of submatrix
	double* const data = const_cast<double*>(_data) + startRow * lda + startCol;

	// LAPACK computes y <- alpha * A * x + beta * y
	LapackMultiplyDense(trans, &m, &n, &alpha, data, &lda, const_cast<double*>(x), &inc, &beta, const_cast<double*>(y), &inc);
}
Beispiel #15
0
        // Reduces data[0 .. n) with the binary operation op; the final
        // result is left in data[0].
        //
        // Both phases are followed by __syncthreads(), so every thread of
        // the block has to call this function.
        // NOTE(review): stride() is presumably the number of threads in the
        // block and data presumably lives in shared memory -- confirm
        // against the enclosing struct.
        static __device__ __forceinline__ void reduce_n(T* data, unsigned int n, BinOp op)
        {
            int ftid = flattenedThreadId();
            int sft = stride();

            // Phase 1: if there are more elements than sft, each thread
            // folds the elements ftid + sft, ftid + 2 * sft, ... into its
            // own slot data[ftid], leaving sft partial results.
            if (sft < n)
            {
                for (unsigned int i = sft + ftid; i < n; i += sft)
                    data[ftid] = op(data[ftid], data[i]);

                __syncthreads();

                n = sft;
            }

            // Phase 2: repeatedly fold the upper part onto the lower part.
            // Thread ftid < n / 2 combines data[ftid] with its mirror
            // element data[n - ftid - 1]; for odd n the middle element is
            // not touched and survives because n shrinks to n - n / 2.
            while (n > 1)
            {
                unsigned int half = n/2;

                if (ftid < half)
                    data[ftid] = op(data[ftid], data[n - ftid - 1]);

                __syncthreads();

                n = n - half;
            }
        }
Beispiel #16
0
int DenseMatrixBase::optimalLeastSquaresWorkspace() const
{
	cadet_assert(_rows >= _cols);

	// For LAPACK the matrix looks like it's transposed. We, thus,
	// solve the transposed equation which uses the original matrix.
	char trans[] = "T";

	// Since LAPACK uses column-major storage and we use row-major,
	// we actually have constructed the transposed matrix.
	lapackInt_t n = _rows;
	lapackInt_t m = _cols;
	lapackInt_t nrhs = 1;
	lapackInt_t lda = stride();
	lapackInt_t lwork = -1;
	lapackInt_t flag = 0;
	double work = 0.0;

	LapackDenseLeastSquares(trans, &m, &n, &nrhs, const_cast<double*>(_data), &lda, nullptr, &n, &work, &lwork, &flag);

	if (flag != 0)
		return -1;

	return static_cast<int>(work);
}
Beispiel #17
0
namespace Rfit {

  /// Compile-time upper bound on the number of tracks (5 * 1024).
  constexpr uint32_t maxNumberOfTracks() {
    return 5 * 1024;
  }

  /// Element stride used by all maps below; equal to maxNumberOfTracks().
  constexpr uint32_t stride() {
    return maxNumberOfTracks();
  }

  // Hits: 3 x N matrices of double, and a map over them with
  // outer stride 3 * stride() and inner stride stride().
  template <int N>
  using Matrix3xNd = Eigen::Matrix<double, 3, N>;

  template <int N>
  using Map3xNd = Eigen::Map<Matrix3xNd<N>, 0, Eigen::Stride<3 * stride(), stride()> >;

  // Errors: 6 x N matrices of float, and a map over them with
  // outer stride 6 * stride() and inner stride stride().
  template <int N>
  using Matrix6xNf = Eigen::Matrix<float, 6, N>;

  template <int N>
  using Map6xNf = Eigen::Map<Matrix6xNf<N>, 0, Eigen::Stride<6 * stride(), stride()> >;

  // Fast fit: map over a Vector4d with inner stride stride().
  using Map4d = Eigen::Map<Vector4d, 0, Eigen::InnerStride<stride()> >;

}
Beispiel #18
0
// Assigns val to rows_mem_ entries in each of the cols_mem_ columns, where
// consecutive columns start stride() elements apart in data_.
void MatrixView::set_mem_to(const double val)
{
  const size_t col_step = stride();

  for (size_t col = 0, base = 0; col < cols_mem_; ++col, base += col_step) {
    size_t row = 0;
    while (row < rows_mem_) {
      data_[base + row] = val;
      ++row;
    }
  }
}
Beispiel #19
0
        // Assigns value to the elements of [beg, end) handled by the calling
        // thread: it starts at beg + flattenedThreadId() and advances by
        // stride() elements per step.
        static __device__ __forceinline__ void fill(It beg, It end, const T& value)
        {
            const int step = stride();

            for (It cur = beg + flattenedThreadId(); cur < end; cur += step)
            {
                *cur = value;
            }
        }
Beispiel #20
0
// Returns the slice of the underlying data that contains the element at
// position `index` of this record, composed with the integer sequence
// mapping built from this record's size, offset and stride.
// Asserts that the record has data.
TypedFora::Abi::ForaValueArraySlice VectorRecord::sliceForOffset(int64_t index) const
	{
	lassert(mDataPtr);

	// Position of the requested element in the underlying data
	const auto positionInData = index * mStride + mOffset;

	const auto recordMapping = RangeToIntegerSequence(0, size(), offset(), stride());

	return mDataPtr->sliceForOffset(positionInData).compose(recordMapping);
	}
Beispiel #21
0
// Computes the strides the tensor would have if it were contiguous: the
// last dimension has stride 1 and every other dimension's stride is the
// product of the sizes of all dimensions after it.
//
// A tensor with zero dimensions yields an empty vector. Without that guard
// the code would index stride[-1] and form sizes.rend() - 1 on an empty
// range, both of which are undefined behaviour.
static std::vector<int64_t> computeLinearStride(const Tensor & tensor) {
  auto sizes = tensor.sizes();
  std::vector<int64_t> stride(tensor.dim());
  if (stride.empty()) {
    return stride;
  }
  stride.back() = 1;
  // Running product of the sizes, walked from the last dimension backwards
  // and written one slot ahead so that the first size never contributes.
  std::partial_sum(sizes.rbegin(), sizes.rend() - 1, stride.rbegin() + 1, std::multiplies<int64_t>());
  return stride;
}
Beispiel #22
0
	// Set up and enable the vertex attribute pointer for format slot i.
	//
	// i     - slot in _format; the component count of slot k is read from
	//         _format[k*2 + 1]
	// index - vertex attribute location that is configured and enabled
	//
	// The byte offset of slot i is the sum of the sizes of all preceding
	// slots, and the vertex stride is stride() floats. The attribute that
	// gets enabled is the one that was just configured (index); enabling i
	// instead would touch the wrong attribute whenever i != index.
	void set_pointer(size_t i, GLuint index) {
		const size_t estride = stride();
		size_t offset = 0;
		for (size_t n = 0; n < i*2; n += 2)
			offset += _format[n + 1] * sizeof(float);

		glVertexAttribPointer(index, _format[i*2 + 1], GL_FLOAT, GL_FALSE, estride*sizeof(float), reinterpret_cast<const void*>(offset));
		glEnableVertexAttribArray(index);
	}
Beispiel #23
0
// Multiplies rows_ entries in each of the cols_ columns by val, where
// consecutive columns start stride() elements apart in data_.
// Returns a reference to this view.
MatrixView& MatrixView::operator*=(const double val)
{
  const size_t col_step = stride();

  for (size_t col = 0, base = 0; col < cols_; ++col, base += col_step) {
    size_t row = 0;
    while (row < rows_) {
      data_[base + row] *= val;
      ++row;
    }
  }

  return *this;
}
Beispiel #24
0
void DenseMatrixBase::scaleColumns(double const* scalingFactors, unsigned int numCols)
{
	const unsigned int ld = stride();
	for (unsigned int i = 0; i < _rows; ++i)
	{
		for (unsigned int j = 0; j < numCols; ++j)
			_data[i * ld + j] /= scalingFactors[j];
	}
}
Beispiel #25
0
/**
 * Sets every entry of a submatrix to the given value.
 *
 * @param val       value to assign
 * @param startRow  first row of the submatrix
 * @param startCol  first column of the submatrix
 * @param numRows   number of rows of the submatrix
 * @param numCols   number of columns of the submatrix
 */
void DenseMatrixBase::submatrixSetAll(double val, unsigned int startRow, unsigned int startCol, 
			unsigned int numRows, unsigned int numCols)
{
	cadet_assert(_rows > startRow);
	cadet_assert(_cols > startCol);
	cadet_assert(_rows >= startRow + numRows);
	cadet_assert(_cols >= startCol + numCols);

	const unsigned int rowStride = stride();

	// First entry of the submatrix
	double* const firstEntry = _data + startRow * rowStride + startCol;

	for (unsigned int row = 0; row < numRows; ++row)
	{
		double* const rowData = firstEntry + row * rowStride;

		for (unsigned int col = 0; col < numCols; ++col)
			rowData[col] = val;
	}
}
Beispiel #26
0
        // Writes an increasing sequence into [beg, end): the element at
        // position k (relative to beg) receives value + k. The calling
        // thread handles positions flattenedThreadId(),
        // flattenedThreadId() + stride(), ... .
        static __device__ __forceinline__ void yota(OutIt beg, OutIt end, T value)
        {
            const int step = stride();
            const int first = flattenedThreadId();

            value += first;
            OutIt cur = beg + first;

            while (cur < end)
            {
                *cur = value;
                cur += step;
                value += step;
            }
        }
Beispiel #27
0
void DenseMatrixBase::scaleRows(double const* scalingFactors, unsigned int numRows)
{
	const unsigned int ld = stride();
	for (unsigned int i = 0; i < numRows; ++i)
	{
		for (unsigned int j = 0; j < _cols; ++j)
			_data[i * ld + j] /= scalingFactors[i];
	}
}
Beispiel #28
0
        // Applies op to the elements of [beg, end) handled by the calling
        // thread and stores each result at the matching position of out.
        // The thread starts at beg + flattenedThreadId() and advances by
        // stride() elements per step.
        static __device__ __forceinline__ void transfrom(InIt beg, InIt end, OutIt out, UnOp op)
        {
            const int step = stride();

            InIt  src = beg + flattenedThreadId();
            OutIt dst = out + (src - beg);

            while (src < end)
            {
                *dst = op(*src);
                src += step;
                dst += step;
            }
        }
Beispiel #29
0
        // Copies the elements of [beg, end) handled by the calling thread
        // to the matching positions of out. The thread starts at
        // beg + flattenedThreadId() and advances by stride() elements per
        // step.
        static __device__ __forceinline__ void copy(InIt beg, InIt end, OutIt out)
        {
            const int step = stride();

            InIt  src = beg + flattenedThreadId();
            OutIt dst = out + (src - beg);

            while (src < end)
            {
                *dst = *src;
                src += step;
                dst += step;
            }
        }
Beispiel #30
0
    /// Enables the "vertex" and "tex" attribute arrays on the given shader
    /// program and points them into the bound buffer while holding
    /// modelMutex. Both attributes are 3-component GL_FLOAT tuples with a
    /// common stride of stride(); "vertex" starts at offset 0 and "tex"
    /// directly after the three vertex floats.
    void VolumeModel::bindAttributeArrays(QOpenGLShaderProgram * program) const {
        QMutexLocker locker (&modelMutex);

        const auto vertexAttr = attributeArrays["vertex"];
        const auto texAttr = attributeArrays["tex"];
        const auto texOffset = sizeof(GLfloat) * 3;

        program->enableAttributeArray(vertexAttr);
        program->setAttributeBuffer(vertexAttr, GL_FLOAT, 0, 3, stride());

        program->enableAttributeArray(texAttr);
        program->setAttributeBuffer(texAttr, GL_FLOAT, texOffset, 3, stride());
    }