Example #1
void
maddrc4_shuffle_ssse3(uint8_t* region1, const uint8_t* region2, uint8_t constant, 
								size_t length)
{
	uint8_t *end;
	register __m128i in1, in2, out, t1, t2, m1, m2, l, h;

	if (constant == 0)
		return;

	if (constant == 1) {
		xorr_sse2(region1, region2, length);
		return;
	}
	
	t1 = _mm_loadu_si128((void *)tl[constant]);
	t2 = _mm_slli_epi64(t1, 4);
	m1 = _mm_set1_epi8(0x0f);
	m2 = _mm_set1_epi8(0xf0);

	for (end=region1+length; region1<end; region1+=16, region2+=16) {
		in2 = _mm_load_si128((void *)region2);
		in1 = _mm_load_si128((void *)region1);
		l = _mm_and_si128(in2, m1);
		l = _mm_shuffle_epi8(t1, l);
		h = _mm_and_si128(in2, m2);
		h = _mm_srli_epi64(h, 4);
		h = _mm_shuffle_epi8(t2, h);
		out = _mm_xor_si128(h,l);
		out = _mm_xor_si128(out, in1);
		_mm_store_si128((void *)region1, out);
	}
}
Example #2
inline void casefoldRange(char* dest, const char* begin, const char* end)
{
	if (end - begin < 64)
	{
		// short string, don't bother optimizing
		for (const char* i = begin; i != end; ++i)
			*dest++ = casefold(*i);
	}
	else
	{
		// Shift 'A'..'Z' range ([65..90]) to [102..127] to use one signed comparison insn
		__m128i shiftAmount = _mm_set1_epi8(127 - 'Z');
		__m128i lowerBound = _mm_set1_epi8(127 - ('Z' - 'A') - 1);
		__m128i upperBit = _mm_set1_epi8(0x20);

		const char* i = begin;

		for (; i + 16 < end; i += 16)
		{
			__m128i v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(i));
			__m128i upperMask = _mm_cmpgt_epi8(_mm_add_epi8(v, shiftAmount), lowerBound);
			__m128i cfv = _mm_or_si128(v, _mm_and_si128(upperMask, upperBit));
			_mm_storeu_si128(reinterpret_cast<__m128i*>(dest), cfv);
			dest += 16;
		}

		for (; i != end; ++i)
			*dest++ = casefold(*i);
	}
}
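A note on the trick in Example #2: SSE2 only has signed byte compares, so the code biases each byte by 127 - 'Z', which puts 'A'..'Z' at [102..127], the very top of the signed range, where a single _mm_cmpgt_epi8 against 101 detects it; OR-ing 0x20 into the matching bytes then sets the lowercase bit. A minimal scalar model of the predicate with an exhaustive check (casefold() in the example is a helper the snippet assumes):

#include <stdint.h>
#include <assert.h>

/* Scalar model of the SIMD predicate in Example #2: one signed compare
   detects 'A'..'Z'. Adding 127 - 'Z' maps [65..90] onto [102..127]; every
   other byte value wraps to <= 101 when re-read as signed 8-bit. */
static int is_upper_signed_trick(uint8_t c)
{
    int8_t shifted = (int8_t)(c + (127 - 'Z'));   /* wraps mod 256 */
    return shifted > 127 - ('Z' - 'A') - 1;       /* i.e. > 101 */
}

int main(void)
{
    for (int c = 0; c < 256; ++c)
        assert(is_upper_signed_trick((uint8_t)c) == (c >= 'A' && c <= 'Z'));
    return 0;
}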
Example #3
void FragmentAttributesBuffer::SetAll(const FragmentAttributes &attr)
{
	size_t i = 0;
	
#ifdef ENABLE_SSE2
	const __m128i attrDepth_vec128				= _mm_set1_epi32(attr.depth);
	const __m128i attrOpaquePolyID_vec128		= _mm_set1_epi8(attr.opaquePolyID);
	const __m128i attrTranslucentPolyID_vec128	= _mm_set1_epi8(attr.translucentPolyID);
	const __m128i attrStencil_vec128			= _mm_set1_epi8(attr.stencil);
	const __m128i attrIsFogged_vec128			= _mm_set1_epi8(attr.isFogged);
	const __m128i attrIsTranslucentPoly_vec128	= _mm_set1_epi8(attr.isTranslucentPoly);
	
	const size_t sseCount = count - (count % 16);
	for (; i < sseCount; i += 16)
	{
		_mm_stream_si128((__m128i *)(this->depth + i +  0), attrDepth_vec128);
		_mm_stream_si128((__m128i *)(this->depth + i +  4), attrDepth_vec128);
		_mm_stream_si128((__m128i *)(this->depth + i +  8), attrDepth_vec128);
		_mm_stream_si128((__m128i *)(this->depth + i + 12), attrDepth_vec128);
		
		_mm_stream_si128((__m128i *)(this->opaquePolyID + i), attrOpaquePolyID_vec128);
		_mm_stream_si128((__m128i *)(this->translucentPolyID + i), attrTranslucentPolyID_vec128);
		_mm_stream_si128((__m128i *)(this->stencil + i), attrStencil_vec128);
		_mm_stream_si128((__m128i *)(this->isFogged + i), attrIsFogged_vec128);
		_mm_stream_si128((__m128i *)(this->isTranslucentPoly + i), attrIsTranslucentPoly_vec128);
	}
#endif
	
	for (; i < count; i++)
	{
		this->SetAtIndex(i, attr);
	}
}
Example #4
void
mulrc4_shuffle_ssse3(uint8_t *region, uint8_t constant, size_t length)
{
	uint8_t *end;
	register __m128i in, out, t1, t2, m1, m2, l, h;

	if (constant == 0) {
		memset(region, 0, length);
		return;
	}

	if (constant == 1)
		return;

	t1 = _mm_loadu_si128((void *)tl[constant]);
	t2 = _mm_slli_epi64(t1, 4);
	m1 = _mm_set1_epi8(0x0f);
	m2 = _mm_set1_epi8(0xf0);

	for (end=region+length; region<end; region+=16) {
		in = _mm_load_si128((void *)region);
		l = _mm_and_si128(in, m1);
		l = _mm_shuffle_epi8(t1, l);
		h = _mm_and_si128(in, m2);
		h = _mm_srli_epi64(h, 4);
		h = _mm_shuffle_epi8(t2, h);
		out = _mm_xor_si128(h, l);
		_mm_store_si128((void *)region, out);
	}
}
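A note on Examples #1 and #4: these come from a GF(16) region-coding library, where every byte packs two 4-bit field elements and tl[constant] holds the sixteen products constant*x for x = 0..15. Because each product fits in a nibble, t2 = t1 << 4 is the same table repositioned for the high nibbles, so two PSHUFBs plus an XOR multiply 32 field elements per vector. A scalar sketch of the same multiply; gf16_mul is a hypothetical stand-in for the library's precomputed tables, and the reduction polynomial x^4 + x + 1 is an assumption:

#include <stdint.h>
#include <stddef.h>

/* Carry-less multiply of two 4-bit values, reduced mod x^4 + x + 1. */
static uint8_t gf16_mul(uint8_t a, uint8_t b)
{
    uint8_t r = 0;
    a &= 0x0f;
    b &= 0x0f;
    for (int i = 0; i < 4; ++i) {
        if (b & 1)
            r ^= a;
        b >>= 1;
        a <<= 1;
        if (a & 0x10)
            a ^= 0x13;   /* reduce: x^4 == x + 1 */
    }
    return r;
}

/* Scalar equivalent of the main loop of Example #4: multiply both packed
   nibbles of every byte by the constant. */
static void mulrc4_scalar(uint8_t *region, uint8_t constant, size_t length)
{
    for (size_t i = 0; i < length; ++i) {
        uint8_t lo = gf16_mul(constant, region[i] & 0x0f);
        uint8_t hi = gf16_mul(constant, region[i] >> 4);
        region[i] = (uint8_t)((hi << 4) | lo);   /* the role of t2 = t1 << 4 */
    }
}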
Example #5
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
                                       int num_pixels, uint8_t* dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);    // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);    // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);    // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);    // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);   // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);   // r0...r7 | b0...b7
    const __m128i ga1 = _mm_srli_epi16(ga0, 4);         // g0-|g1-|...|a6-|a7-
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);  // -r0|-r1|...|-b6|-b7
    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);  // g0-|g1-|...|a6-|a7-
    const __m128i rgba0 = _mm_or_si128(ga2, rb1);       // rg0..rg7 | ba0..ba7
    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);     // ba0..ba7 | 0
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0);  // barg0...barg7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1);  // rgba0...rgba7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
Example #6
static int GetResidualCostSSE2(int ctx0, const VP8Residual* const res) {
  uint8_t levels[16], ctxs[16];
  uint16_t abs_levels[16];
  int n = res->first;
  // should be prob[VP8EncBands[n]], but it's equivalent for n=0 or 1
  const int p0 = res->prob[n][ctx0][0];
  CostArrayPtr const costs = res->costs;
  const uint16_t* t = costs[n][ctx0];
  // bit_cost(1, p0) is already incorporated in t[] tables, but only if ctx != 0
  // (as required by the syntax). For ctx0 == 0, we need to add it here or it'll
  // be missing during the loop.
  int cost = (ctx0 == 0) ? VP8BitCost(1, p0) : 0;

  if (res->last < 0) {
    return VP8BitCost(0, p0);
  }

  {   // precompute clamped levels and contexts, packed to 8b.
    const __m128i zero = _mm_setzero_si128();
    const __m128i kCst2 = _mm_set1_epi8(2);
    const __m128i kCst67 = _mm_set1_epi8(MAX_VARIABLE_LEVEL);
    const __m128i c0 = _mm_loadu_si128((const __m128i*)&res->coeffs[0]);
    const __m128i c1 = _mm_loadu_si128((const __m128i*)&res->coeffs[8]);
    const __m128i D0 = _mm_sub_epi16(zero, c0);
    const __m128i D1 = _mm_sub_epi16(zero, c1);
    const __m128i E0 = _mm_max_epi16(c0, D0);   // abs(v), 16b
    const __m128i E1 = _mm_max_epi16(c1, D1);
    const __m128i F = _mm_packs_epi16(E0, E1);
    const __m128i G = _mm_min_epu8(F, kCst2);    // context = 0,1,2
    const __m128i H = _mm_min_epu8(F, kCst67);   // clamp_level in [0..67]

    _mm_storeu_si128((__m128i*)&ctxs[0], G);
    _mm_storeu_si128((__m128i*)&levels[0], H);

    _mm_storeu_si128((__m128i*)&abs_levels[0], E0);
    _mm_storeu_si128((__m128i*)&abs_levels[8], E1);
  }
  for (; n < res->last; ++n) {
    const int ctx = ctxs[n];
    const int level = levels[n];
    const int flevel = abs_levels[n];   // full level
    cost += VP8LevelFixedCosts[flevel] + t[level];  // simplified VP8LevelCost()
    t = costs[n + 1][ctx];
  }
  // Last coefficient is always non-zero
  {
    const int level = levels[n];
    const int flevel = abs_levels[n];
    assert(flevel != 0);
    cost += VP8LevelFixedCosts[flevel] + t[level];
    if (n < 15) {
      const int b = VP8EncBands[n + 1];
      const int ctx = ctxs[n];
      const int last_p0 = res->prob[b][ctx][0];
      cost += VP8BitCost(0, last_p0);
    }
  }
  return cost;
}
Example #7
void png_read_filter_row_avg3_sse2(png_row_infop row_info, png_bytep row,
   png_const_bytep prev)
{
   /* The Avg filter predicts each pixel as the (truncated) average of a and b.
    * There's no pixel to the left of the first pixel.  Luckily, it's
    * predicted to be half of the pixel above it.  So again, this works
    * perfectly with our loop if we make sure a starts at zero.
    */

   png_size_t rb;

   const __m128i zero = _mm_setzero_si128();

   __m128i    b;
   __m128i a, d = zero;

   png_debug(1, "in png_read_filter_row_avg3_sse2");
   rb = row_info->rowbytes;
   while (rb >= 4) {
      __m128i avg;
             b = load4(prev);
      a = d; d = load4(row );

      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
      avg = _mm_avg_epu8(a,b);
      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
                                            _mm_set1_epi8(1)));
      d = _mm_add_epi8(d, avg);
      store3(row, d);

      prev += 3;
      row  += 3;
      rb   -= 3;
   }
   if (rb > 0) {
      __m128i avg;
             b = load3(prev);
      a = d; d = load3(row );

      /* PNG requires a truncating average, so we can't just use _mm_avg_epu8 */
      avg = _mm_avg_epu8(a,b);
      /* ...but we can fix it up by subtracting off 1 if it rounded up. */
      avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b),
                                            _mm_set1_epi8(1)));

      d = _mm_add_epi8(d, avg);
      store3(row, d);

      prev += 3;
      row  += 3;
      rb   -= 3;
   }
}
Example #8
// input and output are int8_t
static WEBP_INLINE void DoSimpleFilter(__m128i* const p0, __m128i* const q0,
                                       const __m128i* const fl) {
    const __m128i k3 = _mm_set1_epi8(3);
    const __m128i k4 = _mm_set1_epi8(4);
    __m128i v3 = _mm_adds_epi8(*fl, k3);
    __m128i v4 = _mm_adds_epi8(*fl, k4);

    SignedShift8b(&v4);                  // v4 >> 3
    SignedShift8b(&v3);                  // v3 >> 3
    *q0 = _mm_subs_epi8(*q0, v4);        // q0 -= v4
    *p0 = _mm_adds_epi8(*p0, v3);        // p0 += v3
}
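The +3/+4 pair in Example #8 is the VP8 simple filter's rounding scheme: with filter value f, the sides are updated as q0 -= (f + 4) >> 3 and p0 += (f + 3) >> 3, the two downshifts biased differently so the total adjustment splits as evenly as possible between the two pixels. A scalar sketch of the update (the saturating byte arithmetic of _mm_adds_epi8/_mm_subs_epi8 is omitted here):

#include <stdint.h>

/* Scalar model of DoSimpleFilter in Example #8: SignedShift8b emulates an
   arithmetic >> 3 on bytes, which plain C gives us directly on int. */
static void do_simple_filter_scalar(int8_t *p0, int8_t *q0, int f)
{
    int v4 = (f + 4) >> 3;   /* the q0-side correction */
    int v3 = (f + 3) >> 3;   /* the p0-side correction */
    *q0 = (int8_t)(*q0 - v4);
    *p0 = (int8_t)(*p0 + v3);
}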
Example #9
static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
                        const __m128i* q1, int thresh, __m128i *mask) {
    __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
    *mask = _mm_set1_epi8(0xFE);
    t1 = _mm_and_si128(t1, *mask);        // set lsb of each byte to zero
    t1 = _mm_srli_epi16(t1, 1);           // abs(p1 - q1) / 2

    *mask = MM_ABS(*p0, *q0);             // abs(p0 - q0)
    *mask = _mm_adds_epu8(*mask, *mask);  // abs(p0 - q0) * 2
    *mask = _mm_adds_epu8(*mask, t1);     // abs(p0 - q0) * 2 + abs(p1 - q1) / 2

    t1 = _mm_set1_epi8(thresh);
    *mask = _mm_subs_epu8(*mask, t1);     // mask <= thresh
    *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
}
Example #10
/**
 * Adds 2 pixels (in a 2x1 grid) and multiplies them by 4, resulting in a more
 * precise version of box-filter 4:2:2 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_422_lbd_ssse3(const uint8_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const __m128i fours = _mm_set1_epi8(4);
  __m128i *pred_buf_m128i = (__m128i *)pred_buf_q3;
  const __m128i *end = pred_buf_m128i + height * CFL_BUF_LINE_I128;
  do {
    if (width == 4) {
      __m128i top = _mm_loadh_epi32((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storeh_epi32(pred_buf_m128i, top);
    } else if (width == 8) {
      __m128i top = _mm_loadl_epi64((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storel_epi64(pred_buf_m128i, top);
    } else {
      __m128i top = _mm_loadu_si128((__m128i *)input);
      top = _mm_maddubs_epi16(top, fours);
      _mm_storeu_si128(pred_buf_m128i, top);
      if (width == 32) {
        __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        top_1 = _mm_maddubs_epi16(top_1, fours);
        _mm_storeu_si128(pred_buf_m128i + 1, top_1);
      }
    }
    input += input_stride;
    pred_buf_m128i += CFL_BUF_LINE_I128;
  } while (pred_buf_m128i < end);
}
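In Example #10, _mm_maddubs_epi16 with a vector of all 4s multiplies each unsigned byte by 4 and sums horizontally adjacent pairs, i.e. it produces (a + b) * 4 per 2x1 pixel pair: the pair average (a + b) / 2 scaled into Q3. (_mm_loadh_epi32 and _mm_storeh_epi32 appear to be file-local helpers in the aom source, not standard intrinsics.) A scalar model, with buf_stride standing in for CFL_BUF_LINE:

#include <stdint.h>

/* Scalar model of the 4:2:2 kernel in Example #10: one Q3 output per
   horizontal pair of input pixels. No 16-bit overflow is possible since
   (255 + 255) * 4 < 32767. */
static void cfl_subsample_422_scalar(const uint8_t *input, int input_stride,
                                     uint16_t *pred_buf_q3, int buf_stride,
                                     int width, int height)
{
    for (int y = 0; y < height; ++y)
        for (int x = 0; x < width / 2; ++x) {
            uint16_t a = input[y * input_stride + 2 * x + 0];
            uint16_t b = input[y * input_stride + 2 * x + 1];
            pred_buf_q3[y * buf_stride + x] = (uint16_t)((a + b) << 2);
        }
}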
Example #11
        template <bool align> void EdgeBackgroundAdjustRangeMasked(uint8_t * backgroundCount, size_t backgroundCountStride, size_t width, size_t height,
             uint8_t * backgroundValue, size_t backgroundValueStride, uint8_t threshold, const uint8_t * mask, size_t maskStride)
        {
            assert(width >= A);
            if(align)
            {
                assert(Aligned(backgroundValue) && Aligned(backgroundValueStride));
                assert(Aligned(backgroundCount) && Aligned(backgroundCountStride));
                assert(Aligned(mask) && Aligned(maskStride));
            }

            const __m128i _threshold = _mm_set1_epi8((char)threshold);
            size_t alignedWidth = AlignLo(width, A);
            __m128i tailMask = ShiftLeft(K8_01, A - width + alignedWidth);
            for(size_t row = 0; row < height; ++row)
            {
                for(size_t col = 0; col < alignedWidth; col += A)
                    EdgeBackgroundAdjustRangeMasked<align>(backgroundCount, backgroundValue, mask, col, _threshold, K8_01);
                if(alignedWidth != width)
                    EdgeBackgroundAdjustRangeMasked<false>(backgroundCount, backgroundValue, mask, width - A, _threshold, tailMask);
                backgroundValue += backgroundValueStride;
                backgroundCount += backgroundCountStride;
                mask += maskStride;
            }		
        }
Example #12
template <int bpp>
void sk_avg_sse2(png_row_infop row_info, uint8_t* row, const uint8_t* prev) {
    // The Avg filter predicts each pixel as the (truncated) average of a and b.
    // There's no pixel to the left of the first pixel.  Luckily, it's
    // predicted to be half of the pixel above it.  So again, this works
    // perfectly with our loop if we make sure a starts at zero.
    const __m128i zero = _mm_setzero_si128();
    __m128i    b;
    __m128i a, d = zero;

    int rb = row_info->rowbytes;
    while (rb > 0) {
        b = load<bpp>(prev);
        a = d;
        d = load<bpp>(row );

        // PNG requires a truncating average here, so sadly we can't just use _mm_avg_epu8...
        __m128i avg = _mm_avg_epu8(a,b);
        // ...but we can fix it up by subtracting off 1 if it rounded up.
        avg = _mm_sub_epi8(avg, _mm_and_si128(_mm_xor_si128(a,b), _mm_set1_epi8(1)));

        d = _mm_add_epi8(d, avg);
        store<bpp>(row, d);

        prev += bpp;
        row  += bpp;
        rb   -= bpp;
    }
}
Example #13
static void
clamphigh_u8_sse (uint8_t *dest, const uint8_t *src1, int n,
    const uint8_t *src2_1)
{
  __m128i xmm1;
  uint8_t max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set1_epi8(max);
  for (; n >= 16; n -= 16) {
    __m128i xmm0;
    xmm0 = _mm_loadu_si128((__m128i *)src1);
    xmm0 = _mm_min_epu8(xmm0, xmm1);
    _mm_store_si128((__m128i *)dest, xmm0);
    dest += 16;
    src1 += 16;
  }
  for (; n > 0; n--) {
    uint8_t x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
Example #14
void imageFilterMean_SSE2(unsigned char *src1, unsigned char *src2, unsigned char *dst, int length)
{
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst
    while( (((long)dst & 0xF) > 0) && (n > 0) ) {
        MEAN_PIXEL();
        --n; ++dst; ++src1; ++src2;
    }

    // Do bulk of processing using SSE2 (find the mean of 16 8-bit unsigned integers, with saturation)
    __m128i mask = _mm_set1_epi8(0x7F);
    while(n >= 16) {
        __m128i s1 = _mm_loadu_si128((__m128i*)src1);
        s1 = _mm_srli_epi16(s1, 1); // shift right 1
        s1 = _mm_and_si128(s1, mask); // apply byte-mask
        __m128i s2 = _mm_loadu_si128((__m128i*)src2);
        s2 = _mm_srli_epi16(s2, 1); // shift right 1
        s2 = _mm_and_si128(s2, mask); // apply byte-mask
        __m128i r = _mm_adds_epu8(s1, s2);
        _mm_store_si128((__m128i*)dst, r);

        n -= 16; src1 += 16; src2 += 16; dst += 16;
    }

    // If any bytes are left over, deal with them individually
    ++n;
    BASIC_MEAN();
}
Example #15
__m128i aes_schedule_round(__m128i* rcon, __m128i input1, __m128i input2)
   {
   if(rcon)
      {
      input2 = _mm_xor_si128(_mm_alignr_epi8(_mm_setzero_si128(), *rcon, 15),
                             input2);

      *rcon = _mm_alignr_epi8(*rcon, *rcon, 15); // next rcon

      input1 = _mm_shuffle_epi32(input1, 0xFF); // rotate
      input1 = _mm_alignr_epi8(input1, input1, 1);
      }

   __m128i smeared = _mm_xor_si128(input2, _mm_slli_si128(input2, 4));
   smeared = mm_xor3(smeared, _mm_slli_si128(smeared, 8), _mm_set1_epi8(0x5B));

   __m128i t = _mm_srli_epi32(_mm_andnot_si128(low_nibs, input1), 4);

   input1 = _mm_and_si128(low_nibs, input1);

   __m128i t2 = _mm_shuffle_epi8(k_inv2, input1);

   input1 = _mm_xor_si128(input1, t);

   __m128i t3 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, t));
   __m128i t4 = _mm_xor_si128(t2, _mm_shuffle_epi8(k_inv1, input1));

   __m128i t5 = _mm_xor_si128(input1, _mm_shuffle_epi8(k_inv1, t3));
   __m128i t6 = _mm_xor_si128(t, _mm_shuffle_epi8(k_inv1, t4));

   return mm_xor3(_mm_shuffle_epi8(sb1u, t5),
                  _mm_shuffle_epi8(sb1t, t6),
                  smeared);
   }
Example #16
static int HafCpu_Histogram3Thresholds_DATA_U8
	(
		vx_uint32     dstHist[],
		vx_uint8      distThreshold0,
		vx_uint8      distThreshold1,
		vx_uint8      distThreshold2,
		vx_uint32     srcWidth,
		vx_uint32     srcHeight,
		vx_uint8    * pSrcImage,
		vx_uint32     srcImageStrideInBytes
	)
{
	// offset: to convert the range from 0..255 to -128..127, because SSE does not have compare instructions for unsigned bytes
	// thresh: source threshold in -128..127 range
	__m128i offset = _mm_set1_epi8((char)0x80);
	__m128i T0 = _mm_set1_epi8((char)((distThreshold0 - 1) ^ 0x80));
	__m128i T1 = _mm_set1_epi8((char)((distThreshold1 - 1) ^ 0x80));
	__m128i T2 = _mm_set1_epi8((char)((distThreshold2 - 1) ^ 0x80));
	__m128i onemask = _mm_set1_epi8((char)1);
	// accumulators that count "pixel < threshold", updated one row at a time
	__m128i count0 = _mm_set1_epi8((char)0);
	__m128i count1 = _mm_set1_epi8((char)0);
	__m128i count2 = _mm_set1_epi8((char)0);
	vx_uint8 * srcRow = pSrcImage;
	vx_uint32 width = (srcWidth + 15) >> 4;
	for (unsigned int y = 0; y < srcHeight; y++) {
		__m128i * src = (__m128i *)srcRow;
		for (unsigned int x = 0; x < width; x++) {
			__m128i pixels = _mm_load_si128(src++);
			pixels = _mm_xor_si128(pixels, offset);
			__m128i cmpout;
			cmpout = _mm_cmpgt_epi8(pixels, T0);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count0 = _mm_add_epi32(count0, cmpout);
			cmpout = _mm_cmpgt_epi8(pixels, T1);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count1 = _mm_add_epi32(count1, cmpout);
			cmpout = _mm_cmpgt_epi8(pixels, T2);
			cmpout = _mm_and_si128(cmpout, onemask);
			cmpout = _mm_sad_epu8(cmpout, onemask);
			count2 = _mm_add_epi32(count2, cmpout);
		}
		srcRow += srcImageStrideInBytes;
	}
	// extract histogram from count: special case needed when T1 == T2
	dstHist[0] = M128I(count0).m128i_u32[0] + M128I(count0).m128i_u32[2];
	dstHist[1] = M128I(count1).m128i_u32[0] + M128I(count1).m128i_u32[2] - dstHist[0];
	dstHist[2] = M128I(count2).m128i_u32[0] + M128I(count2).m128i_u32[2] - dstHist[0] - dstHist[1];
	dstHist[3] = srcWidth * srcHeight - dstHist[0] - dstHist[1] - dstHist[2];
	if (M128I(T1).m128i_i8[0] == M128I(T2).m128i_i8[0]) {
		dstHist[2] = dstHist[3];
		dstHist[3] = 0;
	}
	return AGO_SUCCESS;
}
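The counting idiom above is compact but subtle. XOR with 0x80 maps unsigned byte order onto signed order, so _mm_cmpgt_epi8 against (T - 1) ^ 0x80 tests pixel >= T; the AND with 1 turns each 0xFF into 1; and _mm_sad_epu8 against a vector of ones sums |bit - 1| per byte, which counts the bytes where the compare failed, i.e. the pixels strictly below T, into two 16-bit partial sums. A scalar model of the count (assuming T >= 1, as the example's T0..T2 imply):

#include <stdint.h>

/* Scalar model of the per-byte count in Example #16. */
static uint32_t count_below(const uint8_t *p, uint32_t n, uint8_t T)
{
    uint32_t count = 0;
    for (uint32_t i = 0; i < n; ++i) {
        /* signed compare after the 0x80 bias == unsigned p[i] > T - 1 */
        int ge = (int8_t)(p[i] ^ 0x80) > (int8_t)((T - 1) ^ 0x80);
        count += (uint32_t)(1 - ge);   /* the |x - 1| term of the SAD */
    }
    return count;
}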
Example #17
    SIMDValue SIMDInt8x16Operation::OpSplat(int8 x)
    {
        X86SIMDValue x86Result;
        // set 16 signed 8-bit integers values to input value x
        x86Result.m128i_value = _mm_set1_epi8(x);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example #18
// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4(__m128i* p1, __m128i *p0,
                                  __m128i* q0, __m128i* q1,
                                  const __m128i* mask, int hev_thresh) {
    __m128i not_hev;
    __m128i t1, t2, t3;
    const __m128i sign_bit = _mm_set1_epi8(0x80);

    // compute hev mask
    GET_NOTHEV(*p1, *p0, *q0, *q1, hev_thresh, not_hev);

    // convert to signed values
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

    t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
    t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
    t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
    t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

    // Do +4 side
    t2 = _mm_set1_epi8(4);
    t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 4
    SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
    t3 = t2;                           // save t2
    *q0 = _mm_subs_epi8(*q0, t2);      // q0 -= t2

    // Now do +3 side
    t2 = _mm_set1_epi8(3);
    t2 = _mm_adds_epi8(t1, t2);        // +3 instead of +4
    SIGNED_SHIFT_N(t2, 3);             // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
    *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2

    t2 = _mm_set1_epi8(1);
    t3 = _mm_adds_epi8(t3, t2);
    SIGNED_SHIFT_N(t3, 1);             // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 4

    t3 = _mm_and_si128(not_hev, t3);   // if !hev
    *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
    *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3

    // unoffset
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);
}
Example #19
static WEBP_INLINE void Average2_m128i(const __m128i* const a0,
                                       const __m128i* const a1,
                                       __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i avg1 = _mm_avg_epu8(*a0, *a1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(*a0, *a1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}
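The one-line comment above is the whole trick, shared with Examples #7, #12 and #29: _mm_avg_epu8 computes the rounding average (a + b + 1) >> 1, and the rounding happened exactly when a and b differ in their lowest bit, so subtracting (a ^ b) & 1 recovers the truncating average that PNG and WebP specify. An exhaustive scalar check of the identity:

#include <assert.h>

int main(void)
{
    for (int a = 0; a < 256; ++a)
        for (int b = 0; b < 256; ++b) {
            int rounding = (a + b + 1) >> 1;            /* _mm_avg_epu8      */
            int fixup = (a ^ b) & 1;                    /* _mm_xor + _mm_and */
            assert(rounding - fixup == ((a + b) >> 1)); /* _mm_sub_epi8      */
        }
    return 0;
}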
Example #20
  Bitboard operator ~ () const {
#if defined (HAVE_SSE2) || defined (HAVE_SSE4)
    Bitboard tmp;
    _mm_store_si128(&tmp.m_, _mm_andnot_si128(this->m_, _mm_set1_epi8(static_cast<char>(0xffu))));
    return tmp;
#else
    return Bitboard(~this->p(0), ~this->p(1));
#endif
  }
Example #21
void
png_read_filter_row_avg4_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_size_t i;
   __m128i* rp = (__m128i*)row;
   const __m128i* prp = (const __m128i*)prev_row;
   __m128i pixel = _mm_setzero_si128();
   const __m128i mask = _mm_set1_epi8(0x01);

   for (i = (row_info->rowbytes + 15) >> 4; i > 0; i--)
   {
      __m128i prb = _mm_load_si128(prp++);
      __m128i rb = _mm_load_si128(rp);

      // First pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 4);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      // Second pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 4);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      // Third pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
      prb = _mm_srli_si128(prb, 4);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      // Fourth pixel
      pixel = calculate_pixel_avg(rb, prb, pixel, mask);
#ifndef __SSSE3__
      rb = _mm_srli_si128(rb, 4);
      rb = _mm_or_si128(rb, _mm_slli_si128(pixel, 12));
#else
      rb = _mm_alignr_epi8(pixel, rb, 4);
#endif

      _mm_store_si128(rp++, rb);
   }
}
Example #22
// input pixels are uint8_t
static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
                                    const __m128i* const p0,
                                    const __m128i* const q0,
                                    const __m128i* const q1,
                                    int thresh, __m128i* const mask) {
    const __m128i m_thresh = _mm_set1_epi8(thresh);
    const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
    const __m128i kFE = _mm_set1_epi8(0xFE);
    const __m128i t2 = _mm_and_si128(t1, kFE);  // set lsb of each byte to zero
    const __m128i t3 = _mm_srli_epi16(t2, 1);   // abs(p1 - q1) / 2

    const __m128i t4 = MM_ABS(*p0, *q0);        // abs(p0 - q0)
    const __m128i t5 = _mm_adds_epu8(t4, t4);   // abs(p0 - q0) * 2
    const __m128i t6 = _mm_adds_epu8(t5, t3);   // abs(p0-q0)*2 + abs(p1-q1)/2

    const __m128i t7 = _mm_subs_epu8(t6, m_thresh);  // mask <= m_thresh
    *mask = _mm_cmpeq_epi8(t7, _mm_setzero_si128());
}
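Two idioms in Examples #9 and #22 are worth noting. SSE2 has no 8-bit shift, so the code clears each byte's low bit (the 0xFE mask) before a 16-bit shift, keeping bits from leaking in from the neighboring byte. And SSE2 has no unsigned byte compare, so "x <= thresh" is computed as a saturating subtract followed by a compare against zero. An exhaustive scalar check of the second idiom:

#include <stdint.h>
#include <assert.h>

/* x <= t for unsigned bytes, via _mm_subs_epu8 then _mm_cmpeq_epi8. */
static int leq_via_subs(uint8_t x, uint8_t t)
{
    uint8_t d = (x > t) ? (uint8_t)(x - t) : 0;   /* saturating x - t */
    return d == 0;
}

int main(void)
{
    for (int x = 0; x < 256; ++x)
        for (int t = 0; t < 256; ++t)
            assert(leq_via_subs((uint8_t)x, (uint8_t)t) == (x <= t));
    return 0;
}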
Example #23
int countZeroBytes_SSE(char* values, int length) {
    int zeroCount = 0;
    __m128i zero16 = _mm_set1_epi8(0);
    __m128i and16 = _mm_set1_epi8(1);
    for(int i=0; i<length; i+=16) {
        __m128i values16 = _mm_loadu_si128((__m128i*)&values[i]);
        __m128i cmp = _mm_cmpeq_epi8(values16, zero16);
        if(_mm_movemask_epi8(cmp)) {
            cmp = _mm_and_si128(and16, cmp); //change -1 values to 1
            //horizontal sum of 16 bytes
            __m128i sum1 = _mm_sad_epu8(cmp,zero16);
            __m128i sum2 = _mm_shuffle_epi32(sum1,2);
            __m128i sum3 = _mm_add_epi16(sum1,sum2);
            zeroCount += _mm_cvtsi128_si32(sum3);
        }
    }
    return zeroCount;
}
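Example #23 reads 16 bytes per iteration, so it silently assumes length is a multiple of 16, and it performs the AND/SAD/shuffle reduction even though _mm_movemask_epi8 has already condensed the comparison. A variant sketch that handles arbitrary lengths and counts via a population count instead (__builtin_popcount is a GCC/Clang assumption; MSVC would use __popcnt):

#include <stdint.h>
#include <emmintrin.h>

int countZeroBytes_movemask(const char *values, int length)
{
    int zeroCount = 0;
    const __m128i zero16 = _mm_setzero_si128();
    int i = 0;
    for (; i + 16 <= length; i += 16) {
        const __m128i v = _mm_loadu_si128((const __m128i *)&values[i]);
        const int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(v, zero16));
        zeroCount += __builtin_popcount((unsigned)mask);  /* one bit per zero byte */
    }
    for (; i < length; ++i)   /* scalar tail */
        zeroCount += (values[i] == 0);
    return zeroCount;
}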
Example #24
void blend_sse2(const Uint8* alpha, const Uint32 size, const Uint8* source0,
	const Uint8* source1, Uint8* dest)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
	Uint32 i;

	for (i = 0; i < (size / 4); i++)
	{
		t0 = _mm_load_si128((__m128i*)&source0[i * 16]);
		t1 = _mm_load_si128((__m128i*)&source1[i * 16]);
		t2 = (__m128i)_mm_load_ss((float*)&alpha[i * 4]);

		t2 = _mm_unpacklo_epi8(t2, t2);
		t2 = _mm_unpacklo_epi16(t2, t2);

		t3 = _mm_unpacklo_epi8(t0, t0);
		t4 = _mm_unpacklo_epi8(t1, t1);

		t5 = _mm_unpacklo_epi32(t2, t2);
		t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);

		t7 = _mm_mulhi_epu16(t3, t6);
		t8 = _mm_mulhi_epu16(t4, t5);

		t9 = _mm_adds_epu16(t7, t8);
		t9 = _mm_srli_epi16(t9, 8);

		t3 = _mm_unpackhi_epi8(t0, t0);
		t4 = _mm_unpackhi_epi8(t1, t1);

		t5 = _mm_unpackhi_epi32(t2, t2);
		t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);

		t7 = _mm_mulhi_epu16(t3, t6);
		t8 = _mm_mulhi_epu16(t4, t5);

		t10 = _mm_adds_epu16(t7, t8);
		t10 = _mm_srli_epi16(t10, 8);

		t10 = _mm_packus_epi16(t9, t10);

		_mm_stream_si128((__m128i*)&dest[i * 16], t10);
	}
}
Example #25
mlib_status
__mlib_VectorConvert_S8_U8_Sat(
	mlib_s8 *z,
	const mlib_u8 *x,
	mlib_s32 n)
{
	if (n < 1)
		return (MLIB_FAILURE);

	mlib_s32 i, ax, az, nstep, n1, n2, n3, xval;
	mlib_u8 *px = (mlib_u8 *)x;
	mlib_s8 *pz = (mlib_s8 *)z;
	__m128i zbuf, xbuf, mask;
	mask = _mm_set1_epi8(127);

	ax = (mlib_addr)x & 15;
	az = (mlib_addr)z & 15;

	nstep = 16 / sizeof (mlib_u8);
	n1 = ((16 - ax) & 15) / sizeof (mlib_u8);
	n2 = (n - n1) / nstep;
	n3 = n - n1 - n2 * nstep;

	if (n2 < 1) {
		for (i = 0; i < n; i++) {
			xval = *px++;
			if (xval > 127)
				xval = 127;
			*pz++ = xval;
		}
	} else {
		for (i = 0; i < n1; i++) {
			xval = *px++;
			if (xval > 127)
				xval = 127;
			*pz++ = xval;
		}

		for (i = 0; i < n2; i++) {
			xbuf = _mm_load_si128((__m128i *)px);
			zbuf = _mm_min_epu8(xbuf, mask);
			_mm_storeu_si128((__m128i *)pz, zbuf);
			px += nstep;
			pz += nstep;
		}

		for (i = 0; i < n3; i++) {
			xval = *px++;
			if (xval > 127)
				xval = 127;
			*pz++ = xval;
		}
	}

	return (MLIB_SUCCESS);
}
Example #26
__m128i aes_schedule_mangle_last_dec(__m128i k)
   {
   const __m128i deskew1 = _mm_set_epi32(
      0x1DFEB95A, 0x5DBEF91A, 0x07E4A340, 0x47A4E300);
   const __m128i deskew2 = _mm_set_epi32(
      0x2841C2AB, 0xF49D1E77, 0x5F36B5DC, 0x83EA6900);

   k = _mm_xor_si128(k, _mm_set1_epi8(0x5B));
   return aes_schedule_transform(k, deskew1, deskew2);
   }
Example #27
static void HE16(uint8_t* dst) {     // horizontal
  int j;
  const __m128i kShuffle3 = _mm_set1_epi8(3);
  for (j = 16; j > 0; --j) {
    const __m128i in = _mm_cvtsi32_si128(*(int*)(dst - 4));
    const __m128i values = _mm_shuffle_epi8(in, kShuffle3);
    _mm_storeu_si128((__m128i*)dst, values);
    dst += BPS;
  }
}
Example #28
mlib_status
__mlib_VectorSet_S8(
	mlib_s8 *z,
	const mlib_s8 *c,
	mlib_s32 n)
{
	mlib_s8 c0 = *c;
	__m128i val = _mm_set1_epi8(c0);
	SET_VALUE(mlib_s8);
}
Example #29
static WEBP_INLINE void Average2_uint32(const uint32_t a0, const uint32_t a1,
                                        __m128i* const avg) {
  // (a + b) >> 1 = ((a + b + 1) >> 1) - ((a ^ b) & 1)
  const __m128i ones = _mm_set1_epi8(1);
  const __m128i A0 = _mm_cvtsi32_si128(a0);
  const __m128i A1 = _mm_cvtsi32_si128(a1);
  const __m128i avg1 = _mm_avg_epu8(A0, A1);
  const __m128i one = _mm_and_si128(_mm_xor_si128(A0, A1), ones);
  *avg = _mm_sub_epi8(avg1, one);
}
Example #30
// Applies filter on 4 pixels (p1, p0, q0 and q1)
static WEBP_INLINE void DoFilter4(__m128i* const p1, __m128i* const p0,
                                  __m128i* const q0, __m128i* const q1,
                                  const __m128i* const mask, int hev_thresh) {
    const __m128i sign_bit = _mm_set1_epi8(0x80);
    const __m128i k64 = _mm_set1_epi8(0x40);
    const __m128i zero = _mm_setzero_si128();
    __m128i not_hev;
    __m128i t1, t2, t3;

    // compute hev mask
    GetNotHEV(p1, p0, q0, q1, hev_thresh, &not_hev);

    // convert to signed values
    FLIP_SIGN_BIT4(*p1, *p0, *q0, *q1);

    t1 = _mm_subs_epi8(*p1, *q1);        // p1 - q1
    t1 = _mm_andnot_si128(not_hev, t1);  // hev(p1 - q1)
    t2 = _mm_subs_epi8(*q0, *p0);        // q0 - p0
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 1 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 2 * (q0 - p0)
    t1 = _mm_adds_epi8(t1, t2);          // hev(p1 - q1) + 3 * (q0 - p0)
    t1 = _mm_and_si128(t1, *mask);       // mask filter values we don't care about

    t2 = _mm_set1_epi8(3);
    t3 = _mm_set1_epi8(4);
    t2 = _mm_adds_epi8(t1, t2);        // 3 * (q0 - p0) + (p1 - q1) + 3
    t3 = _mm_adds_epi8(t1, t3);        // 3 * (q0 - p0) + (p1 - q1) + 4
    SignedShift8b(&t2);                // (3 * (q0 - p0) + hev(p1 - q1) + 3) >> 3
    SignedShift8b(&t3);                // (3 * (q0 - p0) + hev(p1 - q1) + 4) >> 3
    *p0 = _mm_adds_epi8(*p0, t2);      // p0 += t2
    *q0 = _mm_subs_epi8(*q0, t3);      // q0 -= t3
    FLIP_SIGN_BIT2(*p0, *q0);

    // this is equivalent to signed (a + 1) >> 1 calculation
    t2 = _mm_add_epi8(t3, sign_bit);
    t3 = _mm_avg_epu8(t2, zero);
    t3 = _mm_sub_epi8(t3, k64);

    t3 = _mm_and_si128(not_hev, t3);   // if !hev
    *q1 = _mm_subs_epi8(*q1, t3);      // q1 -= t3
    *p1 = _mm_adds_epi8(*p1, t3);      // p1 += t3
    FLIP_SIGN_BIT2(*p1, *q1);
}