Example #1
// There's no equivalent in libc, though you'd think there would be... std::mismatch exists, but it's not optimized at all. :(
static inline size_t find_change(const uint16_t * a, const uint16_t * b)
{
	const __m128i * a128=(const __m128i*)a;
	const __m128i * b128=(const __m128i*)b;
	
	while (true)
	{
		__m128i v0 = _mm_loadu_si128(a128);
		__m128i v1 = _mm_loadu_si128(b128);
		__m128i c = _mm_cmpeq_epi32(v0, v1);
		uint32_t mask = _mm_movemask_epi8(c);
		a128++;
		b128++;
		__m128i v0b = _mm_loadu_si128(a128);
		__m128i v1b = _mm_loadu_si128(b128);
		__m128i cb = _mm_cmpeq_epi32(v0b, v1b);
		uint32_t maskb = _mm_movemask_epi8(cb);
		if (mask != 0xffff || maskb != 0xffff) // Something has changed, figure out where.
		{
			if (mask == 0xffff) mask=maskb;
			else a128--;//ignore b128 since we'll return anyways
			size_t ret=(((char*)a128-(char*)a) | (compat_ctz(~mask))) >> 1; // byte offset of the first differing 32-bit lane, as a uint16_t index
			return (ret | (a[ret]==b[ret])); // bump to the odd uint16_t when the even one still matches
		}
		a128++;
		b128++;
	}
}
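A hedged scalar reference for the routine above (my reading of its contract; not from the original project): it returns the index of the first differing uint16_t and, like the SSE version, assumes the buffers differ somewhere. The final `ret | (a[ret]==b[ret])` works because _mm_cmpeq_epi32 compares 32-bit lanes, so `ret` starts out even; OR-ing in the bool bumps it to the odd half when the even uint16_t of the differing lane still matches.

static inline size_t find_change_scalar(const uint16_t * a, const uint16_t * b)
{
	/* Caller guarantees a difference exists; otherwise this never
	   terminates, just like the SSE version. */
	size_t i = 0;
	while (a[i] == b[i]) i++;
	return i;
}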
Example #2
QT_BEGIN_NAMESPACE

bool convert_ARGB_to_ARGB_PM_inplace_sse2(QImageData *data, Qt::ImageConversionFlags)
{
    Q_ASSERT(data->format == QImage::Format_ARGB32);

    // extra pixels on each line
    const int spare = data->width & 3;
    // width in pixels of the pad at the end of each line
    const int pad = (data->bytes_per_line >> 2) - data->width;
    const int iter = data->width >> 2;
    int height = data->height;

    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    const __m128i nullVector = _mm_setzero_si128();
    const __m128i half = _mm_set1_epi16(0x80);
    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

    __m128i *d = reinterpret_cast<__m128i*>(data->data);
    while (height--) {
        const __m128i *end = d + iter;

        for (; d != end; ++d) {
            const __m128i srcVector = _mm_loadu_si128(d);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff) {
                // opaque, data is unchanged
            } else if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) == 0xffff) {
                // fully transparent
                _mm_storeu_si128(d, nullVector);
            } else {
                __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
                alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));

                __m128i result;
                BYTE_MUL_SSE2(result, srcVector, alphaChannel, colorMask, half);
                result = _mm_or_si128(_mm_andnot_si128(alphaMask, result), srcVectorAlpha);
                _mm_storeu_si128(d, result);
            }
        }

        QRgb *p = reinterpret_cast<QRgb*>(d);
        QRgb *pe = p+spare;
        for (; p != pe; ++p) {
            if (*p < 0x00ffffff)
                *p = 0;
            else if (*p < 0xff000000)
                *p = PREMUL(*p);
        }

        d = reinterpret_cast<__m128i*>(p+pad);
    }

    data->format = QImage::Format_ARGB32_Premultiplied;
    return true;
}
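For reference, a scalar sketch of what PREMUL does per pixel, assuming the usual Qt (c*a + rounding)/255 semantics that BYTE_MUL_SSE2 vectorizes (a hedged reconstruction; the authoritative macro lives in Qt's qdrawhelper headers):

static inline unsigned premul_scalar(unsigned p)
{
    unsigned a = p >> 24;
    unsigned t = (p & 0x00ff00ff) * a;   /* scale R and B with one multiply */
    t = ((t + ((t >> 8) & 0x00ff00ff) + 0x00800080) >> 8) & 0x00ff00ff;
    unsigned g = ((p >> 8) & 0xff) * a;  /* scale G */
    g = ((g + (g >> 8) + 0x80) >> 8) & 0xff;
    return (a << 24) | (g << 8) | t;     /* alpha channel is preserved */
}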
Example #3
const char *ssechr(const char *s, char ch)
{
    __m128i zero = _mm_setzero_si128();
    __m128i cx16 = _mm_set1_epi8(ch); // (ch) replicated 16 times.
    while (1) {
        __m128i  x = _mm_loadu_si128((__m128i const *)s);
        unsigned u = _mm_movemask_epi8(_mm_cmpeq_epi8(zero, x));
        unsigned v = _mm_movemask_epi8(_mm_cmpeq_epi8(cx16, x))
                & ~u & (u - 1);
        if (v) return s + __builtin_ctz(v); // zero-based index of the first match
        if (u) return  NULL;
        s += 16;
    }
}
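A throwaway harness (mine, not part of the original) to sanity-check ssechr against strchr. The buffer is oversized because ssechr reads 16 bytes at a time and may read past the terminator:

#include <stdio.h>
#include <string.h>

int main(void)
{
    char buf[64] = "find the first x in this string";
    printf("ssechr: %s\n",
           ssechr(buf, 'x') == strchr(buf, 'x') ? "ok" : "MISMATCH");
    return 0;
}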
Example #4
static int VectorMismatch_SSE2(const uint32_t* const array1,
                               const uint32_t* const array2, int length) {
  int match_len;

  if (length >= 12) {
    __m128i A0 = _mm_loadu_si128((const __m128i*)&array1[0]);
    __m128i A1 = _mm_loadu_si128((const __m128i*)&array2[0]);
    match_len = 0;
    do {
      // Loop unrolling and early load both provide a speedup of 10% for the
      // current function. Also, max_limit can be MAX_LENGTH=4096 at most.
      const __m128i cmpA = _mm_cmpeq_epi32(A0, A1);
      const __m128i B0 =
          _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
      const __m128i B1 =
          _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
      if (_mm_movemask_epi8(cmpA) != 0xffff) break;
      match_len += 4;

      {
        const __m128i cmpB = _mm_cmpeq_epi32(B0, B1);
        A0 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
        A1 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
        if (_mm_movemask_epi8(cmpB) != 0xffff) break;
        match_len += 4;
      }
    } while (match_len + 12 < length);
  } else {
    match_len = 0;
    // Unroll the potential first two loops.
    if (length >= 4 &&
        _mm_movemask_epi8(_mm_cmpeq_epi32(
            _mm_loadu_si128((const __m128i*)&array1[0]),
            _mm_loadu_si128((const __m128i*)&array2[0]))) == 0xffff) {
      match_len = 4;
      if (length >= 8 &&
          _mm_movemask_epi8(_mm_cmpeq_epi32(
              _mm_loadu_si128((const __m128i*)&array1[4]),
              _mm_loadu_si128((const __m128i*)&array2[4]))) == 0xffff) {
        match_len = 8;
      }
    }
  }

  while (match_len < length && array1[match_len] == array2[match_len]) {
    ++match_len;
  }
  return match_len;
}
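A scalar reference of the same contract (a sketch of mine, for comparison): the length of the common prefix of the two arrays.

static int VectorMismatch_C(const uint32_t* const array1,
                            const uint32_t* const array2, int length) {
  int match_len = 0;
  while (match_len < length && array1[match_len] == array2[match_len]) {
    ++match_len;
  }
  return match_len;
}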
Example #5
/*__forceinline*/ bool Cmp_ClutBuffer_GSMem<u32>(u32* GSmem, u32 csa, u32 clutsize)
{
    u64* _GSmem = (u64*) GSmem;
    u64* clut = (u64*)GetClutBufferAddress<u32>(csa);

    while(clutsize > 0) {
#ifdef ZEROGS_SSE2
        // Note: local memory data is swizzled
        __m128i GSmem_0 = _mm_load_si128((__m128i*)_GSmem);   // 9  8  1 0
        __m128i GSmem_1 = _mm_load_si128((__m128i*)_GSmem+1); // 11 10 3 2
        __m128i GSmem_2 = _mm_load_si128((__m128i*)_GSmem+2); // 13 12 5 4
        __m128i GSmem_3 = _mm_load_si128((__m128i*)_GSmem+3); // 15 14 7 6

        __m128i clut_0 = _mm_load_si128((__m128i*)clut);
        __m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
        __m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
        __m128i clut_3 = _mm_load_si128((__m128i*)clut+3);

        __m128i result = _mm_cmpeq_epi32(_mm_unpacklo_epi64(GSmem_0, GSmem_1), clut_0);

        __m128i result_tmp = _mm_cmpeq_epi32(_mm_unpacklo_epi64(GSmem_2, GSmem_3), clut_1);
        result = _mm_and_si128(result, result_tmp);

        result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(GSmem_0, GSmem_1), clut_2);
        result = _mm_and_si128(result, result_tmp);

        result_tmp = _mm_cmpeq_epi32(_mm_unpackhi_epi64(GSmem_2, GSmem_3), clut_3);
        result = _mm_and_si128(result, result_tmp);

        u32 result_int = _mm_movemask_epi8(result);
        if (result_int != 0xFFFF)
            return true;
#else
        // I see no point in keeping an MMX version. The SSE2 version is probably faster.
        // Keep a slow portable C version for reference/debug.
        // Note: local memory data is swizzled
        if (clut[0] != _GSmem[0] || clut[1] != _GSmem[2] || clut[2] != _GSmem[4] || clut[3] != _GSmem[6]
                || clut[4] != _GSmem[1] || clut[5] != _GSmem[3] || clut[6] != _GSmem[5] || clut[7] != _GSmem[7])
            return true;
#endif

        // go to the next memory block
        _GSmem += 32;

        // go back to the previous memory block then down one memory column
        if (clutsize & 0x40) {
            _GSmem -= (64-8);
        }
        // In case the previous operation (down one column) crossed the block boundary,
        // Go to the next block
        if (clutsize == 0x240) {
            _GSmem += 32;
        }

        clut += 8;
        clutsize -= 64;
    }

    return false;
}
Example #6
bool CPathUtils::ContainsEscapedChars(const char * psz, size_t length)
{
    // most of our strings will be tens of bytes long
    // -> afford some minor overhead to handle the main part very fast

    const char* end = psz + length;
    if (sse2supported)
    {
        __m128i mask = _mm_set1_epi8 ('%');

        for (; psz + sizeof (mask) <= end; psz += sizeof (mask))
        {
            // fetch the next 16 bytes from the source

            __m128i chunk = _mm_loadu_si128 ((const __m128i*)psz);

            // check for the escape character '%'

            int flags = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, mask));
            if (flags != 0)
                return true;
        }
    }

    // handle the odd bytes at the end of the string

    for (; psz < end; ++psz)
        if (*psz == '%')
            return true;

    return false;
}
Example #7
/* Not so fussy on its alignment */
static void
threshold_16_SSE_unaligned(byte *contone_ptr, byte *thresh_ptr, byte *ht_data)
{
    __m128i input1;
    __m128i input2;
    int result_int;
    byte *sse_data;
    const unsigned int mask1 = 0x80808080;
    __m128i sign_fix = _mm_set_epi32(mask1, mask1, mask1, mask1);

    sse_data = (byte*) &(result_int);
    /* Load */
    input1 = _mm_loadu_si128((const __m128i *)contone_ptr);
    input2 = _mm_loadu_si128((const __m128i *) thresh_ptr);
    /* Unsigned subtraction does unsigned saturation, so we
       bias the inputs to signed and use the signed operation */
    input1 = _mm_xor_si128(input1, sign_fix);
    input2 = _mm_xor_si128(input2, sign_fix);
    /* Subtract the two */
    input2 = _mm_subs_epi8(input1, input2);
    /* Grab the sign mask */
    result_int = _mm_movemask_epi8(input2);
    /* bit-wise reversal of each byte of the 16 bit mask */
    ht_data[0] = bitreverse[sse_data[0]];
    ht_data[1] = bitreverse[sse_data[1]];
}
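bitreverse[] is assumed here to be a 256-entry byte bit-reversal table: _mm_movemask_epi8 packs the sign of byte 0 into bit 0, while the halftone output wants the leftmost pixel in the most significant bit. A sketch of building such a table (hedged; Ghostscript's actual table may be defined elsewhere):

static unsigned char bitreverse[256];

static void init_bitreverse(void)
{
    int i, b;
    for (i = 0; i < 256; i++) {
        unsigned char r = 0;
        for (b = 0; b < 8; b++)
            if (i & (1 << b))
                r |= (unsigned char)(0x80 >> b);  /* bit b maps to bit 7-b */
        bitreverse[i] = r;
    }
}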
Example #8
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_sse2(void* in, void* out, const size_t size,
					 const size_t elem_size) {
    /*  With a bit of care, this could be written such that it is */
    /*  in_buf = out_buf safe. */
    char* in_b = (char*) in;
    uint16_t* out_ui16 = (uint16_t*) out;

    size_t nbyte = elem_size * size;

    __m128i xmm;
    int32_t bt;
    size_t ii, jj, kk;
    size_t ind;

    CHECK_MULT_EIGHT(size);

    if (elem_size % 2) {
        bshuf_shuffle_bit_eightelem_scal(in, out, size, elem_size);
    } else {
        for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
                ii += 8 * elem_size) {
            for (jj = 0; jj + 15 < 8 * elem_size; jj += 16) {
                xmm = _mm_loadu_si128((__m128i *) &in_b[ii + jj]);
                for (kk = 0; kk < 8; kk++) {
                    bt = _mm_movemask_epi8(xmm);
                    xmm = _mm_slli_epi16(xmm, 1);
                    ind = (ii + jj / 8 + (7 - kk) * elem_size);
                    out_ui16[ind / 2] = bt;
                }
            }
        }
    }
    return size * elem_size;
}
Example #9
static INLINE unsigned
build_mask_linear(int c, int dcdx, int dcdy)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   /* pack pairs of results into epi16
    */
   __m128i cstep01 = _mm_packs_epi32(cstep0, cstep1);
   __m128i cstep23 = _mm_packs_epi32(cstep2, cstep3);

   /* pack into epi8, preserving sign bits
    */
   __m128i result = _mm_packs_epi16(cstep01, cstep23);

   /* extract sign bits to create mask
    */
   return _mm_movemask_epi8(result);
}
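The two saturating packs preserve each 32-bit value's sign, so the final movemask yields one coverage bit per pixel in c-step order. A standalone demonstration of that property (a sketch of mine; compile with -msse2):

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i c0  = _mm_setr_epi32(-5, -1, 3, 7);          /* signs 1,1,0,0 */
    __m128i c1  = _mm_setr_epi32(70000, -70000, 0, -1);  /* saturates; signs 0,1,0,1 */
    __m128i p01 = _mm_packs_epi32(c0, c1);
    __m128i r   = _mm_packs_epi16(p01, p01);
    printf("mask = 0x%02x\n", _mm_movemask_epi8(r) & 0xff); /* prints 0xa3 */
    return 0;
}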
Example #10
int norx_aead_decrypt(
  unsigned char *m, size_t *mlen,
  const unsigned char *a, size_t alen,
  const unsigned char *c, size_t clen,
  const unsigned char *z, size_t zlen,
  const unsigned char *nonce,
  const unsigned char *key
)
{
    const __m128i K = LOADU(key);
    __m128i S[4];

    if (clen < BYTES(NORX_T)) { return -1; }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(S, nonce, K);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    DECRYPT_DATA(S, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S, K);

    /* Verify tag */
    S[3] = _mm_cmpeq_epi8(S[3], LOADU(c + clen - BYTES(NORX_T)));
    return (((_mm_movemask_epi8(S[3]) & 0xFFFFU) + 1) >> 16) - 1;
}
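The return expression is a branchless verdict: the movemask is 0xFFFF exactly when all 16 tag bytes matched, and ((mask + 1) >> 16) - 1 maps that to 0 and everything else to -1 with no secret-dependent branch. The arithmetic in isolation (an illustration of mine, not NORX code):

#include <stdio.h>

static int verdict(unsigned mask) /* mask = _mm_movemask_epi8(...) */
{
    return (int)(((mask & 0xFFFFU) + 1) >> 16) - 1;
}

int main(void)
{
    printf("%d %d %d\n", verdict(0xFFFF), verdict(0xFFFE), verdict(0)); /* 0 -1 -1 */
    return 0;
}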
Example #11
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_sse2(void* in, void* out, const size_t size,
				  const size_t elem_size) {

    char* in_b = (char*) in;
    char* out_b = (char*) out;
    uint16_t* out_ui16;
    int64_t count;
    size_t nbyte = elem_size * size;
    __m128i xmm;
    int32_t bt;
    size_t ii, kk;

    CHECK_MULT_EIGHT(nbyte);

    for (ii = 0; ii + 15 < nbyte; ii += 16) {
        xmm = _mm_loadu_si128((__m128i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            bt = _mm_movemask_epi8(xmm);
            xmm = _mm_slli_epi16(xmm, 1);
            out_ui16 = (uint16_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
            *out_ui16 = bt;
        }
    }
    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
            nbyte - nbyte % 16);
    return count;
}
Example #12
void
mandel_sse2(unsigned char *image, const struct spec *s)
{
    __m128 xmin = _mm_set_ps1(s->xlim[0]);
    __m128 ymin = _mm_set_ps1(s->ylim[0]);
    __m128 xscale = _mm_set_ps1((s->xlim[1] - s->xlim[0]) / s->width);
    __m128 yscale = _mm_set_ps1((s->ylim[1] - s->ylim[0]) / s->height);
    __m128 threshold = _mm_set_ps1(4);
    __m128 one = _mm_set_ps1(1);
    __m128i zero = _mm_setzero_si128();
    __m128 iter_scale = _mm_set_ps1(1.0f / s->iterations);
    __m128 depth_scale = _mm_set_ps1(s->depth - 1);

    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 4) {
            __m128 mx = _mm_set_ps(x + 3, x + 2, x + 1, x + 0);
            __m128 my = _mm_set_ps1(y);
            __m128 cr = _mm_add_ps(_mm_mul_ps(mx, xscale), xmin);
            __m128 ci = _mm_add_ps(_mm_mul_ps(my, yscale), ymin);
            __m128 zr = cr;
            __m128 zi = ci;
            int k = 1;
            __m128 mk = _mm_set_ps1(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m128 zr2 = _mm_mul_ps(zr, zr);
                __m128 zi2 = _mm_mul_ps(zi, zi);
                __m128 zrzi = _mm_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm_add_ps(_mm_sub_ps(zr2, zi2), cr);
                zi = _mm_add_ps(_mm_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm_mul_ps(zr, zr);
                zi2 = _mm_mul_ps(zi, zi);
                __m128 mag2 = _mm_add_ps(zr2, zi2);
                __m128 mask = _mm_cmplt_ps(mag2, threshold);
                mk = _mm_add_ps(_mm_and_ps(mask, one), mk);

                /* Early bailout? */
                __m128i maski = _mm_castps_si128(mask);
                if (0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(maski, zero)))
                    break;
            }
            mk = _mm_mul_ps(mk, iter_scale);
            mk = _mm_sqrt_ps(mk);
            mk = _mm_mul_ps(mk, depth_scale);
            __m128i pixels = _mm_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 4; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
Example #13
size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i zeros  = _mm_setzero_si128();
    const __m128i prefix = sse::load(needle);
    const __m128i suffix = sse::load(needle + 4);
    const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4);

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data   = sse::load(s + i);
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);

        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask)/2;

            const __m128i str = sse::load(s + i + bitpos + 4);
            const __m128i cmp = _mm_cmpeq_epi8(str, suffix);

            if (_mm_testc_si128(cmp, suff_mask)) {

                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
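The core of the search is _mm_mpsadbw_epu8, which computes eight sums of absolute differences between the needle's 4-byte prefix and the eight 4-byte windows at offsets 0..7 of the data; a zero SAD is an exact prefix match. A self-contained demonstration (a sketch of mine; SSE4.1, compile with -msse4.1):

#include <smmintrin.h>
#include <stdio.h>

int main(void)
{
    const char s[] = "xxabcdyyyyyyyyyy";
    const __m128i data   = _mm_loadu_si128((const __m128i*)s);
    const __m128i prefix = _mm_setr_epi8('a','b','c','d',0,0,0,0,0,0,0,0,0,0,0,0);
    const __m128i sads   = _mm_mpsadbw_epu8(data, prefix, 0); /* 8 x 16-bit SADs */
    const __m128i cmp    = _mm_cmpeq_epi16(sads, _mm_setzero_si128());
    unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;  /* one bit per 16-bit SAD */
    printf("prefix found at offset %d\n", __builtin_ctz(mask) / 2); /* prints 2 */
    return 0;
}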
Example #14
void WriteBufferValidUTF8::nextImpl()
{
    char * p = memory.data();
    char * valid_start = p;

    while (p < pos)
    {
#ifdef __SSE2__
        /// Fast skip of ASCII
        static constexpr size_t SIMD_BYTES = 16;
        const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;

        while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))))
            p += SIMD_BYTES;

        if (!(p < pos))
            break;
#endif

        size_t len = length_of_utf8_sequence[static_cast<unsigned char>(*p)];

        if (len > 4)
        {
            /// Invalid start of sequence. Skip one byte.
            putValid(valid_start, p - valid_start);
            putReplacement();
            ++p;
            valid_start = p;
        }
        else if (p + len > pos)
        {
            /// Sequence was not fully written to this buffer.
            break;
        }
        else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char *>(p), len))
        {
            /// Valid sequence.
            p += len;
        }
        else
        {
            /// Invalid sequence. Skip just first byte.
            putValid(valid_start, p - valid_start);
            putReplacement();
            ++p;
            valid_start = p;
        }
    }

    putValid(valid_start, p - valid_start);

    size_t cnt = pos - p;

    /// Shift unfinished sequence to start of buffer.
    for (size_t i = 0; i < cnt; ++i)
        memory[i] = p[i];

    working_buffer = Buffer(&memory[cnt], memory.data() + memory.size());
}
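The SIMD loop above skips 16 bytes at a time whenever no byte has its high bit set. The same ASCII fast path as a standalone helper (names are mine, not ClickHouse's):

#include <emmintrin.h>

/* Advance p past leading pure-ASCII bytes; 16 at a time, then byte-wise. */
static const char * skip_ascii_sse2(const char * p, const char * end)
{
    while (p + 16 <= end
        && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))))
        p += 16;
    while (p < end && !((unsigned char)*p & 0x80))
        ++p;
    return p;
}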
Example #15
int test_mm_movemask_epi8(__m128i A) {
  // DAG-LABEL: test_mm_movemask_epi8
  // DAG: call i32 @llvm.x86.sse2.pmovmskb.128(<16 x i8> %{{.*}})
  //
  // ASM-LABEL: test_mm_movemask_epi8
  // ASM: pmovmskb
  return _mm_movemask_epi8(A);
}
Example #16
void WriteBufferValidUTF8::nextImpl()
{
	char *p = &memory[0];
	char *valid_start = p;

	while (p < pos)
	{
#ifdef __x86_64__
		/// Fast skip of ASCII
		static constexpr size_t SIMD_BYTES = 16;
		const char * simd_end = p + (pos - p) / SIMD_BYTES * SIMD_BYTES;

		while (p < simd_end && !_mm_movemask_epi8(_mm_loadu_si128(reinterpret_cast<const __m128i*>(p))))
			p += SIMD_BYTES;

		if (!(p < pos))
			break;
#endif

		size_t len = 1 + static_cast<size_t>(trailingBytesForUTF8[static_cast<unsigned char>(*p)]);

		if (len > 4)
		{
			/// Invalid start of a sequence. Skip one byte.
			putValid(valid_start, p - valid_start);
			putReplacement();
			++p;
			valid_start = p;
		}
		else if (p + len > pos)
		{
			/// The sequence is not yet fully written.
			break;
		}
		else if (Poco::UTF8Encoding::isLegal(reinterpret_cast<unsigned char*>(p), len))
		{
			/// Valid sequence.
			p += len;
		}
		else
		{
			/// Invalid sequence. Skip just the first byte.
			putValid(valid_start, p - valid_start);
			putReplacement();
			++p;
			valid_start = p;
		}
	}

	putValid(valid_start, p - valid_start);

	size_t cnt = pos - p;
	/// Shift the unfinished sequence to the start of the buffer.
	for (size_t i = 0; i < cnt; ++i)
		memory[i] = p[i];

	working_buffer = Buffer(&memory[cnt], &memory[0] + memory.size());
}
Example #17
static INLINE void
build_masks(int c, 
	    int cdiff,
	    int dcdx,
	    int dcdy,
	    unsigned *outmask,
	    unsigned *partmask)
{
   __m128i cstep0 = _mm_setr_epi32(c, c+dcdx, c+dcdx*2, c+dcdx*3);
   __m128i xdcdy = _mm_set1_epi32(dcdy);

   /* Get values across the quad
    */
   __m128i cstep1 = _mm_add_epi32(cstep0, xdcdy);
   __m128i cstep2 = _mm_add_epi32(cstep1, xdcdy);
   __m128i cstep3 = _mm_add_epi32(cstep2, xdcdy);

   {
      __m128i cstep01, cstep23, result;

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *outmask |= _mm_movemask_epi8(result);
   }


   {
      __m128i cio4 = _mm_set1_epi32(cdiff);
      __m128i cstep01, cstep23, result;

      cstep0 = _mm_add_epi32(cstep0, cio4);
      cstep1 = _mm_add_epi32(cstep1, cio4);
      cstep2 = _mm_add_epi32(cstep2, cio4);
      cstep3 = _mm_add_epi32(cstep3, cio4);

      cstep01 = _mm_packs_epi32(cstep0, cstep1);
      cstep23 = _mm_packs_epi32(cstep2, cstep3);
      result = _mm_packs_epi16(cstep01, cstep23);

      *partmask |= _mm_movemask_epi8(result);
   }
}
Example #18
int64_t av1_highbd_block_error_sse2(tran_low_t *coeff, tran_low_t *dqcoeff,
                                     intptr_t block_size, int64_t *ssz,
                                     int bps) {
  int i, j, test;
  uint32_t temp[4];
  __m128i max, min, cmp0, cmp1, cmp2, cmp3;
  int64_t error = 0, sqcoeff = 0;
  const int shift = 2 * (bps - 8);
  const int rounding = shift > 0 ? 1 << (shift - 1) : 0;

  for (i = 0; i < block_size; i += 8) {
    // Load the data into xmm registers
    __m128i mm_coeff = _mm_load_si128((__m128i *)(coeff + i));
    __m128i mm_coeff2 = _mm_load_si128((__m128i *)(coeff + i + 4));
    __m128i mm_dqcoeff = _mm_load_si128((__m128i *)(dqcoeff + i));
    __m128i mm_dqcoeff2 = _mm_load_si128((__m128i *)(dqcoeff + i + 4));
    // Check if any values require more than 15 bits
    max = _mm_set1_epi32(0x3fff);
    min = _mm_set1_epi32(0xffffc000);
    cmp0 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff, max),
                         _mm_cmplt_epi32(mm_coeff, min));
    cmp1 = _mm_xor_si128(_mm_cmpgt_epi32(mm_coeff2, max),
                         _mm_cmplt_epi32(mm_coeff2, min));
    cmp2 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff, max),
                         _mm_cmplt_epi32(mm_dqcoeff, min));
    cmp3 = _mm_xor_si128(_mm_cmpgt_epi32(mm_dqcoeff2, max),
                         _mm_cmplt_epi32(mm_dqcoeff2, min));
    test = _mm_movemask_epi8(
        _mm_or_si128(_mm_or_si128(cmp0, cmp1), _mm_or_si128(cmp2, cmp3)));

    if (!test) {
      __m128i mm_diff, error_sse2, sqcoeff_sse2;
      mm_coeff = _mm_packs_epi32(mm_coeff, mm_coeff2);
      mm_dqcoeff = _mm_packs_epi32(mm_dqcoeff, mm_dqcoeff2);
      mm_diff = _mm_sub_epi16(mm_coeff, mm_dqcoeff);
      error_sse2 = _mm_madd_epi16(mm_diff, mm_diff);
      sqcoeff_sse2 = _mm_madd_epi16(mm_coeff, mm_coeff);
      _mm_storeu_si128((__m128i *)temp, error_sse2);
      error = error + temp[0] + temp[1] + temp[2] + temp[3];
      _mm_storeu_si128((__m128i *)temp, sqcoeff_sse2);
      sqcoeff += temp[0] + temp[1] + temp[2] + temp[3];
    } else {
      for (j = 0; j < 8; j++) {
        const int64_t diff = coeff[i + j] - dqcoeff[i + j];
        error += diff * diff;
        sqcoeff += (int64_t)coeff[i + j] * (int64_t)coeff[i + j];
      }
    }
  }
  assert(error >= 0 && sqcoeff >= 0);
  error = (error + rounding) >> shift;
  sqcoeff = (sqcoeff + rounding) >> shift;

  *ssz = sqcoeff;
  return error;
}
Example #19
void* memrchr(void *dst, int c, size_t len) {
	/* Backwards */
	uint8_t* a = dst;

	if(!len)
		return NULL;

	size_t i = len;
	uintptr_t misaligned;

	misaligned = ((uintptr_t)a & (sizeof(__m128i) - 1));
	/* scan the tail byte-wise until &a[i] is 16-byte aligned */
	if(misaligned) {
		while(i && ((uintptr_t) &a[i] & ( sizeof(__m128i)-1))) {
			i--;
			if(a[i] == (uint8_t)c) { /* compare as unsigned, matching the SSE path */
				return a + i;
			}
		}
	}

	if(i >= 16) {
		uint32_t buf_32 = c;
		buf_32 |= (buf_32 << 8);
		buf_32 |= (buf_32 << 16);

		__m128i r1 = _mm_set_epi32(buf_32, buf_32, buf_32, buf_32);
		
		while(i >= 16) {
			i -= 16;
			__m128i x = _mm_loadu_si128((__m128i*)&(a[i])); //16byte
			__m128i cmp = _mm_cmpeq_epi8(x, r1);

			uint16_t result = (uint16_t)_mm_movemask_epi8(cmp);

			if(result != 0x0000U) {
				i += 15;
				while(!(result & 0x8000)) {
					result = result << 1;
					i--;
				}

				return a + i;
			}
		}
	}

	while(i) {
		i--;
		if(a[i] == (uint8_t)c) {
			return a + i;
		}
	}

	return NULL;
}
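A quick harness for the function above (mine, not part of the original). No <string.h> is included, so the definition does not clash with the glibc prototype of the same name:

#include <assert.h>

int main(void)
{
    char buf[32] = "abcXdefXghi";               /* zero-padded tail */
    assert(memrchr(buf, 'X', 11) == buf + 7);   /* finds the last 'X' */
    assert(memrchr(buf, 'Q', 11) == (void *)0); /* absent byte */
    return 0;
}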
Example #20
int SSEBinSearchBlock::search(uint32_t key) const {

    const __m128i keys = _mm_set1_epi32(key);
    __m128i v;

    int limit = data.size() - 1;
    int a = 0;
    int b = limit;

    while (a <= b) {
        const int c = (a + b)/2;

        if (data[c] == key) {
            return c;
        }

        if (key < data[c]) {
            b = c - 1;

            if (b >= 4) {
                v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&data[b - 4]));
                v = _mm_cmpeq_epi32(v, keys);
                const uint16_t mask = _mm_movemask_epi8(v);
                if (mask) {
                    return b - 4 + __builtin_ctz(mask)/4;
                }
            }
        } else {
            a = c + 1;

            if (a + 4 < limit) {
                v = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&data[a]));
                v = _mm_cmpeq_epi32(v, keys);
                const uint16_t mask = _mm_movemask_epi8(v);
                if (mask) {
                    return a + __builtin_ctz(mask)/4;
                }
            }
        }
    }

    return -1;
}
Example #21
	virtual size_t match(const char* data, size_t size)
	{
		__m128i firstLetter = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->firstLetter));
		__m128i patternData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternData));
		__m128i patternMask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternMask));

		size_t offset = firstLetterPos;

		while (offset + 32 <= size)
		{
			__m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset));
			unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(value, firstLetter));

			// advance offset regardless of match results to reduce number of live values
			offset += 16;

			while (mask != 0)
			{
				unsigned int pos = re2::countTrailingZeros(mask);
				size_t dataOffset = offset - 16 + pos - firstLetterOffset;

				mask &= ~(1 << pos);

				// check if we have a match
				__m128i patternMatch = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + dataOffset));
				__m128i matchMask = _mm_or_si128(patternMask, _mm_cmpeq_epi8(patternMatch, patternData));

				if (_mm_movemask_epi8(matchMask) == 0xffff)
				{
					size_t matchOffset = dataOffset + firstLetterOffset - firstLetterPos;

					// final check for full pattern
					if (matchOffset + pattern.size() < size && memcmp(data + matchOffset, pattern.c_str(), pattern.size()) == 0)
					{
						return matchOffset;
					}
				}
			}
		}

		return findMatch(pattern.c_str(), pattern.size(), data, size, offset - firstLetterPos);
	}
Example #22
inline void matrix16x8::transpose(square128& output, int x, int y)
{
    for (int j = 0; j < 8; j++)
    {
        int row = _mm_movemask_epi8(whole);
        whole = _mm_slli_epi64(whole, 1);

        // _mm_movemask_epi8 uses most significant bit, hence +7-j
        output.doublebytes[8*x+7-j][y] = row;
    }
}
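The shift trick works because bits crossing byte boundaries inside a 64-bit lane enter at bit 0 and need eight shifts to reach a byte's MSB: after j shifts, the MSB of each byte is still that byte's original bit 7-j, so eight movemasks peel off the eight bit-planes. A standalone demonstration (a sketch of mine):

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i whole = _mm_set1_epi8((char)0x81); /* bits 7 and 0 set in every byte */
    int j;
    for (j = 0; j < 8; j++) {
        int row = _mm_movemask_epi8(whole);    /* MSB of each of the 16 bytes */
        whole = _mm_slli_epi64(whole, 1);
        printf("bit %d -> 0x%04x\n", 7 - j, row); /* 0xffff, six zeros, 0xffff */
    }
    return 0;
}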
Example #23
int valuesOK(cl_float8* to, cl_float8* from, size_t length) {
#ifdef DEBUG
    printf("Checking data of size: %lu\n", length);
#endif
    for(size_t i = 0; i < length; ++i) {
#ifdef __SSE__
        __cl_float4 __hostFirstValue = to[i].v4[0];   // element i of each buffer
        __cl_float4 __hostSecondValue = to[i].v4[1];
        __cl_float4 __deviceFirstValue = from[i].v4[0];
        __cl_float4 __deviceSecondValue = from[i].v4[1];
        __m128i vcmp = _mm_castps_si128(_mm_cmpneq_ps(__hostFirstValue, __deviceFirstValue));
        uint16_t test = _mm_movemask_epi8(vcmp);
        __m128i vcmp_2 = _mm_castps_si128(_mm_cmpneq_ps(__hostSecondValue, __deviceSecondValue));
        uint16_t test_2 = _mm_movemask_epi8(vcmp_2);
        if( (test|test_2) != 0 ) return 0; // some lane differed
#else
        #error "SSE not supported, which is required for example code to work!"  
#endif
    }
  return 1;
}
Example #24
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
                         int width, int height,
                         uint8_t* dst, int dst_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    __m128i* out = (__m128i*)dst;
    for (i = 0; i < limit; i += 8) {
      // load 8 alpha bytes
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      // load 8 dst pixels (32 bytes)
      const __m128i b0_lo = _mm_loadu_si128(out + 0);
      const __m128i b0_hi = _mm_loadu_si128(out + 1);
      // mask dst alpha values
      const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
      const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
      // combine
      const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
      const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
      // store
      _mm_storeu_si128(out + 0, b2_lo);
      _mm_storeu_si128(out + 1, b2_hi);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, a0);
      out += 2;
    }
    for (; i < width; ++i) {
      const uint32_t alpha_value = alpha[i];
      dst[4 * i] = alpha_value;
      alpha_and &= alpha_value;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  // Combine the eight alpha 'and' into a 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and != 0xff);
}
Example #25
static void SetResidualCoeffsSSE2(const int16_t* const coeffs,
                                  VP8Residual* const res) {
  const __m128i c0 = _mm_loadu_si128((const __m128i*)coeffs);
  const __m128i c1 = _mm_loadu_si128((const __m128i*)(coeffs + 8));
  // Use SSE to compare 8 values with a single instruction.
  const __m128i zero = _mm_setzero_si128();
  const __m128i m0 = _mm_cmpeq_epi16(c0, zero);
  const __m128i m1 = _mm_cmpeq_epi16(c1, zero);
  // Get the comparison results as a bitmask, consisting of two times 16 bits:
  // two identical bits for each result. Concatenate both bitmasks to get a
  // single 32 bit value. Negate the mask to get the position of entries that
  // are not equal to zero. We don't need to mask out least significant bits
  // according to res->first, since coeffs[0] is 0 if res->first > 0
  const uint32_t mask =
      ~(((uint32_t)_mm_movemask_epi8(m1) << 16) | _mm_movemask_epi8(m0));
  // The position of the most significant non-zero bit indicates the position of
  // the last non-zero value. Divide the result by two because _mm_movemask_epi8
  // operates on 8 bit values instead of 16 bit values.
  assert(res->first == 0 || coeffs[0] == 0);
  res->last = mask ? (BitsLog2Floor(mask) >> 1) : -1;
  res->coeffs = coeffs;
}
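BitsLog2Floor is assumed to be "position of the most significant set bit"; the usual GCC/Clang formulation is sketched below (libwebp ships its own portable definition, so treat this as illustrative):

#include <stdint.h>

static int BitsLog2Floor(uint32_t n) {
  return 31 ^ __builtin_clz(n);  /* undefined for n == 0; guarded by `mask ?` above */
}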
Example #26
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
  int i;
  double retval = 0.;
  int sumX, sumXY;
  int32_t tmp[4];
  __m128i zero = _mm_setzero_si128();
  // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
  __m128i sumXY_128 = zero;
  __m128i sumX_128 = zero;

  for (i = 0; i < 256; i += 4) {
    const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
    const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));

    // Check if any X is non-zero: this actually provides a speedup as X is
    // usually sparse.
    if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
      const __m128i xy_128 = _mm_add_epi32(x, y);
      sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);

      sumX_128 = _mm_add_epi32(sumX_128, x);

      // Analyze the different X + Y.
      _mm_storeu_si128((__m128i*)tmp, xy_128);

      ANALYZE_XY(0);
      ANALYZE_XY(1);
      ANALYZE_XY(2);
      ANALYZE_XY(3);
    } else {
      // X is fully 0, so only deal with Y.
      sumXY_128 = _mm_add_epi32(sumXY_128, y);

      ANALYZE_X_OR_Y(Y, 0);
      ANALYZE_X_OR_Y(Y, 1);
      ANALYZE_X_OR_Y(Y, 2);
      ANALYZE_X_OR_Y(Y, 3);
    }
  }

  // Sum up sumX_128 to get sumX.
  _mm_storeu_si128((__m128i*)tmp, sumX_128);
  sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];

  // Sum up sumXY_128 to get sumXY.
  _mm_storeu_si128((__m128i*)tmp, sumXY_128);
  sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];

  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
  return (float)retval;
}
Example #27
	virtual size_t match(const char* data, size_t size)
	{
		__m128i firstLetter = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->firstLetter));
		__m128i patternData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternData));
		__m128i patternMask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternMask));

		size_t offset = firstLetterPos;

		while (offset + 32 <= size)
		{
			__m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset));
			int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(value, firstLetter));

			if (mask == 0)
				offset += 16;
			else
			{
				offset += re2::countTrailingZeros(mask);

				// check if we have a match
				__m128i patternMatch = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset - firstLetterOffset));
				__m128i matchMask = _mm_or_si128(patternMask, _mm_cmpeq_epi8(patternMatch, patternData));

				if (_mm_movemask_epi8(matchMask) == 0xffff)
				{
					// final check for full pattern
					if (memcmp(data + offset - firstLetterPos, pattern.c_str(), pattern.size()) == 0)
					{
						return offset - firstLetterPos;
					}
				}

				offset += 1;
			}
		}

		return findMatch(pattern.c_str(), pattern.size(), data, size, offset - firstLetterPos);
	}
Example #28
size_t sse4_strstr_unrolled_max36(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i zeros     = _mm_setzero_si128();
    const __m128i prefix    = sse::load(needle);
    const __m128i suffix1   = sse::load(needle + 4);
    const __m128i suffix2   = sse::load(needle + 16 + 4);
    const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4));

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data   = sse::load(s + i);
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);

        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask)/2;

            const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1);
            const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2);

            const __m128i c3 = _mm_or_si128(c2, suff_mask);
            const __m128i tmp = _mm_and_si128(c1, c3);

            if (_mm_movemask_epi8(tmp) == 0xffff) {

                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
Example #29
 bool test(const index_t & kmer) const
 {
     __m128i __attribute__ ((aligned (16))) zero = _mm_setzero_si128();
     const size_t BitsPerElement = sizeof(block_t) * 8;
     const Hash hashfunction = Hash();
     kmer_t hashvalue = kmer;
     for (int hcount = this->h; hcount > 0; hcount--)
     {
         hashvalue = hashfunction(hashvalue);
         // we expect the compiler to automatically turn this into a shift because it's a const power of two
         size_t offset = (hashvalue % this->m) / BitsPerElement;
         if (_mm_movemask_epi8(
             _mm_cmpeq_epi32(
             _mm_castps_si128(_mm_and_ps(bitarray[offset], masks[hashvalue & (BitsPerElement-1)])), zero)) != 0xFFFF) return false;
     }
     return true;
 }
Example #30
File: main.cpp Project: CCJY/coliru
int countZeroBytes_SSE(char* values, int length) {
    int zeroCount = 0;
    __m128i zero16 = _mm_set1_epi8(0);
    __m128i and16 = _mm_set1_epi8(1);
    for(int i=0; i<length; i+=16) {
        __m128i values16 = _mm_loadu_si128((__m128i*)&values[i]);
        __m128i cmp = _mm_cmpeq_epi8(values16, zero16);
        if(_mm_movemask_epi8(cmp)) {
            cmp = _mm_and_si128(and16, cmp); //change -1 values to 1
            // horizontal sum of the 16 bytes
            __m128i sum1 = _mm_sad_epu8(cmp,zero16);
            __m128i sum2 = _mm_shuffle_epi32(sum1,2);
            __m128i sum3 = _mm_add_epi16(sum1,sum2);
            zeroCount += _mm_cvtsi128_si32(sum3);
        }
    }
    return zeroCount;
}
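A simpler variant (a sketch of mine, assuming length is a multiple of 16 as above and a compiler with __builtin_popcount): the comparison mask already carries one bit per zero byte, so a popcount on the movemask replaces the horizontal-sum dance.

int countZeroBytes_SSE_popcnt(char* values, int length) {
    int zeroCount = 0;
    const __m128i zero16 = _mm_set1_epi8(0);
    for (int i = 0; i < length; i += 16) {
        __m128i values16 = _mm_loadu_si128((__m128i*)&values[i]);
        int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(values16, zero16));
        zeroCount += __builtin_popcount((unsigned)mask); // one bit per zero byte
    }
    return zeroCount;
}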