Example No. 1
int norx_aead_decrypt(
  unsigned char *m, size_t *mlen,
  const unsigned char *a, size_t alen,
  const unsigned char *c, size_t clen,
  const unsigned char *z, size_t zlen,
  const unsigned char *nonce,
  const unsigned char *key
)
{
    const __m128i K0 = LOADU(key +  0);
    const __m128i K1 = LOADU(key + 16);
    __m128i S[8];

    if (clen < BYTES(NORX_T)) { return -1; }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(S, nonce, K0, K1);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    DECRYPT_DATA(S, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S, K0, K1);

    /* Verify tag */
    S[6] = _mm_cmpeq_epi8(S[6], LOADU(c + clen - BYTES(NORX_T)  ));
    S[7] = _mm_cmpeq_epi8(S[7], LOADU(c + clen - BYTES(NORX_T)/2));
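    /* movemask(S[6] & S[7]) is 0xFFFF only when all 32 tag bytes matched;
       ((x + 1) >> 16) - 1 then maps 0xFFFF to 0 (accept) and anything else to -1. */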
    return (((_mm_movemask_epi8(AND(S[6], S[7])) & 0xFFFFUL) + 1) >> 16) - 1;
}
Example No. 2
const char *ssechr(const char *s, char ch)
{
    __m128i zero = _mm_setzero_si128();
    __m128i cx16 = _mm_set1_epi8(ch); // (ch) replicated 16 times.
    while (1) {
        __m128i  x = _mm_loadu_si128((__m128i const *)s);
        unsigned u = _mm_movemask_epi8(_mm_cmpeq_epi8(zero, x));
        unsigned v = _mm_movemask_epi8(_mm_cmpeq_epi8(cx16, x))
                & ~u & (u - 1);
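        // ~u & (u - 1) keeps only the lanes strictly before the first NUL
        // byte (all 16 lanes when the block contains no terminator).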
        if (v) return s + __builtin_ctz(v);   // ctz gives the 0-based offset of the first match
        if (u) return  NULL;
        s += 16;
    }
}
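A minimal sanity check for the snippet above (my addition, not part of the original example; it assumes an SSE2 target and that the ssechr definition above is in scope):

#include <assert.h>
#include <string.h>

int main(void)
{
    char s[16] = "hello, world";               /* padded buffer, so the 16-byte load stays in bounds */
    assert(ssechr(s, 'w') == strchr(s, 'w'));  /* found: same position as strchr */
    assert(ssechr(s, 'z') == NULL);            /* not present before the terminator */
    return 0;
}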
Example No. 3
 SIMD_INLINE bool RowHasIndex(const uint8_t * mask, size_t alignedSize, size_t fullSize, __m128i index)
 {
     for (size_t col = 0; col < alignedSize; col += A)
     {
         if(!_mm_testz_si128(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)(mask + col)), index), K_INV_ZERO))
             return true;
     }
     if(alignedSize != fullSize)
     {
         if(!_mm_testz_si128(_mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)(mask + fullSize - A)), index), K_INV_ZERO))
             return true;
     }
     return false;
 }
Example No. 4
size_t sse4_strstr_unrolled_max20(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i zeros  = _mm_setzero_si128();
    const __m128i prefix = sse::load(needle);
    const __m128i suffix = sse::load(needle + 4);
    const __m128i suff_mask = sse::mask_lower_bytes(needle_size - 4);

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data   = sse::load(s + i);
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);
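        // mpsadbw yields eight 16-bit sums of absolute differences between the
        // 4-byte prefix and eight consecutive 4-byte windows of data; a zero
        // sum marks an exact prefix match at that offset.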

        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);

        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask)/2;

            const __m128i str = sse::load(s + i + bitpos + 4);
            const __m128i cmp = _mm_cmpeq_epi8(str, suffix);

            if (_mm_testc_si128(cmp, suff_mask)) {

                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
Example No. 5
int norx_aead_decrypt(
  unsigned char *m, size_t *mlen,
  const unsigned char *a, size_t alen,
  const unsigned char *c, size_t clen,
  const unsigned char *z, size_t zlen,
  const unsigned char *nonce,
  const unsigned char *key
)
{
    __m128i S[4];

    if (clen < BYTES(NORX_T)) { return -1; }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(S, nonce, key);
    ABSORB_DATA(S, a, alen, HEADER_TAG);
    DECRYPT_DATA(S, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(S, z, zlen, TRAILER_TAG);
    FINALISE(S);

    /* Verify tag */
    S[0] = _mm_cmpeq_epi8(S[0], LOADU(c + clen - BYTES(NORX_T)));
    return (((_mm_movemask_epi8(S[0]) & 0xFFFFU) + 1) >> 16) - 1;
}
Example No. 6
bool CPathUtils::ContainsEscapedChars(const char * psz, size_t length)
{
    // most of our strings will be tens of bytes long
    // -> afford some minor overhead to handle the main part very fast

    const char* end = psz + length;
    if (sse2supported)
    {
        __m128i mask = _mm_set_epi8 ( '%', '%', '%', '%', '%', '%', '%', '%'
            , '%', '%', '%', '%', '%', '%', '%', '%');

        for (; psz + sizeof (mask) <= end; psz += sizeof (mask))
        {
            // fetch the next 16 bytes from the source

            __m128i chunk = _mm_loadu_si128 ((const __m128i*)psz);

            // check for the escape character '%'

            int flags = _mm_movemask_epi8 (_mm_cmpeq_epi8 (chunk, mask));
            if (flags != 0)
                return true;
        };
    }

    // handle the odd bytes at the end of the string

    for (; psz < end; ++psz)
        if (*psz == '%')
            return true;

    return false;
}
Example No. 7
void
mandel_sse2(unsigned char *image, const struct spec *s)
{
    __m128 xmin = _mm_set_ps1(s->xlim[0]);
    __m128 ymin = _mm_set_ps1(s->ylim[0]);
    __m128 xscale = _mm_set_ps1((s->xlim[1] - s->xlim[0]) / s->width);
    __m128 yscale = _mm_set_ps1((s->ylim[1] - s->ylim[0]) / s->height);
    __m128 threshold = _mm_set_ps1(4);
    __m128 one = _mm_set_ps1(1);
    __m128i zero = _mm_setzero_si128();
    __m128 iter_scale = _mm_set_ps1(1.0f / s->iterations);
    __m128 depth_scale = _mm_set_ps1(s->depth - 1);

    #pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 4) {
            __m128 mx = _mm_set_ps(x + 3, x + 2, x + 1, x + 0);
            __m128 my = _mm_set_ps1(y);
            __m128 cr = _mm_add_ps(_mm_mul_ps(mx, xscale), xmin);
            __m128 ci = _mm_add_ps(_mm_mul_ps(my, yscale), ymin);
            __m128 zr = cr;
            __m128 zi = ci;
            int k = 1;
            __m128 mk = _mm_set_ps1(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m128 zr2 = _mm_mul_ps(zr, zr);
                __m128 zi2 = _mm_mul_ps(zi, zi);
                __m128 zrzi = _mm_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm_add_ps(_mm_sub_ps(zr2, zi2), cr);
                zi = _mm_add_ps(_mm_add_ps(zrzi, zrzi), ci);

                /* Increment k */
                zr2 = _mm_mul_ps(zr, zr);
                zi2 = _mm_mul_ps(zi, zi);
                __m128 mag2 = _mm_add_ps(zr2, zi2);
                __m128 mask = _mm_cmplt_ps(mag2, threshold);
                mk = _mm_add_ps(_mm_and_ps(mask, one), mk);

                /* Early bailout? */
                __m128i maski = _mm_castps_si128(mask);
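                /* mask bytes are 0x00 for lanes that have escaped; if the
                   comparison with zero is all-ones (movemask == 0xFFFF),
                   every lane has escaped and the loop can stop early. */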
                if (0xFFFF == _mm_movemask_epi8(_mm_cmpeq_epi8(maski, zero)))
                    break;
            }
            mk = _mm_mul_ps(mk, iter_scale);
            mk = _mm_sqrt_ps(mk);
            mk = _mm_mul_ps(mk, depth_scale);
            __m128i pixels = _mm_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            for (int i = 0; i < 4; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
Example No. 8
// Denoise a 16x1 vector with a weaker filter.
static INLINE __m128i vp9_denoiser_adj_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i k_0, const __m128i k_delta, __m128i acc_diff) {
  __m128i v_running_avg_y = _mm_loadu_si128((__m128i *)(&running_avg_y[0]));
  // Calculate differences.
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, k_0);
  // Clamp absolute difference to delta to get the adjustment.
  const __m128i adj = _mm_min_epu8(_mm_or_si128(pdiff, ndiff), k_delta);
  // Restore the sign and get positive and negative adjustments.
  __m128i padj, nadj;
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);
  // Calculate filtered value.
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, padj);
  v_running_avg_y = _mm_adds_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

  // Accumulate the adjustments.
  acc_diff = _mm_subs_epi8(acc_diff, padj);
  acc_diff = _mm_adds_epi8(acc_diff, nadj);
  return acc_diff;
}
Example No. 9
template<> void
copyMask_<uchar>(const uchar* _src, size_t sstep, const uchar* mask, size_t mstep, uchar* _dst, size_t dstep, Size size)
{
    for( ; size.height--; mask += mstep, _src += sstep, _dst += dstep )
    {
        const uchar* src = (const uchar*)_src;
        uchar* dst = (uchar*)_dst;
        int x = 0;
        #if CV_SSE4_2
        if( USE_SSE4_2 )
        {
            __m128i zero = _mm_setzero_si128();

            for( ; x <= size.width - 16; x += 16 )
            {
                const __m128i rSrc = _mm_lddqu_si128((const __m128i*)(src + x));
                __m128i _mask = _mm_lddqu_si128((const __m128i*)(mask + x));
                __m128i rDst = _mm_lddqu_si128((__m128i*)(dst + x));
                __m128i _negMask = _mm_cmpeq_epi8(_mask, zero);
                rDst = _mm_blendv_epi8(rSrc, rDst, _negMask);
                _mm_storeu_si128((__m128i*)(dst + x), rDst);
            }
        }
        #endif
        for( ; x < size.width; x++ )
            if( mask[x] )
                dst[x] = src[x];
    }
}
Example No. 10
__m128i test_mm_cmpeq_epi8(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_cmpeq_epi8
  // DAG: icmp eq <16 x i8>
  //
  // ASM-LABEL: test_mm_cmpeq_epi8
  // ASM: pcmpeqb
  return _mm_cmpeq_epi8(A, B);
}
Example No. 11
    SIMDValue SIMDInt8x16Operation::OpEqual(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        x86Result.m128i_value = _mm_cmpeq_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a == b?

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example No. 12
void* memrchr(void *dst, int c, size_t len) {
	/* Backwards */
	uint8_t* a = dst;

	if(!len)
		return NULL;

	size_t i = len;
	size_t aligned_a = 0;

	aligned_a = ((uintptr_t)a & (sizeof(__m128i) - 1));
	/* scan the unaligned bytes at the end of the buffer first */
	if(aligned_a) {
		while(i && ((uintptr_t) &a[i] & ( sizeof(__m128i)-1))) {
			i--;
			if(a[i] == (uint8_t)c) {
				return a + i;
			}
		}
	}

	if(i >= 16) {
		uint32_t buf_32 = c;
		buf_32 |= (buf_32 << 8);
		buf_32 |= (buf_32 << 16);

		__m128i r1 = _mm_set_epi32(buf_32, buf_32, buf_32, buf_32);
		
		while(i >= 16) {
			i -= 16;
			__m128i x = _mm_loadu_si128((__m128i*)&(a[i])); //16byte
			__m128i cmp = _mm_cmpeq_epi8(x, r1);

			uint16_t result = (uint16_t)_mm_movemask_epi8(cmp);

			if(result != 0x0000U) {
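				/* each set bit marks a matching byte; shift left until the
				   highest set bit reaches bit 15 to locate the LAST match */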
				i += 15;
				while(!(result & 0x8000)) {
					result = result << 1;
					i--;
				}

				return a + i;
			}
		}
	}

	while(i) {
		i--;
		if(a[i] == (uint8_t)c) {
			return a + i;
		}
	}

	return NULL;
}
Example No. 13
void FREAK::extractDescriptor(uchar *pointsValue, void ** ptr) const
{
    __m128i** ptrSSE = (__m128i**) ptr;

    // note that the comparison order is modified in each block (but the first 128 comparisons remain globally the same --> this does not affect the 128,384-bit segmented matching strategy)
    int cnt = 0;
    for( int n = FREAK_NB_PAIRS/128; n-- ; )
    {
        __m128i result128 = _mm_setzero_si128();
        for( int m = 128/16; m--; cnt += 16 )
        {
            __m128i operand1 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].i],
                                            pointsValue[descriptionPairs[cnt+1].i],
                                            pointsValue[descriptionPairs[cnt+2].i],
                                            pointsValue[descriptionPairs[cnt+3].i],
                                            pointsValue[descriptionPairs[cnt+4].i],
                                            pointsValue[descriptionPairs[cnt+5].i],
                                            pointsValue[descriptionPairs[cnt+6].i],
                                            pointsValue[descriptionPairs[cnt+7].i],
                                            pointsValue[descriptionPairs[cnt+8].i],
                                            pointsValue[descriptionPairs[cnt+9].i],
                                            pointsValue[descriptionPairs[cnt+10].i],
                                            pointsValue[descriptionPairs[cnt+11].i],
                                            pointsValue[descriptionPairs[cnt+12].i],
                                            pointsValue[descriptionPairs[cnt+13].i],
                                            pointsValue[descriptionPairs[cnt+14].i],
                                            pointsValue[descriptionPairs[cnt+15].i]);

            __m128i operand2 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].j],
                                            pointsValue[descriptionPairs[cnt+1].j],
                                            pointsValue[descriptionPairs[cnt+2].j],
                                            pointsValue[descriptionPairs[cnt+3].j],
                                            pointsValue[descriptionPairs[cnt+4].j],
                                            pointsValue[descriptionPairs[cnt+5].j],
                                            pointsValue[descriptionPairs[cnt+6].j],
                                            pointsValue[descriptionPairs[cnt+7].j],
                                            pointsValue[descriptionPairs[cnt+8].j],
                                            pointsValue[descriptionPairs[cnt+9].j],
                                            pointsValue[descriptionPairs[cnt+10].j],
                                            pointsValue[descriptionPairs[cnt+11].j],
                                            pointsValue[descriptionPairs[cnt+12].j],
                                            pointsValue[descriptionPairs[cnt+13].j],
                                            pointsValue[descriptionPairs[cnt+14].j],
                                            pointsValue[descriptionPairs[cnt+15].j]);

            __m128i workReg = _mm_min_epu8(operand1, operand2); // emulated "not less than" for 8-bit UNSIGNED integers
            workReg = _mm_cmpeq_epi8(workReg, operand2);        // emulated "not less than" for 8-bit UNSIGNED integers

            workReg = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), workReg); // keep one result bit per byte from these 16 comparisons and merge it into the 128-bit block
            result128 = _mm_or_si128(result128, workReg);
        }
        (**ptrSSE) = result128;
        ++(*ptrSSE);
    }
    (*ptrSSE) -= 8;
}
Example No. 14
    SIMDValue SIMDInt8x16Operation::OpGreaterThanOrEqual(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        x86Result.m128i_value = _mm_max_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value);   // max(a,b)
        x86Result.m128i_value = _mm_cmpeq_epi8(tmpaValue.m128i_value, x86Result.m128i_value); // a == max(a,b), i.e. a >= b

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example No. 15
	virtual size_t match(const char* data, size_t size)
	{
		__m128i firstLetter = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->firstLetter));
		__m128i patternData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternData));
		__m128i patternMask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternMask));

		size_t offset = firstLetterPos;

		while (offset + 32 <= size)
		{
			__m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset));
			unsigned int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(value, firstLetter));

			// advance offset regardless of match results to reduce number of live values
			offset += 16;

			while (mask != 0)
			{
				unsigned int pos = re2::countTrailingZeros(mask);
				size_t dataOffset = offset - 16 + pos - firstLetterOffset;

				mask &= ~(1 << pos);

				// check if we have a match
				__m128i patternMatch = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + dataOffset));
				__m128i matchMask = _mm_or_si128(patternMask, _mm_cmpeq_epi8(patternMatch, patternData));
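				// lanes where patternMask is all-ones pass unconditionally, so a
				// 0xffff movemask means every remaining pattern byte matched here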

				if (_mm_movemask_epi8(matchMask) == 0xffff)
				{
					size_t matchOffset = dataOffset + firstLetterOffset - firstLetterPos;

					// final check for full pattern
					if (matchOffset + pattern.size() < size && memcmp(data + matchOffset, pattern.c_str(), pattern.size()) == 0)
					{
						return matchOffset;
					}
				}
			}
		}

		return findMatch(pattern.c_str(), pattern.size(), data, size, offset - firstLetterPos);
	}
Example No. 16
 SIMD_INLINE bool ColsHasIndex(const uint8_t * mask, size_t stride, size_t size, __m128i index, uint8_t * cols)
 {
     __m128i _cols = _mm_setzero_si128();
     for (size_t row = 0; row < size; ++row)
     {
         _cols = _mm_or_si128(_cols, _mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)mask), index));
         mask += stride;
     }
     _mm_storeu_si128((__m128i*)cols, _cols);
     return !_mm_testz_si128(_cols, K_INV_ZERO);
 }
Example No. 17
__m64 _m_pcmpeqb(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_cmpeq_epi8(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
Example No. 18
    SIMDValue SIMDInt8x16Operation::OpNotEqual(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        x86Result.m128i_value = _mm_cmpeq_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a == b; negated below to get a != b

        X86SIMDValue negativeOnes = { { -1, -1, -1, -1 } };
        x86Result.m128i_value = _mm_andnot_si128(x86Result.m128i_value, negativeOnes.m128i_value);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example No. 19
static int DispatchAlpha(const uint8_t* alpha, int alpha_stride,
                         int width, int height,
                         uint8_t* dst, int dst_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const __m128i rgb_mask = _mm_set1_epi32(0xffffff00u);  // to preserve RGB
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'dst[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    __m128i* out = (__m128i*)dst;
    for (i = 0; i < limit; i += 8) {
      // load 8 alpha bytes
      const __m128i a0 = _mm_loadl_epi64((const __m128i*)&alpha[i]);
      const __m128i a1 = _mm_unpacklo_epi8(a0, zero);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      // load 8 dst pixels (32 bytes)
      const __m128i b0_lo = _mm_loadu_si128(out + 0);
      const __m128i b0_hi = _mm_loadu_si128(out + 1);
      // mask dst alpha values
      const __m128i b1_lo = _mm_and_si128(b0_lo, rgb_mask);
      const __m128i b1_hi = _mm_and_si128(b0_hi, rgb_mask);
      // combine
      const __m128i b2_lo = _mm_or_si128(b1_lo, a2_lo);
      const __m128i b2_hi = _mm_or_si128(b1_hi, a2_hi);
      // store
      _mm_storeu_si128(out + 0, b2_lo);
      _mm_storeu_si128(out + 1, b2_hi);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, a0);
      out += 2;
    }
    for (; i < width; ++i) {
      const uint32_t alpha_value = alpha[i];
      dst[4 * i] = alpha_value;
      alpha_and &= alpha_value;
    }
    alpha += alpha_stride;
    dst += dst_stride;
  }
  // Combine the eight alpha 'and' into an 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and != 0xff);
}
Example No. 20
static WEBP_INLINE void ComplexMask(const __m128i* const p1,
                                    const __m128i* const p0,
                                    const __m128i* const q0,
                                    const __m128i* const q1,
                                    int thresh, int ithresh,
                                    __m128i* const mask) {
    const __m128i it = _mm_set1_epi8(ithresh);
    const __m128i diff = _mm_subs_epu8(*mask, it);
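    // subs_epu8 saturates at zero, so 'diff' is zero exactly where *mask <= ithresh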
    const __m128i thresh_mask = _mm_cmpeq_epi8(diff, _mm_setzero_si128());
    __m128i filter_mask;
    NeedsFilter(p1, p0, q0, q1, thresh, &filter_mask);
    *mask = _mm_and_si128(thresh_mask, filter_mask);
}
Example No. 21
	virtual size_t match(const char* data, size_t size)
	{
		__m128i firstLetter = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->firstLetter));
		__m128i patternData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternData));
		__m128i patternMask = _mm_loadu_si128(reinterpret_cast<const __m128i*>(this->patternMask));

		size_t offset = firstLetterPos;

		while (offset + 32 <= size)
		{
			__m128i value = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset));
			int mask = _mm_movemask_epi8(_mm_cmpeq_epi8(value, firstLetter));

			if (mask == 0)
				offset += 16;
			else
			{
				offset += re2::countTrailingZeros(mask);

				// check if we have a match
				__m128i patternMatch = _mm_loadu_si128(reinterpret_cast<const __m128i*>(data + offset - firstLetterOffset));
				__m128i matchMask = _mm_or_si128(patternMask, _mm_cmpeq_epi8(patternMatch, patternData));

				if (_mm_movemask_epi8(matchMask) == 0xffff)
				{
					// final check for full pattern
					if (memcmp(data + offset - firstLetterPos, pattern.c_str(), pattern.size()) == 0)
					{
						return offset - firstLetterPos;
					}
				}

				offset += 1;
			}
		}

		return findMatch(pattern.c_str(), pattern.size(), data, size, offset - firstLetterPos);
	}
Example No. 22
size_t sse4_strstr_unrolled_max36(const char* s, size_t n, const char* needle, size_t needle_size) {

    const __m128i zeros     = _mm_setzero_si128();
    const __m128i prefix    = sse::load(needle);
    const __m128i suffix1   = sse::load(needle + 4);
    const __m128i suffix2   = sse::load(needle + 16 + 4);
    const __m128i suff_mask = sse::mask_higher_bytes(needle_size - (16 + 4));

    for (size_t i = 0; i < n; i += 8) {

        const __m128i data   = sse::load(s + i);
        const __m128i result = _mm_mpsadbw_epu8(data, prefix, 0);

        const __m128i cmp    = _mm_cmpeq_epi16(result, zeros);

        unsigned mask = _mm_movemask_epi8(cmp) & 0x5555;

        while (mask != 0) {

            const auto bitpos = bits::get_first_bit_set(mask)/2;

            const __m128i c1 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 4), suffix1);
            const __m128i c2 = _mm_cmpeq_epi8(sse::load(s + i + bitpos + 16 + 4), suffix2);

            const __m128i c3 = _mm_or_si128(c2, suff_mask);
            const __m128i tmp = _mm_and_si128(c1, c3);

            if (_mm_movemask_epi8(tmp) == 0xffff) {

                return i + bitpos;
            }

            mask = bits::clear_leftmost_set(mask);
        }
    }

    return std::string::npos;
}
Example No. 23
// Denoise a 16x1 vector.
static INLINE __m128i vp9_denoiser_16x1_sse2(
    const uint8_t *sig, const uint8_t *mc_running_avg_y, uint8_t *running_avg_y,
    const __m128i *k_0, const __m128i *k_4, const __m128i *k_8,
    const __m128i *k_16, const __m128i *l3, const __m128i *l32,
    const __m128i *l21, __m128i acc_diff) {
  // Calculate differences
  const __m128i v_sig = _mm_loadu_si128((const __m128i *)(&sig[0]));
  const __m128i v_mc_running_avg_y =
      _mm_loadu_si128((const __m128i *)(&mc_running_avg_y[0]));
  __m128i v_running_avg_y;
  const __m128i pdiff = _mm_subs_epu8(v_mc_running_avg_y, v_sig);
  const __m128i ndiff = _mm_subs_epu8(v_sig, v_mc_running_avg_y);
  // Obtain the sign. FF if diff is negative.
  const __m128i diff_sign = _mm_cmpeq_epi8(pdiff, *k_0);
  // Clamp absolute difference to 16 to be used to get mask. Doing this
  // allows us to use _mm_cmpgt_epi8, which operates on signed byte.
  const __m128i clamped_absdiff =
      _mm_min_epu8(_mm_or_si128(pdiff, ndiff), *k_16);
  // Get masks for l2 l1 and l0 adjustments.
  const __m128i mask2 = _mm_cmpgt_epi8(*k_16, clamped_absdiff);
  const __m128i mask1 = _mm_cmpgt_epi8(*k_8, clamped_absdiff);
  const __m128i mask0 = _mm_cmpgt_epi8(*k_4, clamped_absdiff);
  // Get adjustments for l2, l1, and l0.
  __m128i adj2 = _mm_and_si128(mask2, *l32);
  const __m128i adj1 = _mm_and_si128(mask1, *l21);
  const __m128i adj0 = _mm_and_si128(mask0, clamped_absdiff);
  __m128i adj, padj, nadj;

  // Combine the adjustments and get absolute adjustments.
  adj2 = _mm_add_epi8(adj2, adj1);
  adj = _mm_sub_epi8(*l3, adj2);
  adj = _mm_andnot_si128(mask0, adj);
  adj = _mm_or_si128(adj, adj0);

  // Restore the sign and get positive and negative adjustments.
  padj = _mm_andnot_si128(diff_sign, adj);
  nadj = _mm_and_si128(diff_sign, adj);

  // Calculate filtered value.
  v_running_avg_y = _mm_adds_epu8(v_sig, padj);
  v_running_avg_y = _mm_subs_epu8(v_running_avg_y, nadj);
  _mm_storeu_si128((__m128i *)running_avg_y, v_running_avg_y);

  // Adjustments <=7, and each element in acc_diff can fit in signed
  // char.
  acc_diff = _mm_adds_epi8(acc_diff, padj);
  acc_diff = _mm_subs_epi8(acc_diff, nadj);
  return acc_diff;
}
Example No. 24
static void NeedsFilter(const __m128i* p1, const __m128i* p0, const __m128i* q0,
                        const __m128i* q1, int thresh, __m128i *mask) {
    __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
    *mask = _mm_set1_epi8(0xFE);
    t1 = _mm_and_si128(t1, *mask);        // set lsb of each byte to zero
    t1 = _mm_srli_epi16(t1, 1);           // abs(p1 - q1) / 2

    *mask = MM_ABS(*p0, *q0);             // abs(p0 - q0)
    *mask = _mm_adds_epu8(*mask, *mask);  // abs(p0 - q0) * 2
    *mask = _mm_adds_epu8(*mask, t1);     // abs(p0 - q0) * 2 + abs(p1 - q1) / 2

    t1 = _mm_set1_epi8(thresh);
    *mask = _mm_subs_epu8(*mask, t1);     // mask <= thresh
    *mask = _mm_cmpeq_epi8(*mask, _mm_setzero_si128());
}
Example No. 25
    SIMDValue SIMDUint8x16Operation::OpLessThanOrEqual(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        X86SIMDValue signBits = { { 0x80808080,0x80808080, 0x80808080, 0x80808080 } };

        // Unsigned comparison can reuse the signed compare instructions if the sign bit of each byte is flipped (xor with 0x80) first
        tmpaValue.m128i_value = _mm_xor_si128(tmpaValue.m128i_value, signBits.m128i_value);
        tmpbValue.m128i_value = _mm_xor_si128(tmpbValue.m128i_value, signBits.m128i_value);
        x86Result.m128i_value = _mm_cmplt_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a < b?
        tmpaValue.m128i_value = _mm_cmpeq_epi8(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a == b?
        x86Result.m128i_value = _mm_or_si128(x86Result.m128i_value, tmpaValue.m128i_value);   // result = (a<b)|(a==b)

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
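A standalone illustration of the sign-bias trick used above (my own sketch, not taken from the Chakra sources; assumes an SSE2-capable C compiler):

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    /* 0x01 <= 0xFF as unsigned bytes, but 1 > -1 when the same bytes are read as signed. */
    const __m128i a    = _mm_set1_epi8(0x01);
    const __m128i b    = _mm_set1_epi8((char)0xFF);
    const __m128i bias = _mm_set1_epi8((char)0x80);  /* flips the sign bit of every byte */

    const __m128i lt = _mm_cmplt_epi8(_mm_xor_si128(a, bias), _mm_xor_si128(b, bias));
    const __m128i le = _mm_or_si128(lt, _mm_cmpeq_epi8(a, b));

    printf("a <= b (unsigned)? %s\n", _mm_movemask_epi8(le) == 0xFFFF ? "yes" : "no");
    return 0;
}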
Example No. 26
// input/output is uint8_t
static WEBP_INLINE void GetNotHEV(const __m128i* const p1,
                                  const __m128i* const p0,
                                  const __m128i* const q0,
                                  const __m128i* const q1,
                                  int hev_thresh, __m128i* const not_hev) {
    const __m128i zero = _mm_setzero_si128();
    const __m128i t_1 = MM_ABS(*p1, *p0);
    const __m128i t_2 = MM_ABS(*q1, *q0);

    const __m128i h = _mm_set1_epi8(hev_thresh);
    const __m128i t_3 = _mm_subs_epu8(t_1, h);  // abs(p1 - p0) - hev_thresh
    const __m128i t_4 = _mm_subs_epu8(t_2, h);  // abs(q1 - q0) - hev_thresh

    *not_hev = _mm_or_si128(t_3, t_4);
    *not_hev = _mm_cmpeq_epi8(*not_hev, zero);  // abs(p1-p0) <= hev_thresh && abs(q1-q0) <= hev_thresh
}
Example No. 27
int main()
{
  alignas(16) int8_t m[16] = {45, 37, 35, 45, 37, 35, 45, 37, 35, 45, 37, 35,  0, 0,  0, 0}; // _mm_load_si128 below requires 16-byte alignment
  alignas(16) int8_t s[16] = {-4, -4, -4, -4,  1,  1, -4, -4, -4, -4,  1,  1, -4,-4, -4, -4};

  __m128i M = _mm_load_si128((__m128i*)m);
  __m128i S = _mm_load_si128((__m128i*)s);
  __m128i zero = _mm_set1_epi32(0);

  //__m128i sum = _mm_mask_add_epi8(zero, _mm_cmpneq_epi8_mask(M, zero), S,S);
  __m128i sum = _mm_andnot_si128(_mm_cmpeq_epi8(M, zero), _mm_add_epi8(M, S));
  print128(M);
  print128(S);
  print128(sum);

  return 0;
}
Example No. 28
// input pixels are uint8_t
static WEBP_INLINE void NeedsFilter(const __m128i* const p1,
                                    const __m128i* const p0,
                                    const __m128i* const q0,
                                    const __m128i* const q1,
                                    int thresh, __m128i* const mask) {
    const __m128i m_thresh = _mm_set1_epi8(thresh);
    const __m128i t1 = MM_ABS(*p1, *q1);        // abs(p1 - q1)
    const __m128i kFE = _mm_set1_epi8(0xFE);
    const __m128i t2 = _mm_and_si128(t1, kFE);  // set lsb of each byte to zero
    const __m128i t3 = _mm_srli_epi16(t2, 1);   // abs(p1 - q1) / 2

    const __m128i t4 = MM_ABS(*p0, *q0);        // abs(p0 - q0)
    const __m128i t5 = _mm_adds_epu8(t4, t4);   // abs(p0 - q0) * 2
    const __m128i t6 = _mm_adds_epu8(t5, t3);   // abs(p0-q0)*2 + abs(p1-q1)/2

    const __m128i t7 = _mm_subs_epu8(t6, m_thresh);  // mask <= m_thresh
    *mask = _mm_cmpeq_epi8(t7, _mm_setzero_si128());
}
Example No. 29
File: main.cpp Project: CCJY/coliru
int countZeroBytes_SSE(char* values, int length) {
    int zeroCount = 0;
    __m128i zero16 = _mm_set1_epi8(0);
    __m128i and16 = _mm_set1_epi8(1);
    for(int i=0; i<length; i+=16) {
        __m128i values16 = _mm_loadu_si128((__m128i*)&values[i]);
        __m128i cmp = _mm_cmpeq_epi8(values16, zero16);
        if(_mm_movemask_epi8(cmp)) {
            cmp = _mm_and_si128(and16, cmp); //change -1 values to 1
            //horizontal sum of the 16 bytes
            __m128i sum1 = _mm_sad_epu8(cmp,zero16);
            __m128i sum2 = _mm_shuffle_epi32(sum1,2);
            __m128i sum3 = _mm_add_epi16(sum1,sum2);
            zeroCount += _mm_cvtsi128_si32(sum3);
        }
    }
    return zeroCount;
}
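A shorter reduction for the block above (my suggestion, not part of the original snippet; assumes GCC/Clang builtins) is to count the set bits of the comparison mask directly instead of doing the in-register horizontal sum:

        zeroCount += __builtin_popcount(_mm_movemask_epi8(cmp));

With this form the and16 constant and the sad/shuffle/add sequence become unnecessary.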
Example No. 30
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                        int width, int height,
                        uint8_t* alpha, int alpha_stride) {
  // alpha_and stores an 'and' operation of all the alpha[] values. The final
  // value is not 0xff if any of the alpha[] is not equal to 0xff.
  uint32_t alpha_and = 0xff;
  int i, j;
  const __m128i a_mask = _mm_set1_epi32(0xffu);  // to preserve alpha
  const __m128i all_0xff = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i all_alphas = all_0xff;

  // We must be able to access 3 extra bytes after the last written byte
  // 'src[4 * width - 4]', because we don't know if alpha is the first or the
  // last byte of the quadruplet.
  const int limit = (width - 1) & ~7;

  for (j = 0; j < height; ++j) {
    const __m128i* src = (const __m128i*)argb;
    for (i = 0; i < limit; i += 8) {
      // load 32 argb bytes
      const __m128i a0 = _mm_loadu_si128(src + 0);
      const __m128i a1 = _mm_loadu_si128(src + 1);
      const __m128i b0 = _mm_and_si128(a0, a_mask);
      const __m128i b1 = _mm_and_si128(a1, a_mask);
      const __m128i c0 = _mm_packs_epi32(b0, b1);
      const __m128i d0 = _mm_packus_epi16(c0, c0);
      // store
      _mm_storel_epi64((__m128i*)&alpha[i], d0);
      // accumulate eight alpha 'and' in parallel
      all_alphas = _mm_and_si128(all_alphas, d0);
      src += 2;
    }
    for (; i < width; ++i) {
      const uint32_t alpha_value = argb[4 * i];
      alpha[i] = alpha_value;
      alpha_and &= alpha_value;
    }
    argb += argb_stride;
    alpha += alpha_stride;
  }
  // Combine the eight alpha 'and' into an 8-bit mask.
  alpha_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(all_alphas, all_0xff));
  return (alpha_and == 0xff);
}