コード例 #1
0
ファイル: probe.c プロジェクト: jyu1992/SplashDB
/*
 * Look up `key` in the two candidate buckets selected by multiplicative
 * hashing and return the value stored next to it.
 *
 * Every slot whose stored key differs from `key` is masked to zero and the
 * surviving lanes are OR-ed together, so the result is 0 when no slot
 * matches (a stored value of 0 cannot be told apart from a miss).
 */
uint32_t probe(uint32_t key)
{
  /* broadcast the lookup key into all four 32-bit lanes */
  const __m128i keyVector = _mm_set1_epi32(key);

  /* multiplicative hashing: lanes 0 and 1 give the two bucket indices */
  __m128i bucketIds = _mm_mullo_epi32(keyVector, hashes.vec128);
  bucketIds = _mm_srli_epi32(bucketIds, hashShift);

  /* _mm_extract_epi32 returns a signed int; go through uint32_t so an index
   * with bit 31 set is zero-extended instead of sign-extended into size_t */
  const size_t b0 = (uint32_t) _mm_extract_epi32(bucketIds, 0);
  const size_t b1 = (uint32_t) _mm_extract_epi32(bucketIds, 1);

  /* per bucket: build an all-ones/all-zeros mask of matching key slots and
   * AND it with the values, leaving only the value of a matching slot */
  const __m128i match0 =
      _mm_cmpeq_epi32(_mm_load_si128((const __m128i *) buckets[b0].keys), keyVector);
  const __m128i values0 =
      _mm_and_si128(_mm_load_si128((const __m128i *) buckets[b0].values), match0);

  const __m128i match1 =
      _mm_cmpeq_epi32(_mm_load_si128((const __m128i *) buckets[b1].keys), keyVector);
  const __m128i values1 =
      _mm_and_si128(_mm_load_si128((const __m128i *) buckets[b1].values), match1);

  /* OR the four lanes together entirely in SSE2.  The previous code folded
   * the halves with the MMX intrinsic _mm_or_si64 and never issued
   * _mm_empty(), which leaves the x87 unit in MMX state. */
  __m128i folded = _mm_or_si128(values0, values1);
  folded = _mm_or_si128(folded, _mm_srli_si128(folded, 8));
  folded = _mm_or_si128(folded, _mm_srli_si128(folded, 4));
  return (uint32_t) _mm_cvtsi128_si32(folded);
}
コード例 #2
0
ファイル: rewind.cpp プロジェクト: Alcaro/minir
// There's no equivalent in libc, you'd think so ... std::mismatch exists, but it's not optimized at all. :(
static inline size_t find_change(const uint16_t * a, const uint16_t * b)
{
	// Scans both buffers two 16-byte vectors per round and returns the uint16_t index of the
	// first position where they differ. There is no length parameter: the loop only ends once
	// a difference is seen, so callers presumably guarantee one exists -- TODO confirm.
	const __m128i * pa=(const __m128i*)a;
	const __m128i * pb=(const __m128i*)b;
	
	for (;;)
	{
		// Unaligned loads, so neither buffer has to be 16-byte aligned.
		const uint32_t eq_first = _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_loadu_si128(pa), _mm_loadu_si128(pb)));
		const uint32_t eq_second = _mm_movemask_epi8(_mm_cmpeq_epi32(_mm_loadu_si128(pa+1), _mm_loadu_si128(pb+1)));
		
		if (eq_first == 0xffff && eq_second == 0xffff)
		{
			// Both vectors identical, move on to the next 32 bytes.
			pa+=2;
			pb+=2;
			continue;
		}
		
		// Pick the earliest vector that contains a difference.
		const __m128i * where = pa;
		uint32_t eq = eq_first;
		if (eq_first == 0xffff)
		{
			where = pa+1;
			eq = eq_second;
		}
		
		// Byte offset of the vector, OR-ed with the position of the first clear mask bit
		// (compat_ctz is assumed to count trailing zeros), halved to get a uint16_t index.
		size_t idx=(((char*)where-(char*)a) | (compat_ctz(~eq))) >> 1;
		// The comparison is per 32-bit lane, so idx is the first half of the differing lane;
		// if that half matches, the difference is in the second half.
		return (idx | (a[idx]==b[idx]));
	}
}
コード例 #3
0
ファイル: ZZClut.cpp プロジェクト: tsiru/pcsx2
/*__forceinline*/ bool Cmp_ClutBuffer_GSMem<u32>(u32* GSmem, u32 csa, u32 clutsize)
{
    // Compares the CLUT buffer selected by csa against GS memory, 64 bytes of CLUT
    // per pass. Returns true as soon as a difference is found, false otherwise.
    u64* gsWords = (u64*) GSmem;
    u64* clutWords = (u64*)GetClutBufferAddress<u32>(csa);

    for (; clutsize > 0; clutsize -= 64, clutWords += 8) {
#ifdef ZEROGS_SSE2
        // Note: the data in local memory is swizzled.
        const __m128i* gsVec = (const __m128i*)gsWords;
        const __m128i* clutVec = (const __m128i*)clutWords;

        const __m128i gs0 = _mm_load_si128(gsVec);     // 9  8  1 0
        const __m128i gs1 = _mm_load_si128(gsVec + 1); // 11 10 3 2
        const __m128i gs2 = _mm_load_si128(gsVec + 2); // 13 12 5 4
        const __m128i gs3 = _mm_load_si128(gsVec + 3); // 15 14 7 6

        const __m128i clut0 = _mm_load_si128(clutVec);
        const __m128i clut1 = _mm_load_si128(clutVec + 1);
        const __m128i clut2 = _mm_load_si128(clutVec + 2);
        const __m128i clut3 = _mm_load_si128(clutVec + 3);

        // Undo the swizzle by pairing the low/high 64-bit halves, then compare.
        const __m128i eq0 = _mm_cmpeq_epi32(_mm_unpacklo_epi64(gs0, gs1), clut0);
        const __m128i eq1 = _mm_cmpeq_epi32(_mm_unpacklo_epi64(gs2, gs3), clut1);
        const __m128i eq2 = _mm_cmpeq_epi32(_mm_unpackhi_epi64(gs0, gs1), clut2);
        const __m128i eq3 = _mm_cmpeq_epi32(_mm_unpackhi_epi64(gs2, gs3), clut3);

        const __m128i allEq = _mm_and_si128(_mm_and_si128(eq0, eq1), _mm_and_si128(eq2, eq3));
        if (_mm_movemask_epi8(allEq) != 0xFFFF)
            return true;
#else
        // Slow portable C version kept for reference/debug (no MMX variant on purpose).
        // Note: the data in local memory is swizzled.
        const bool same =
            clutWords[0] == gsWords[0] && clutWords[1] == gsWords[2] &&
            clutWords[2] == gsWords[4] && clutWords[3] == gsWords[6] &&
            clutWords[4] == gsWords[1] && clutWords[5] == gsWords[3] &&
            clutWords[6] == gsWords[5] && clutWords[7] == gsWords[7];
        if (!same)
            return true;
#endif

        // Advance to the next memory block ...
        gsWords += 32;

        // ... or step back to the previous block and go down one memory column.
        if (clutsize & 0x40)
            gsWords -= (64-8);

        // If going down one column crossed the block boundary, skip to the next block.
        if (clutsize == 0x240)
            gsWords += 32;
    }

    return false;
}
コード例 #4
0
ファイル: qimage_sse2.cpp プロジェクト: 13W/phantomjs
QT_BEGIN_NAMESPACE

bool convert_ARGB_to_ARGB_PM_inplace_sse2(QImageData *data, Qt::ImageConversionFlags)
{
    // Premultiplies an ARGB32 image by its alpha channel in place and retags it as
    // ARGB32_Premultiplied. Always returns true.
    Q_ASSERT(data->format == QImage::Format_ARGB32);

    const int vectorsPerLine = data->width >> 2;  // full groups of four pixels
    const int leftoverPixels = data->width & 3;   // pixels left after the last full group
    // width in pixels of the padding at the end of each line
    const int padPixels = (data->bytes_per_line >> 2) - data->width;

    const __m128i alphaMask = _mm_set1_epi32(0xff000000);
    const __m128i nullVector = _mm_setzero_si128();
    const __m128i half = _mm_set1_epi16(0x80);
    const __m128i colorMask = _mm_set1_epi32(0x00ff00ff);

    __m128i *cursor = reinterpret_cast<__m128i*>(data->data);
    for (int rowsLeft = data->height; rowsLeft != 0; --rowsLeft) {
        const __m128i *lineEnd = cursor + vectorsPerLine;

        while (cursor != lineEnd) {
            const __m128i srcVector = _mm_loadu_si128(cursor);
            const __m128i srcVectorAlpha = _mm_and_si128(srcVector, alphaMask);
            const bool allOpaque =
                _mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, alphaMask)) == 0xffff;

            if (!allOpaque) {
                if (_mm_movemask_epi8(_mm_cmpeq_epi32(srcVectorAlpha, nullVector)) == 0xffff) {
                    // all four pixels fully transparent: clear them
                    _mm_storeu_si128(cursor, nullVector);
                } else {
                    // replicate each pixel's alpha into both 16-bit halves of its lane
                    __m128i alphaChannel = _mm_srli_epi32(srcVector, 24);
                    alphaChannel = _mm_or_si128(alphaChannel, _mm_slli_epi32(alphaChannel, 16));

                    __m128i result;
                    BYTE_MUL_SSE2(result, srcVector, alphaChannel, colorMask, half);
                    // keep the multiplied colour channels, restore the original alpha
                    result = _mm_or_si128(_mm_andnot_si128(alphaMask, result), srcVectorAlpha);
                    _mm_storeu_si128(cursor, result);
                }
            }
            // opaque groups are left untouched
            ++cursor;
        }

        // Scalar tail for the pixels that did not fill a whole vector.
        QRgb *tail = reinterpret_cast<QRgb*>(cursor);
        for (int i = 0; i < leftoverPixels; ++i) {
            if (tail[i] < 0x00ffffff)
                tail[i] = 0;
            else if (tail[i] < 0xff000000)
                tail[i] = PREMUL(tail[i]);
        }

        // Skip the line padding to reach the next row.
        cursor = reinterpret_cast<__m128i*>(tail + leftoverPixels + padPixels);
    }

    data->format = QImage::Format_ARGB32_Premultiplied;
    return true;
}
コード例 #5
0
ファイル: lossless_enc_sse2.c プロジェクト: Achraf33/opencv
// Returns non-zero when the four uint32_t values at 'p' and 'q' are all equal.
static int QuadEquals_SSE2(const uint32_t* const p, const uint32_t* const q) {
  const __m128i eq = _mm_cmpeq_epi32(_mm_loadu_si128((const __m128i*)p),
                                     _mm_loadu_si128((const __m128i*)q));
  return _mm_movemask_epi8(eq) == 0xffff;
}

// Returns the length of the common prefix of 'array1' and 'array2', i.e. the
// index of the first differing element, or 'length' if there is none.
static int VectorMismatch_SSE2(const uint32_t* const array1,
                               const uint32_t* const array2, int length) {
  int match_len = 0;

  if (length >= 12) {
    // Two blocks of four are handled per round; the next block is always
    // loaded before the current comparison is tested (early load), which is
    // why the loop stops while at least 12 elements remain readable.
    __m128i cur1 = _mm_loadu_si128((const __m128i*)&array1[0]);
    __m128i cur2 = _mm_loadu_si128((const __m128i*)&array2[0]);
    for (;;) {
      const __m128i next1 =
          _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
      const __m128i next2 =
          _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
      if (_mm_movemask_epi8(_mm_cmpeq_epi32(cur1, cur2)) != 0xffff) break;
      match_len += 4;

      cur1 = _mm_loadu_si128((const __m128i*)&array1[match_len + 4]);
      cur2 = _mm_loadu_si128((const __m128i*)&array2[match_len + 4]);
      if (_mm_movemask_epi8(_mm_cmpeq_epi32(next1, next2)) != 0xffff) break;
      match_len += 4;

      if (match_len + 12 >= length) break;
    }
  } else if (length >= 4 && QuadEquals_SSE2(&array1[0], &array2[0])) {
    // Short input: at most two whole blocks can be checked with vectors.
    match_len = 4;
    if (length >= 8 && QuadEquals_SSE2(&array1[4], &array2[4])) {
      match_len = 8;
    }
  }

  // Finish element by element to pin down the exact mismatch position.
  while (match_len < length && array1[match_len] == array2[match_len]) {
    ++match_len;
  }
  return match_len;
}
コード例 #6
0
    SIMDValue SIMDFloat32x4Operation::OpMaxNum(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        // Lane-wise maximum of two float32x4 values that prefers the non-NaN operand
        // and fixes up the sign of zero results.
        X86SIMDValue maxed;
        X86SIMDValue a = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue b = X86SIMDValue::ToX86SIMDValue(bValue);
        X86SIMDValue bIsNaN, aIsPosZero, lanesFromA, lanesFromMax;

        // Per the original note: already correct, except that b is produced when
        // either input is NaN or both are +/-0.0.
        maxed.m128_value = _mm_max_ps(a.m128_value, b.m128_value);

        // All-ones in every lane where b is NaN (b unordered with itself).
        bIsNaN.m128_value = _mm_cmpunord_ps(b.m128_value, b.m128_value);

        // All-ones where the bit pattern of a is exactly zero, i.e. a is +0.0 ...
        aIsPosZero.m128i_value = _mm_cmpeq_epi32(a.m128i_value, X86_ALL_ZEROS.m128i_value);
        // ... narrowed down to X86_TWO_31_I4 (presumably just the sign bit) in those lanes.
        aIsPosZero.m128_value = _mm_and_ps(aIsPosZero.m128_value, X86_TWO_31_I4.m128_value);

        // Where a is +0.0 the result is either positive already or is b, which may be -0.0.
        // Clearing the masked bit there is safe and turns a -0.0 result into +0.0.
        maxed.m128_value = _mm_andnot_ps(aIsPosZero.m128_value, maxed.m128_value);

        // Take a in the lanes where b is NaN, the adjusted maximum everywhere else.
        lanesFromA.m128_value = _mm_and_ps(a.m128_value, bIsNaN.m128_value);
        lanesFromMax.m128_value = _mm_andnot_ps(bIsNaN.m128_value, maxed.m128_value);
        maxed.m128_value = _mm_or_ps(lanesFromA.m128_value, lanesFromMax.m128_value);

        return X86SIMDValue::ToSIMDValue(maxed);
    }
コード例 #7
0
ファイル: sse2-builtins.c プロジェクト: lucasmrthomaz/clang
// Codegen test wrapper around _mm_cmpeq_epi32: returns the lane-wise 32-bit
// equality mask of A and B.
// NOTE(review): the prefixed comment lines inside the body look like FileCheck
// directives (expected IR compare and expected pcmpeqd instruction) and are
// presumably consumed by the test harness -- keep them verbatim and confirm
// against the RUN lines of the file.
__m128i test_mm_cmpeq_epi32(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_cmpeq_epi32
  // DAG: icmp eq <4 x i32>
  //
  // ASM-LABEL: test_mm_cmpeq_epi32
  // ASM: pcmpeqd
  return _mm_cmpeq_epi32(A, B);
}
コード例 #8
0
// Searches a range of nonces for one whose computed word `e`, added to
// g_sha256_hinit[7], is zero in some 32-bit lane.
// On success returns true with `nonce` set to the matching value; otherwise
// returns false after `nonce` has been advanced past the whole searched range.
bool SseBitcoinSha256::FindNonce(uint32_t& nonce) {

#if UCFG_BITCOIN_ASM
	// Assembly build: the whole search is delegated to CalcSha256Sse.
	CSseData& sseData = SseData();
	return CalcSha256Sse(sseData.m_4w, sseData.m_4midstate, m_midstate_after_3, UCFG_BITCOIN_NPAR, nonce);

#else
	// Second working area, located 16*UCFG_BITCOIN_WAY vectors into m_4w.
	__m128i *m_4w1 = &m_4w[16*UCFG_BITCOIN_WAY];	//!!!?

	// Lane i of the nonce vector tests nonce + i.
	__m128i offset = _mm_set_epi32(3, 2, 1, 0);
	for (int i=0; i<UCFG_BITCOIN_NPAR; i+=UCFG_BITCOIN_WAY, nonce+=UCFG_BITCOIN_WAY) {
		// NOTE(review): `+` on __m128i relies on a compiler vector extension or an
		// operator overload declared elsewhere -- confirm for the target compilers.
		m_4w[3] = _mm_set1_epi32(nonce)+offset;
		// Broadcast the saved midstate words into the second working area.
		for (int j=0; j<8; ++j)
			m_4w1[j] = _mm_set1_epi32(m_midstate_after_3[j]);
		// The UCFG_BITCOIN_ASM branches below sit inside the #else of the same
		// macro, so only the CalcRounds variants are ever compiled here.
#if UCFG_BITCOIN_ASM
		CalcSha256Sse(m_4w, m_4midstate, m_4w1, 18, 3, 64);
#else
		CalcRounds(m_4w, m_4midstate, m_4w1, 18, 3, 64);
#endif

		// Start the second computation from the broadcast initial hash values.
#if UCFG_BITCOIN_WAY==6 || UCFG_BITCOIN_WAY==8
		__m128i v[16];
		for (int j=0; j<8; ++j)
			v[j*2] = v[j*2+1] = _mm_set1_epi32(g_sha256_hinit[j]);
#else
		__m128i v[8];
		for (int j=0; j<8; ++j)
			v[j] = _mm_set1_epi32(g_sha256_hinit[j]);
#endif
#if UCFG_BITCOIN_ASM
		__m128i e = CalcSha256Sse(m_4w1, v, v, 16, 0, 61);
#else
		// Original note: "We enough this" -- presumably stopping at 61 is sufficient for the check below; TODO confirm.
		__m128i e = CalcRounds(m_4w1, v, v, 16, 0, 61);
#endif
		// p has an all-ones lane wherever e + g_sha256_hinit[7] == 0.
		__m128i p = _mm_cmpeq_epi32(e + _mm_set1_epi32(g_sha256_hinit[7]), _mm_setzero_si128());
		uint64_t *p64 = (uint64_t*)&p;
		if (p64[0] | p64[1]) {
			// At least one lane matched; 16-bit elements 0/2/4/6 are the low halves
			// of 32-bit lanes 0..3. Report the lowest matching lane via nonce.
			if (_mm_extract_epi16(p, 0) != 0)
				return true;
			if (_mm_extract_epi16(p, 2) != 0) {
				nonce += 1;
				return true;
			}
			if (_mm_extract_epi16(p, 4) != 0) {
				nonce += 2;
				return true;
			}
			if (_mm_extract_epi16(p, 6) != 0) {
				nonce += 3;
				return true;
			}
		}

	}
	return false;
#endif
}
コード例 #9
0
    SIMDValue SIMDInt32x4Operation::OpEqual(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        // Lane-wise a == b on four 32-bit integers: each result lane is all ones
        // when the lanes are equal and all zeros otherwise.
        X86SIMDValue equalMask;
        X86SIMDValue lhs = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue rhs = X86SIMDValue::ToX86SIMDValue(bValue);

        equalMask.m128i_value = _mm_cmpeq_epi32(lhs.m128i_value, rhs.m128i_value);
        return X86SIMDValue::ToSIMDValue(equalMask);
    }
コード例 #10
0
/* Sum of absolute differences over a block 3 pixels wide and 16 rows high.
 * Four bytes are read per row, so one byte outside the 3-pixel column is
 * touched on each row and then discarded. */
unsigned int vp9_sad3x16_sse2(
  const unsigned char *src_ptr,
  int  src_stride,
  const unsigned char *ref_ptr,
  int  ref_stride) {
  const int offset = (uintptr_t)src_ptr & 3;
  const __m128i zero = _mm_setzero_si128();
  /* 0xffffffffffff0000ffffffffffff0000: clears the low two bytes per half */
  const __m128i drop_low16 = _mm_slli_epi64(_mm_cmpeq_epi32(zero, zero), 16);
  __m128i total = zero;
  int rows_left;

  /* In current use case, the offset is 1 if CONFIG_SUBPELREFMV is off.
   * For offset 1 the source pointer is moved back to a 4-byte boundary so
   * the 32-bit loads below are aligned; the extra leading byte is masked
   * out later. */
  if (offset == 1)
    src_ptr -= 1;

  for (rows_left = 16; rows_left > 0; rows_left -= 4) {
    /* one 32-bit load per row, four rows per pass */
    const __m128i src_a = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 0 * src_stride));
    const __m128i src_b = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 1 * src_stride));
    const __m128i src_c = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 2 * src_stride));
    const __m128i src_d = _mm_cvtsi32_si128 (*(const int *)(src_ptr + 3 * src_stride));
    const __m128i ref_a = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 0 * ref_stride));
    const __m128i ref_b = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 1 * ref_stride));
    const __m128i ref_c = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 2 * ref_stride));
    const __m128i ref_d = _mm_cvtsi32_si128 (*(const int *)(ref_ptr + 3 * ref_stride));

    /* interleave rows pairwise, then put both pairs into one register */
    __m128i src_rows = _mm_unpacklo_epi64(_mm_unpacklo_epi8(src_a, src_b),
                                          _mm_unpacklo_epi8(src_c, src_d));
    __m128i ref_rows = _mm_unpacklo_epi64(_mm_unpacklo_epi8(ref_a, ref_b),
                                          _mm_unpacklo_epi8(ref_c, ref_d));

    /* throw out the extra byte of every row: the leading one when the source
     * was moved back, the trailing one otherwise (always for the reference) */
    if (offset == 1)
      src_rows = _mm_and_si128(src_rows, drop_low16);
    else
      src_rows = _mm_slli_epi64(src_rows, 16);
    ref_rows = _mm_slli_epi64(ref_rows, 16);

    total = _mm_add_epi16(total, _mm_sad_epu8(src_rows, ref_rows));

    src_ptr += src_stride*4;
    ref_ptr += ref_stride*4;
  }

  /* fold the upper 64-bit partial sum onto the lower one */
  total = _mm_add_epi16(total,  _mm_srli_si128(total, 8));
  return _mm_cvtsi128_si32(total);
}
コード例 #11
0
int SSEBinSearchBlock::search(uint32_t key) const {

    // Binary search over `data` that, after each halving step, also checks a block
    // of four elements next to the new boundary with one SSE2 comparison.
    // Returns the index of an element equal to `key`, or -1 if none is found.
    const __m128i needle = _mm_set1_epi32(key);

    const int last = data.size() - 1;
    int lo = 0;
    int hi = last;

    while (lo <= hi) {
        const int mid = (lo + hi)/2;

        if (data[mid] == key) {
            return mid;
        }

        if (key < data[mid]) {
            hi = mid - 1;

            if (hi >= 4) {
                // Window covers data[hi-4 .. hi-1], just below the new upper bound.
                const __m128i window = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&data[hi - 4]));
                const uint16_t hits = _mm_movemask_epi8(_mm_cmpeq_epi32(window, needle));
                if (hits) {
                    // Four mask bits per element: first set bit / 4 is the lane number.
                    return hi - 4 + __builtin_ctz(hits)/4;
                }
            }
            continue;
        }

        lo = mid + 1;

        if (lo + 4 < last) {
            // Window covers data[lo .. lo+3], starting at the new lower bound.
            const __m128i window = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&data[lo]));
            const uint16_t hits = _mm_movemask_epi8(_mm_cmpeq_epi32(window, needle));
            if (hits) {
                return lo + __builtin_ctz(hits)/4;
            }
        }
    }

    return -1;
}
コード例 #12
0
static inline __m128i _mm_setone_si128()
{
	/*
	 * Returns a vector with every bit set by comparing a register with itself.
	 *
	 * Clang apparently lacks _mm_undefined_si128, and Visual C++ replaces it
	 * with _mm_setzero_si128 anyway. An all-zero vector is used elsewhere in
	 * the processing, so starting from zero here should be no problem.
	 */
	const __m128i any = _mm_setzero_si128(); // _mm_undefined_si128();
	return _mm_cmpeq_epi32(any, any);
}
コード例 #13
0
ファイル: mmintrin64.c プロジェクト: TheRyuu/ffdshow
/* Emulates the 64-bit packed-dword equality compare with the 128-bit SSE2
 * one: both operands are placed in the low 64 bits of zeroed 128-bit
 * vectors and the low 64 bits of the comparison are handed back. */
__m64 _m_pcmpeqd(__m64 _MM1, __m64 _MM2)
{
    __m128i wide_a = {0}, wide_b = {0};
    __m128i wide_eq;

    wide_a.m128i_i64[0] = _MM1.m64_i64;
    wide_b.m128i_i64[0] = _MM2.m64_i64;

    wide_eq = _mm_cmpeq_epi32(wide_a, wide_b);

    /* reuse the by-value first operand as the return slot */
    _MM1.m64_i64 = wide_eq.m128i_i64[0];
    return _MM1;
}
コード例 #14
0
/**
 * Replace every pixel equal to a chroma key with 0 (fully transparent black).
 * SSE2-optimized version.
 *
 * The image is modified in place; no copy is made or returned.
 * Pixels are accessed with aligned 128-bit loads/stores, so the start of
 * each row's vectorized part is assumed to be 16-byte aligned.
 *
 * NOTE: The image *must* be ARGB32.
 *
 * @param key Chroma key color.
 * @return 0 on success; -EINVAL if the image is not ARGB32.
 */
int rp_image::apply_chroma_key_sse2(uint32_t key)
{
	RP_D(rp_image);
	rp_image_backend *const backend = d->backend;
	assert(backend->format == FORMAT_ARGB32);
	if (backend->format != FORMAT_ARGB32) {
		// Only ARGB32 is supported.
		return -EINVAL;
	}

	// Number of uint32_t units between the end of one row's pixels and the next row.
	const unsigned int row_gap = (backend->stride - this->row_bytes()) / sizeof(uint32_t);
	uint32_t *px = static_cast<uint32_t*>(backend->data());

	// Chroma key replicated into all four lanes.
	const __m128i key_x4 = _mm_set1_epi32(key);

	unsigned int rows_left = static_cast<unsigned int>(backend->height);
	while (rows_left > 0) {
		unsigned int cols_left = static_cast<unsigned int>(backend->width);

		// Four pixels at a time.
		while (cols_left > 3) {
			__m128i *const quad = reinterpret_cast<__m128i*>(px);

			// is_key is 0xFFFFFFFF in lanes equal to the key, 0 elsewhere;
			// AND-NOT keeps the other pixels and zeroes the keyed ones.
			const __m128i pixels = _mm_load_si128(quad);
			const __m128i is_key = _mm_cmpeq_epi32(pixels, key_x4);
			_mm_store_si128(quad, _mm_andnot_si128(is_key, pixels));

			cols_left -= 4;
			px += 4;
		}

		// Up to three leftover pixels.
		while (cols_left > 0) {
			if (*px == key) {
				*px = 0;
			}
			cols_left--;
			px++;
		}

		// Skip to the start of the next row.
		px += row_gap;
		rows_left--;
	}

	// Adjust sBIT: make sure an alpha bit is declared.
	// TODO: Only if transparent pixels were found.
	if (d->has_sBIT && d->sBIT.alpha == 0) {
		d->sBIT.alpha = 1;
	}

	// Chroma key applied.
	return 0;
}
コード例 #15
0
    SIMDValue SIMDUint32x4Operation::OpFromFloat32x4(const SIMDValue& value, bool& throws)
    {
        // Converts four floats to four unsigned 32-bit integers with truncation.
        // Sets `throws` to true (and returns early) when a lane is out of range;
        // `throws` is left untouched on success.
        X86SIMDValue converted = { 0 };
        X86SIMDValue input = X86SIMDValue::ToX86SIMDValue(value);
        X86SIMDValue laneTest, overflowTest;
        X86SIMDValue bias_f4, bias_i4;
        int laneBits = 0;

        // Negative lanes are out of range; the caller should throw a Range Error.
        laneTest.m128_value = _mm_cmplt_ps(input.m128_value, X86_ALL_ZEROS.m128_value);
        laneBits = _mm_movemask_ps(laneTest.m128_value);
        if (laneBits)
        {
            throws = true;
            return X86SIMDValue::ToSIMDValue(converted);
        }

        // CVTTPS2DQ range-checks against the signed range [-2^31, 2^31-1] and fails for
        // values >= 2^31. Work around it: subtract 2^31 from such lanes, convert, then
        // add 2^31 back as an integer.
        _mm_store_ps(bias_f4.simdValue.f32, X86_TWO_31_F4.m128_value);
        // Which lanes are >= 2^31?
        laneTest.m128_value = _mm_cmpge_ps(input.m128_value, bias_f4.m128_value);
        // bias_f4 keeps f32(2^31) in those lanes and becomes 0 in the others.
        bias_f4.m128_value = _mm_and_ps(bias_f4.m128_value, laneTest.m128_value);
        // Lanes >= 2^31 are shifted down by 2^31, the rest are unchanged.
        input.m128_value = _mm_sub_ps(input.m128_value, bias_f4.m128_value);

        // CVTTPS2DQ
        converted.m128i_value = _mm_cvttps_epi32(input.m128_value);

        // A lane equal to 0x80000000 after conversion was still out of range, i.e. the
        // adjusted value was >= 2^31 (original >= 2^32).
        overflowTest.m128i_value = _mm_cmpeq_epi32(converted.m128i_value, X86_NEG_MASK_F4.m128i_value);
        laneBits = _mm_movemask_ps(overflowTest.m128_value);
        if (laneBits)
        {
            throws = true;
            return X86SIMDValue::ToSIMDValue(converted);
        }
        // Range check passed.

        // Build the integer bias: shifting the float 2^31 pattern (0x4f000...0) left by one
        // sets the top bit only in adjusted lanes; AND-ing with the integer 2^31 mask
        // (0x8000..0) leaves 2^31 in adjusted lanes and 0 in the others.
        _mm_store_ps(bias_i4.simdValue.f32, X86_TWO_31_I4.m128_value);
        bias_f4.m128i_value = _mm_slli_epi32(bias_f4.m128i_value, 1);
        bias_i4.m128i_value = _mm_and_si128(bias_i4.m128i_value, bias_f4.m128i_value);
        // All lanes are in [0, 2^31-1] here, so adding 2^31 back cannot overflow.
        converted.m128i_value = _mm_add_epi32(converted.m128i_value, bias_i4.m128i_value);

        return X86SIMDValue::ToSIMDValue(converted);
    }
コード例 #16
0
ファイル: sse2_operators.hpp プロジェクト: rayfill/risa_gl
// Returns true only when all four 32-bit lanes of lhs and rhs are equal.
static bool equals(const __m128i& lhs, const __m128i& rhs)
{
	// Each byte of the comparison result is 0xFF for an equal lane and 0x00
	// otherwise, so the 16-bit byte mask is 0xFFFF exactly when no byte is zero.
	const __m128i lane_eq = _mm_cmpeq_epi32(lhs, rhs);
	return _mm_movemask_epi8(lane_eq) == 0xFFFF;
}
コード例 #17
0
ファイル: SkXfermode_opts_SSE2.cpp プロジェクト: sheuan/skia
// Color-burn blend of one colour channel for four pixels (one value per 32-bit lane).
// Three candidate results are computed for every lane and the right one is selected
// with comparison masks instead of branches.
static inline __m128i colorburn_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    const __m128i invDa = _mm_sub_epi32(_mm_set1_epi32(255), da);
    const __m128i invSa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // Case 1: dc == da
    const __m128i dcIsDa = _mm_cmpeq_epi32(dc, da);
    const __m128i saDa = _mm_mullo_epi16(sa, da);
    const __m128i scInvDa = _mm_mullo_epi16(sc, invDa);
    const __m128i dcInvSa = _mm_mullo_epi16(dc, invSa);
    __m128i whenEqual = _mm_add_epi32(_mm_add_epi32(saDa, scInvDa), dcInvSa);
    whenEqual = clamp_div255round_SSE2(whenEqual);
    whenEqual = _mm_and_si128(dcIsDa, whenEqual);

    // Case 2: not case 1, and sc == 0
    const __m128i scIsZero = _mm_cmpeq_epi32(sc, _mm_setzero_si128());
    __m128i whenScZero = SkAlphaMulAlpha_SSE2(dc, invSa);
    const __m128i onlyScZero = _mm_andnot_si128(dcIsDa, scIsZero);
    whenScZero = _mm_and_si128(onlyScZero, whenScZero);

    // Case 3: everything else
    const __m128i alreadyHandled = _mm_or_si128(dcIsDa, scIsZero);
    __m128i quotient = _mm_sub_epi32(da, dc);
    quotient = Multiply32_SSE2(quotient, sa);
    quotient = shim_mm_div_epi32(quotient, sc);

    __m128i burned = _mm_sub_epi32(da, SkMin32_SSE2(da, quotient));
    burned = Multiply32_SSE2(sa, burned);
    __m128i general = _mm_add_epi32(_mm_add_epi32(burned, scInvDa), dcInvSa);
    general = clamp_div255round_SSE2(general);
    general = _mm_andnot_si128(alreadyHandled, general);

    // Exactly one candidate is non-masked per lane, so OR merges them.
    return _mm_or_si128(_mm_or_si128(whenEqual, whenScZero), general);
}
コード例 #18
0
ファイル: lossless_enc_sse2.c プロジェクト: 03050903/godot
// Computes an entropy figure from the two 256-entry histograms X and Y,
// processing four entries per step with SSE2.
// NOTE(review): ANALYZE_XY and ANALYZE_X_OR_Y are macros defined outside this
// block; they presumably update 'retval' using the locals 'tmp', 'X', 'Y' and
// 'i' by name -- confirm before renaming any local here.
static float CombinedShannonEntropy(const int X[256], const int Y[256]) {
  int i;
  double retval = 0.;
  int sumX, sumXY;
  int32_t tmp[4];
  __m128i zero = _mm_setzero_si128();
  // Sums up X + Y, 4 ints at a time (and will merge it at the end for sumXY).
  __m128i sumXY_128 = zero;
  __m128i sumX_128 = zero;

  for (i = 0; i < 256; i += 4) {
    const __m128i x = _mm_loadu_si128((const __m128i*)(X + i));
    const __m128i y = _mm_loadu_si128((const __m128i*)(Y + i));

    // Check if any X is non-zero: this actually provides a speedup as X is
    // usually sparse.
    if (_mm_movemask_epi8(_mm_cmpeq_epi32(x, zero)) != 0xFFFF) {
      // At least one of the four X entries is non-zero.
      const __m128i xy_128 = _mm_add_epi32(x, y);
      sumXY_128 = _mm_add_epi32(sumXY_128, xy_128);

      sumX_128 = _mm_add_epi32(sumX_128, x);

      // Analyze the different X + Y.
      // The four sums are spilled to 'tmp' before the per-lane macro calls.
      _mm_storeu_si128((__m128i*)tmp, xy_128);

      ANALYZE_XY(0);
      ANALYZE_XY(1);
      ANALYZE_XY(2);
      ANALYZE_XY(3);
    } else {
      // X is fully 0, so only deal with Y.
      sumXY_128 = _mm_add_epi32(sumXY_128, y);

      ANALYZE_X_OR_Y(Y, 0);
      ANALYZE_X_OR_Y(Y, 1);
      ANALYZE_X_OR_Y(Y, 2);
      ANALYZE_X_OR_Y(Y, 3);
    }
  }

  // Sum up sumX_128 to get sumX.
  _mm_storeu_si128((__m128i*)tmp, sumX_128);
  sumX = tmp[3] + tmp[2] + tmp[1] + tmp[0];

  // Sum up sumXY_128 to get sumXY.
  _mm_storeu_si128((__m128i*)tmp, sumXY_128);
  sumXY = tmp[3] + tmp[2] + tmp[1] + tmp[0];

  // Add the terms for the two totals and narrow the double accumulator to float.
  retval += VP8LFastSLog2(sumX) + VP8LFastSLog2(sumXY);
  return (float)retval;
}
コード例 #19
0
ファイル: SkXfermode_opts_SSE2.cpp プロジェクト: sheuan/skia
// Color-dodge blend of one colour channel for four pixels (one value per 32-bit lane).
// Three candidate results are computed for every lane and the right one is selected
// with comparison masks instead of branches.
static inline __m128i colordodge_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                           const __m128i& sa, const __m128i& da) {
    __m128i saMinusSc = _mm_sub_epi32(sa, sc);
    const __m128i invDa = _mm_sub_epi32(_mm_set1_epi32(255), da);
    const __m128i invSa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // Case 1: dc == 0
    const __m128i dcIsZero = _mm_cmpeq_epi32(dc, _mm_setzero_si128());
    const __m128i whenDcZero = _mm_and_si128(dcIsZero, SkAlphaMulAlpha_SSE2(sc, invDa));

    // Case 2: not case 1, and sa - sc == 0
    const __m128i diffIsZero = _mm_cmpeq_epi32(saMinusSc, _mm_setzero_si128());
    const __m128i onlyDiffZero = _mm_andnot_si128(dcIsZero, diffIsZero);
    const __m128i saDa = _mm_mullo_epi16(sa, da);
    const __m128i scInvDa = _mm_mullo_epi16(sc, invDa);
    const __m128i dcInvSa = _mm_mullo_epi16(dc, invSa);
    __m128i whenDiffZero = _mm_add_epi32(_mm_add_epi32(saDa, scInvDa), dcInvSa);
    whenDiffZero = clamp_div255round_SSE2(whenDiffZero);
    whenDiffZero = _mm_and_si128(onlyDiffZero, whenDiffZero);

    // Case 3: everything else
    const __m128i alreadyHandled = _mm_or_si128(dcIsZero, diffIsZero);
    const __m128i dcSa = _mm_mullo_epi16(dc, sa);
    saMinusSc = shim_mm_div_epi32(dcSa, saMinusSc);

    __m128i dodged = SkMin32_SSE2(da, saMinusSc);
    dodged = Multiply32_SSE2(sa, dodged);
    __m128i general = _mm_add_epi32(_mm_add_epi32(dodged, scInvDa), dcInvSa);
    general = clamp_div255round_SSE2(general);
    general = _mm_andnot_si128(alreadyHandled, general);

    // Exactly one candidate is non-masked per lane, so OR merges them.
    return _mm_or_si128(_mm_or_si128(whenDcZero, whenDiffZero), general);
}
コード例 #20
0
ファイル: rawSHA256_ng_fmt_plug.c プロジェクト: mimaun/Rose
/* Returns 1 when the first 32-bit word of `binary` equals one of the
 * candidate words in crypt_key[0], 0 otherwise. */
static int cmp_all (void *binary, int count)
{
#ifdef _OPENMP
    /* Scalar path: check the first `count` candidates one by one. */
    int idx = 0;

    while (idx < count) {
        if (crypt_key[0][idx] == ((uint32_t *) binary)[0])
            return 1;
        idx++;
    }
    return 0;
#else
    /* SSE2 path: `count` is not used, exactly four candidates are checked
     * with one aligned load and one comparison. */
    const __m128i candidates = _mm_load_si128 ((__m128i *) crypt_key[0]);
    const __m128i wanted     = _mm_set1_epi32 (((uint32_t *) binary)[0]);

    /* any mask bit set means at least one lane matched */
    return _mm_movemask_epi8 (_mm_cmpeq_epi32 (wanted, candidates)) != 0;
#endif
}
コード例 #21
0
ファイル: f16c_sse2.cpp プロジェクト: dubhater/zimg
// Converts the four half-precision values in the low 64 bits of x to single precision
// using SSE2 integer arithmetic only.
inline FORCE_INLINE __m128 mm_cvtph_ps(__m128i x)
{
	const __m128 denorm_magic = _mm_castsi128_ps(_mm_set1_epi32((uint32_t)113 << 23));
	const __m128i exp_field = _mm_set1_epi32(0x7C00UL << 13);
	const __m128i half_sign = _mm_set1_epi32(0x8000U);
	const __m128i half_abs = _mm_set1_epi32(0x7FFF);
	const __m128i bias_normal = _mm_set1_epi32((127UL - 15UL) << 23);
	const __m128i bias_nan = _mm_set1_epi32((127UL - 16UL) << 23);
	const __m128i bias_denorm = _mm_set1_epi32(1UL << 23);
	const __m128i zero = _mm_set1_epi16(0);

	// Widen each 16-bit half to its own 32-bit lane.
	x = _mm_unpacklo_epi16(x, zero);

	// Sign goes to bit 31.
	const __m128i sign = _mm_slli_epi32(_mm_and_si128(x, half_sign), 16);

	// Move exponent+mantissa into float position, remember the raw exponent bits,
	// then rebias the exponent for the normal case.
	__m128i out = _mm_slli_epi32(_mm_and_si128(x, half_abs), 13);
	const __m128i exp = _mm_and_si128(exp_field, out);
	out = _mm_add_epi32(out, bias_normal);

	// All-ones exponent (Inf/NaN) and zero exponent (zero/denormal) need other handling.
	const __m128i is_inf_nan = _mm_cmpeq_epi32(exp, exp_field);
	const __m128i is_denorm = _mm_cmpeq_epi32(exp, zero);

	const __m128i out_nan = _mm_add_epi32(out, bias_nan);
	__m128i out_denorm = _mm_add_epi32(out, bias_denorm);
	out_denorm = _mm_castps_si128(_mm_sub_ps(_mm_castsi128_ps(out_denorm), denorm_magic));

	// mm_blendv_ps is declared elsewhere; presumably it picks the first argument in
	// the lanes where the mask is set -- verify against its definition.
	out = mm_blendv_ps(out_nan, out, is_inf_nan);
	out = mm_blendv_ps(out_denorm, out, is_denorm);

	out = _mm_or_si128(out, sign);
	return _mm_castsi128_ps(out);
}
コード例 #22
0
    SIMDValue SIMDUint32x4Operation::OpLessThanOrEqual(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        // Lane-wise unsigned a <= b on four 32-bit integers; each result lane is
        // all ones when the relation holds and all zeros otherwise.
        X86SIMDValue lessOrEqual;
        X86SIMDValue lhs = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue rhs = X86SIMDValue::ToX86SIMDValue(bValue);
        X86SIMDValue topBit;
        topBit.m128i_value = _mm_set1_epi32(0x80000000);

        // SSE2 only has signed compares. Flipping the top bit of both operands maps
        // unsigned order onto signed order.
        lhs.m128i_value = _mm_xor_si128(lhs.m128i_value, topBit.m128i_value);
        rhs.m128i_value = _mm_xor_si128(rhs.m128i_value, topBit.m128i_value);

        // (a < b) | (a == b); lhs is reused to hold the equality mask.
        lessOrEqual.m128i_value = _mm_cmplt_epi32(lhs.m128i_value, rhs.m128i_value);
        lhs.m128i_value = _mm_cmpeq_epi32(lhs.m128i_value, rhs.m128i_value);
        lessOrEqual.m128i_value = _mm_or_si128(lessOrEqual.m128i_value, lhs.m128i_value);

        return X86SIMDValue::ToSIMDValue(lessOrEqual);
    }
コード例 #23
0
 // Runs `h` rounds of hashing on kmer and, for each round, ANDs the selected
 // bit-array block with the selected mask. Returns false as soon as that AND
 // is non-zero in any 32-bit lane, true if it is zero for every round.
 // NOTE(review): a membership test would normally fail when the probed bit is
 // *clear*; whether returning false on a non-zero AND is intended depends on
 // how bitarray/masks are encoded -- verify against the insert path.
 bool test(const index_t & kmer) const
 {
     // Integer vector, matching the parameter type of _mm_cmpeq_epi32 (the
     // previous __m128 local only compiled with lax vector conversions).
     const __m128i zero = _mm_setzero_si128();
     const size_t BitsPerElement = sizeof(block_t) * 8;
     const Hash hashfunction = Hash();
     kmer_t hashvalue = kmer;
     for (int hcount = this->h; hcount > 0; hcount--)
     {
         // Each round hashes the previous round's value again.
         hashvalue = hashfunction(hashvalue);
         // we expect the compiler to automatically turn this into a shift because it's a const power of two
         size_t offset = (hashvalue % this->m) / BitsPerElement;
         // Reinterpret the float-typed AND result as integers explicitly; the
         // cast is free and keeps the bit pattern unchanged.
         const __m128i probed = _mm_castps_si128(
             _mm_and_ps(bitarray[offset], masks[hashvalue & (BitsPerElement-1)]));
         if (_mm_movemask_epi8(_mm_cmpeq_epi32(probed, zero)) != 0xFFFF) return false;
     }
     return true;
 }
コード例 #24
0
ファイル: render.cpp プロジェクト: DominusExult/dosbox-x
// Line handler used at the start of a frame: compares the incoming source line
// against the cached copy. While lines are unchanged it only advances the cache
// and scaler bookkeeping; on the first difference it starts a GFX update and
// hands this line (and, via RENDER_DrawLine, later ones) to the real handler.
// A null `s` skips the comparison and takes the unchanged-line path.
static void RENDER_StartLineHandler(const void * s) {
	if (s) {
		const Bitu *src = (const Bitu*)s;
		Bitu *cache = (Bitu*)(render.scale.cacheRead);
		Bits count = render.src.start;
#if defined(__SSE__)
		if(sse2_available) {
#if defined (_MSC_VER)
			// Kept only because code after this function may rely on the macro.
#define SIZEOF_INT_P 4
#endif
			// Number of Bitu elements in one 16-byte vector. Derived from the real
			// element size: the old 16/SIZEOF_INT_P form used a hard-coded 4 on
			// MSVC, which would skip half of every stride if Bitu is 8 bytes.
			static const Bitu simd_inc = 16/sizeof(Bitu);
			while (count >= (Bits)simd_inc) {
				__m128i v = _mm_loadu_si128((const __m128i*)src);
				__m128i c = _mm_loadu_si128((const __m128i*)cache);
				__m128i cmp = _mm_cmpeq_epi32(v, c);
				// Any unequal 32-bit lane clears bits in the byte mask.
				if (GCC_UNLIKELY(_mm_movemask_epi8(cmp) != 0xFFFF))
					goto cacheMiss;
				count-=simd_inc; src+=simd_inc; cache+=simd_inc;
			}
		}
#endif
		// Scalar comparison for whatever the vector loop did not cover.
		while (count) {
			if (GCC_UNLIKELY(src[0] != cache[0]))
				goto cacheMiss;
			count--; src++; cache++;
		}
	}
/* cacheHit */
	render.scale.cacheRead += render.scale.cachePitch;
	Scaler_ChangedLines[0] += Scaler_Aspect[ render.scale.inLine ];
	render.scale.inLine++;
	render.scale.outLine++;
	return;
cacheMiss:
	// If the output surface cannot be started, drop the remaining lines.
	if (!GFX_StartUpdate( render.scale.outWrite, render.scale.outPitch )) {
		RENDER_DrawLine = RENDER_EmptyLineHandler;
		return;
	}
	// Skip the output rows belonging to the unchanged lines seen so far.
	render.scale.outWrite += render.scale.outPitch * Scaler_ChangedLines[0];
	RENDER_DrawLine = render.scale.lineHandler;
	RENDER_DrawLine( s );
}
コード例 #25
0
// For every sample: if it is flagged in inputMask or not finite, write 0 to both the
// output image and the weights; otherwise copy the sample and write weight 1.
// Rows are processed four samples at a time with aligned loads/stores, so the row
// storage is assumed to be 16-byte aligned and padded to a multiple of four -- TODO confirm.
void HighPassFilter::setFlaggedValuesToZeroAndMakeWeightsSSE(const Image2DCPtr &inputImage, const Image2DPtr &outputImage, const Mask2DCPtr &inputMask, const Image2DPtr &weightsOutput)
{
	const size_t width = inputImage->Width();
	const __m128i zero4i = _mm_setzero_si128();
	const __m128 one4 = _mm_set1_ps(1.0f);
	for(size_t y=0;y<inputImage->Height();++y)
	{
		const bool *flags = inputMask->ValuePtr(0, y);
		const float *in = inputImage->ValuePtr(0, y);
		float *out = outputImage->ValuePtr(0, y);
		float *weights = weightsOutput->ValuePtr(0, y);
		const float *rowEnd = in + width;

		for(; in < rowEnd; flags += 4, out += 4, in += 4, weights += 4)
		{
			// 1 for samples to discard (flagged or non-finite), 0 for samples to keep.
			const int drop0 = flags[0] || !isfinite(in[0]);
			const int drop1 = flags[1] || !isfinite(in[1]);
			const int drop2 = flags[2] || !isfinite(in[2]);
			const int drop3 = flags[3] || !isfinite(in[3]);

			// Comparing with zero inverts that: 0xFFFFFFFF for kept lanes, 0 for dropped.
			const __m128 keep = _mm_castsi128_ps(
				_mm_cmpeq_epi32(_mm_set_epi32(drop3, drop2, drop1, drop0), zero4i));

			// AND with the keep mask selects the value for kept lanes and yields 0.0f
			// (all bits clear) for dropped ones.
			_mm_store_ps(weights, _mm_and_ps(keep, one4));
			_mm_store_ps(out, _mm_and_ps(keep, _mm_load_ps(in)));
		}
	}
}
コード例 #26
0
ファイル: rewind.c プロジェクト: PCGeekBrain/RetroArch
/* Returns the index, in uint16_t units, of the first element at which the
 * arrays a and b differ.
 *
 * The arrays are scanned 16 bytes at a time with unaligned loads. The loop has
 * no length limit: it only ends when a difference is found, so the caller must
 * guarantee that one exists (and that both buffers are readable up to the end
 * of the 16-byte chunk containing it).
 *
 * NOTE(review): this excerpt stops before the function's closing brace. */
static INLINE size_t find_change(const uint16_t *a, const uint16_t *b)
{
   const __m128i *a128 = (const __m128i*)a;
   const __m128i *b128 = (const __m128i*)b;
	
   for (;;)
   {
      __m128i v0    = _mm_loadu_si128(a128);
      __m128i v1    = _mm_loadu_si128(b128);
      /* Equality is tested per 32-bit lane, i.e. per pair of uint16_t. */
      __m128i c     = _mm_cmpeq_epi32(v0, v1);
      /* One bit per byte; 0xffff means all 16 bytes compared equal. */
      uint32_t mask = _mm_movemask_epi8(c);

      if (mask != 0xffff) /* Something has changed, figure out where. */
      {
         /* Byte offset of this chunk combined with the bit position of the
          * first differing byte (compat_ctz presumably counts trailing zero
          * bits - confirm), halved to turn bytes into a uint16_t index. Lanes
          * are 4 bytes wide, so this is the first element of the lane. */
         size_t ret = (((uint8_t*)a128 - (uint8_t*)a) |
               (compat_ctz(~mask))) >> 1;
         /* If that first element is equal, the difference is in the second
          * element of the lane, so set the low bit of the index. */
			return ret | (a[ret] == b[ret]);
      }

      a128++;
      b128++;
   }
コード例 #27
0
ファイル: main.cpp プロジェクト: mvm/nyannyan
// Builds a key-colour mask for `image`.
//
// The first 32-bit word of the image data is taken as the key value. In the
// returned image every 32-bit word equal to the key is 0x00000000 and every
// other word is 0xFFFFFFFF.
// NOTE(review): this treats the data as 4-byte pixels - confirm the callers
// only pass such images.
//
// The result is a fresh clone of `image` (same header/geometry) created with
// cvCloneImage. Any trailing bytes that do not form a whole 32-bit word keep
// the cloned input data. If the image holds fewer than 4 bytes, the plain
// clone is returned.
//
// The vector loop only touches whole 16-byte chunks; the remainder is
// processed word by word so that neither buffer is accessed past imageSize.
IplImage* calcMask(IplImage* image)
{
    IplImage* mask = cvCloneImage(image);

    const int wordBytes = 4;
    const int chunkBytes = 16;
    const int totalBytes = image->imageSize;

    // Without one complete word there is no key colour to read.
    if (totalBytes < wordBytes)
    {
        return mask;
    }

    const unsigned int key = *((unsigned int *)image->imageData);

    um128i croma;
    um128i unos;
    unos.mm = _mm_set1_epi8(0xff);
    croma.mm = _mm_set1_epi32((int)key);

    um128i *pMask = (um128i*)mask->imageData;
    um128i *pImg = (um128i*)image->imageData;

    int i = 0;

    // Whole 16-byte chunks: compare four words at once, then invert so that
    // matching words end up as zero.
    for(; totalBytes - i >= chunkBytes; i += chunkBytes)
    {
        um128i px = *pImg++;
        px.mm = _mm_cmpeq_epi32(px.mm, croma.mm);
        px.mm = _mm_xor_si128(px.mm, unos.mm);

        *pMask++ = px;
    }

    // Remaining whole words (at most three) that do not fill a chunk.
    const unsigned int *tailImg = (const unsigned int *)pImg;
    unsigned int *tailMask = (unsigned int *)pMask;
    for(; totalBytes - i >= wordBytes; i += wordBytes)
    {
        *tailMask++ = (*tailImg++ == key) ? 0u : 0xffffffffu;
    }

    return mask;
}
コード例 #28
0
ファイル: f16c_sse2.cpp プロジェクト: dubhater/zimg
// Converts the four floats in x to 16-bit values stored in the result vector.
// Judging by the name and constants these are half-precision bit patterns -
// NOTE(review): confirm against the helpers' definitions.
//
// Works on the raw bit patterns: the sign is split off, the magnitude is
// truncated, rescaled and clamped, magnitudes at or above the float infinity
// pattern are replaced by fixed codes, and the sign is put back at bit 15.
// mm_min_epi32, mm_blendv_ps and mm_packus_epi32 are helpers defined elsewhere
// in the file.
inline FORCE_INLINE __m128i mm_cvtps_ph(__m128 x)
{
	// Bit-pattern constants.
	__m128i sign_bit = _mm_set1_epi32(0x80000000UL);
	__m128i f32_inf_bits = _mm_set1_epi32((uint32_t)255UL << 23);
	__m128i f16_inf_bits = _mm_set1_epi32((uint32_t)31UL << 23);
	__m128i keep_bits = _mm_set1_epi32(~0x0FFFU);
	__m128 rescale = _mm_castsi128_ps(_mm_set1_epi32((uint32_t)15 << 23));

	// Fixed results for inputs above / exactly at the float infinity pattern.
	__m128i code_0x7E00 = _mm_set1_epi32(0x7E00);
	__m128i code_0x7C00 = _mm_set1_epi32(0x7C00);

	// Separate the sign bit from the magnitude.
	__m128i bits = _mm_castps_si128(x);
	__m128i sign = _mm_and_si128(bits, sign_bit);
	__m128i magnitude = _mm_xor_si128(bits, sign);

	// Classify before the magnitude is modified.
	__m128i gt_inf = _mm_cmpgt_epi32(magnitude, f32_inf_bits);
	__m128i eq_inf = _mm_cmpeq_epi32(magnitude, f32_inf_bits);

	// Drop the low 12 bits, rescale via a float multiply, then adjust.
	__m128i half = _mm_and_si128(magnitude, keep_bits);
	half = _mm_castps_si128(_mm_mul_ps(_mm_castsi128_ps(half), rescale));
	half = _mm_sub_epi32(half, keep_bits);

	// Clamp, then move the result down into the low 16 bits of each lane.
	half = mm_min_epi32(half, f16_inf_bits);
	half = _mm_srli_epi32(half, 13);

	// Substitute the fixed codes for the classified lanes.
	half = mm_blendv_ps(code_0x7E00, half, gt_inf);
	half = mm_blendv_ps(code_0x7C00, half, eq_inf);

	// Sign moves from bit 31 to bit 15.
	half = _mm_or_si128(half, _mm_srli_epi32(sign, 16));

	return mm_packus_epi32(half, _mm_setzero_si128());
}
コード例 #29
0
/*****************************************************************************
 * This function utilises 3 properties of the cost function lookup tables,   *
 * constructed in using 'cal_nmvjointsadcost' and 'cal_nmvsadcosts' in       *
 * vp9_encoder.c.                                                            *
 * For the joint cost:                                                       *
 *   - mvjointsadcost[1] == mvjointsadcost[2] == mvjointsadcost[3]           *
 * For the component costs:                                                  *
 *   - For all i: mvsadcost[0][i] == mvsadcost[1][i]                         *
 *         (Equal costs for both components)                                 *
 *   - For all i: mvsadcost[0][i] == mvsadcost[0][-i]                        *
 *         (Cost function is even)                                           *
 * If these do not hold, then this function cannot be used without           *
 * modification, in which case you can revert to using the C implementation, *
 * which does not rely on these properties.                                  *
 *****************************************************************************/
// Step-wise motion search that evaluates four candidate positions per inner
// iteration with SIMD.
//
// Parameters:
//   x            - supplies the motion vector limits (mv_row/col_min/max), the
//                  cost tables (nmvjointsadcost, nmvsadcost) and the source and
//                  reference plane-0 buffers with their strides.
//   cfg          - search site tables: ss_mv (vector offsets), ss_os (buffer
//                  offsets), searches_per_step and total_steps.
//   ref_mv       - starting vector; it is read and clamped to the limits, but
//                  not written here.
//   best_mv      - output: the best vector found.
//   search_param - index of the first step to run; steps before it are
//                  skipped, so total_steps - search_param steps are executed.
//   sad_per_bit  - multiplier applied to the vector cost before it is rounded
//                  (shift by 8) and added to the SAD.
//   num00        - output: set to 0, then incremented once for every step
//                  after which the best address still equals the start address.
//   fn_ptr       - provides sdf (single SAD) and sdx4df (four SADs at once).
//   center_mv    - vector whose components, shifted right by 3, are the centre
//                  against which the vector cost is measured.
//
// Returns the lowest SAD-plus-cost value found.
//
// NOTE(review): each inner iteration loads four entries of ss_mv / ss_os, so
// searches_per_step is presumably a multiple of 4 - confirm.
int vp9_diamond_search_sad_avx(const MACROBLOCK *x,
                               const search_site_config *cfg,
                               MV *ref_mv, MV *best_mv, int search_param,
                               int sad_per_bit, int *num00,
                               const vp9_variance_fn_ptr_t *fn_ptr,
                               const MV *center_mv) {
  // Each int_mv packs (row, col) as two 16-bit halves of one 32-bit value, so
  // broadcasting as_int yields four copies of the vector for 16-bit lane ops.
  const int_mv maxmv = pack_int_mv(x->mv_row_max, x->mv_col_max);
  const __m128i v_max_mv_w = _mm_set1_epi32(maxmv.as_int);
  const int_mv minmv = pack_int_mv(x->mv_row_min, x->mv_col_min);
  const __m128i v_min_mv_w = _mm_set1_epi32(minmv.as_int);

  const __m128i v_spb_d = _mm_set1_epi32(sad_per_bit);

  // Only joint costs 0 and 1 are needed; entries 2 and 3 are asserted below
  // to equal entry 1.
  const __m128i v_joint_cost_0_d = _mm_set1_epi32(x->nmvjointsadcost[0]);
  const __m128i v_joint_cost_1_d = _mm_set1_epi32(x->nmvjointsadcost[1]);

  // search_param determines the length of the initial step and hence the number
  // of iterations.
  // 0 = initial step (MAX_FIRST_STEP) pel
  // 1 = (MAX_FIRST_STEP/2) pel,
  // 2 = (MAX_FIRST_STEP/4) pel...
  const       MV *ss_mv = &cfg->ss_mv[cfg->searches_per_step * search_param];
  const intptr_t *ss_os = &cfg->ss_os[cfg->searches_per_step * search_param];
  const int tot_steps = cfg->total_steps - search_param;

  const int_mv fcenter_mv = pack_int_mv(center_mv->row >> 3,
                                        center_mv->col >> 3);
  const __m128i vfcmv = _mm_set1_epi32(fcenter_mv.as_int);

  // The search starts from ref_mv clamped into the allowed range.
  const int ref_row = clamp(ref_mv->row, minmv.as_mv.row, maxmv.as_mv.row);
  const int ref_col = clamp(ref_mv->col, minmv.as_mv.col, maxmv.as_mv.col);

  // bmv is the best vector of the completed steps; new_bmv collects the best
  // candidate seen during the step in progress.
  int_mv bmv = pack_int_mv(ref_row, ref_col);
  int_mv new_bmv = bmv;
  __m128i v_bmv_w = _mm_set1_epi32(bmv.as_int);

  const int what_stride = x->plane[0].src.stride;
  const int in_what_stride = x->e_mbd.plane[0].pre[0].stride;
  const uint8_t *const what = x->plane[0].src.buf;
  const uint8_t *const in_what = x->e_mbd.plane[0].pre[0].buf +
                                 ref_row * in_what_stride + ref_col;

  // Work out the start point for the search
  const uint8_t *best_address = in_what;
  const uint8_t *new_best_address = best_address;
  // The best address is broadcast so candidate addresses can be formed with
  // one vector add; lane width follows the pointer size.
#if ARCH_X86_64
  __m128i v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
  __m128i v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

  unsigned int best_sad;

  // i indexes the search site tables and keeps running across steps; j counts
  // the sites within the current step.
  int i;
  int j;
  int step;

  // Check the prerequisite cost function properties that are easy to check
  // in an assert. See the function-level documentation for details on all
  // prerequisites.
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[2]);
  assert(x->nmvjointsadcost[1] == x->nmvjointsadcost[3]);

  // Check the starting position
  best_sad = fn_ptr->sdf(what, what_stride, in_what, in_what_stride);
  best_sad += mvsad_err_cost(x, bmv, &fcenter_mv.as_mv, sad_per_bit);

  *num00 = 0;

  for (i = 0, step = 0; step < tot_steps; step++) {
    for (j = 0; j < cfg->searches_per_step; j += 4, i += 4) {
      __m128i v_sad_d;
      __m128i v_cost_d;
      __m128i v_outside_d;
      __m128i v_inside_d;
      __m128i v_diff_mv_w;
      // Storage for four candidate pointers: two vectors of two 64-bit
      // pointers, or one vector of four 32-bit pointers.
#if ARCH_X86_64
      __m128i v_blocka[2];
#else
      __m128i v_blocka[1];
#endif

      // Compute the candidate motion vectors
      const __m128i v_ss_mv_w = _mm_loadu_si128((const __m128i*)&ss_mv[i]);
      const __m128i v_these_mv_w = _mm_add_epi16(v_bmv_w, v_ss_mv_w);
      // Clamp them to the search bounds
      __m128i v_these_mv_clamp_w = v_these_mv_w;
      v_these_mv_clamp_w = _mm_min_epi16(v_these_mv_clamp_w, v_max_mv_w);
      v_these_mv_clamp_w = _mm_max_epi16(v_these_mv_clamp_w, v_min_mv_w);
      // The ones that did not change are inside the search area
      // (32-bit compare: both row and col must be unchanged).
      v_inside_d = _mm_cmpeq_epi32(v_these_mv_clamp_w, v_these_mv_w);

      // If none of them are inside, then move on
      if (__likely__(_mm_test_all_zeros(v_inside_d, v_inside_d))) {
        continue;
      }

      // The inverse mask indicates which of the MVs are outside
      v_outside_d = _mm_xor_si128(v_inside_d, _mm_set1_epi8(0xff));
      // Shift right to keep the sign bit clear, we will use this later
      // to set the cost to the maximum value.
      v_outside_d = _mm_srli_epi32(v_outside_d, 1);

      // Compute the difference MV
      v_diff_mv_w = _mm_sub_epi16(v_these_mv_clamp_w, vfcmv);
      // We utilise the fact that the cost function is even, and use the
      // absolute difference. This allows us to use unsigned indexes later
      // and reduces cache pressure somewhat as only a half of the table
      // is ever referenced.
      v_diff_mv_w = _mm_abs_epi16(v_diff_mv_w);

      // Compute the SIMD pointer offsets.
      {
#if ARCH_X86_64  //  sizeof(intptr_t) == 8
        // Load the offsets
        __m128i v_bo10_q = _mm_loadu_si128((const __m128i*)&ss_os[i+0]);
        __m128i v_bo32_q = _mm_loadu_si128((const __m128i*)&ss_os[i+2]);
        // Set the ones falling outside to zero
        // (the 32-bit inside masks are widened to 64 bits: sign extension for
        // candidates 0 and 1, lane duplication for candidates 2 and 3).
        v_bo10_q = _mm_and_si128(v_bo10_q,
                                 _mm_cvtepi32_epi64(v_inside_d));
        v_bo32_q = _mm_and_si128(v_bo32_q,
                                 _mm_unpackhi_epi32(v_inside_d, v_inside_d));
        // Compute the candidate addresses
        v_blocka[0] = _mm_add_epi64(v_ba_q, v_bo10_q);
        v_blocka[1] = _mm_add_epi64(v_ba_q, v_bo32_q);
#else  // ARCH_X86 //  sizeof(intptr_t) == 4
        __m128i v_bo_d = _mm_loadu_si128((const __m128i*)&ss_os[i]);
        v_bo_d = _mm_and_si128(v_bo_d, v_inside_d);
        v_blocka[0] = _mm_add_epi32(v_ba_d, v_bo_d);
#endif
      }

      // Four SADs in one call. Candidates outside the search area had their
      // offset zeroed above, so they point at the current best address.
      fn_ptr->sdx4df(what, what_stride,
                     (const uint8_t **)&v_blocka[0], in_what_stride,
                     (uint32_t*)&v_sad_d);

      // Look up the component cost of the residual motion vector
      {
        const int32_t row0 = _mm_extract_epi16(v_diff_mv_w, 0);
        const int32_t col0 = _mm_extract_epi16(v_diff_mv_w, 1);
        const int32_t row1 = _mm_extract_epi16(v_diff_mv_w, 2);
        const int32_t col1 = _mm_extract_epi16(v_diff_mv_w, 3);
        const int32_t row2 = _mm_extract_epi16(v_diff_mv_w, 4);
        const int32_t col2 = _mm_extract_epi16(v_diff_mv_w, 5);
        const int32_t row3 = _mm_extract_epi16(v_diff_mv_w, 6);
        const int32_t col3 = _mm_extract_epi16(v_diff_mv_w, 7);

        // Note: This is a use case for vpgather in AVX2
        // Table 0 is used for both components (they are required to be equal).
        const uint32_t cost0 = x->nmvsadcost[0][row0] + x->nmvsadcost[0][col0];
        const uint32_t cost1 = x->nmvsadcost[0][row1] + x->nmvsadcost[0][col1];
        const uint32_t cost2 = x->nmvsadcost[0][row2] + x->nmvsadcost[0][col2];
        const uint32_t cost3 = x->nmvsadcost[0][row3] + x->nmvsadcost[0][col3];

        __m128i v_cost_10_d, v_cost_32_d;

        v_cost_10_d = _mm_cvtsi32_si128(cost0);
        v_cost_10_d = _mm_insert_epi32(v_cost_10_d, cost1, 1);

        v_cost_32_d = _mm_cvtsi32_si128(cost2);
        v_cost_32_d = _mm_insert_epi32(v_cost_32_d, cost3, 1);

        v_cost_d = _mm_unpacklo_epi64(v_cost_10_d, v_cost_32_d);
      }

      // Now add in the joint cost
      {
        // A 32-bit lane of the difference is zero only when both its row and
        // col parts are zero; those lanes take joint cost 0, the rest cost 1.
        const __m128i v_sel_d = _mm_cmpeq_epi32(v_diff_mv_w,
                                                _mm_setzero_si128());
        const __m128i v_joint_cost_d = _mm_blendv_epi8(v_joint_cost_1_d,
                                                       v_joint_cost_0_d,
                                                       v_sel_d);
        v_cost_d = _mm_add_epi32(v_cost_d, v_joint_cost_d);
      }

      // Multiply by sad_per_bit
      v_cost_d = _mm_mullo_epi32(v_cost_d, v_spb_d);
      // ROUND_POWER_OF_TWO(v_cost_d, 8)
      v_cost_d = _mm_add_epi32(v_cost_d, _mm_set1_epi32(0x80));
      v_cost_d = _mm_srai_epi32(v_cost_d, 8);
      // Add the cost to the sad
      v_sad_d = _mm_add_epi32(v_sad_d, v_cost_d);

      // Make the motion vectors outside the search area have max cost
      // by or'ing in the comparison mask, this way the minimum search won't
      // pick them.
      v_sad_d = _mm_or_si128(v_sad_d, v_outside_d);

      // Find the minimum value and index horizontally in v_sad_d
      {
        // Try speculatively on 16 bits, so we can use the minpos intrinsic
        // (values above 0xffff saturate to 0xffff when packed).
        const __m128i v_sad_w = _mm_packus_epi32(v_sad_d, v_sad_d);
        const __m128i v_minp_w = _mm_minpos_epu16(v_sad_w);

        // minpos returns the minimum in word 0 and its index in word 1.
        uint32_t local_best_sad = _mm_extract_epi16(v_minp_w, 0);
        uint32_t local_best_idx = _mm_extract_epi16(v_minp_w, 1);

        // If the local best value is not saturated, just use it, otherwise
        // find the horizontal minimum again the hard way on 32 bits.
        // This is executed rarely.
        if (__unlikely__(local_best_sad == 0xffff)) {
          __m128i v_loval_d, v_hival_d, v_loidx_d, v_hiidx_d, v_sel_d;

          // Two-round reduction: first lanes {2,3} against {0,1}, then
          // lane 1 against lane 0; the indices follow the values.
          v_loval_d = v_sad_d;
          v_loidx_d = _mm_set_epi32(3, 2, 1, 0);
          v_hival_d = _mm_srli_si128(v_loval_d, 8);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 8);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);
          v_hival_d = _mm_srli_si128(v_loval_d, 4);
          v_hiidx_d = _mm_srli_si128(v_loidx_d, 4);

          v_sel_d = _mm_cmplt_epi32(v_hival_d, v_loval_d);

          v_loval_d = _mm_blendv_epi8(v_loval_d, v_hival_d, v_sel_d);
          v_loidx_d = _mm_blendv_epi8(v_loidx_d, v_hiidx_d, v_sel_d);

          local_best_sad = _mm_extract_epi32(v_loval_d, 0);
          local_best_idx = _mm_extract_epi32(v_loidx_d, 0);
        }

        // Update the global minimum if the local minimum is smaller
        if (__likely__(local_best_sad < best_sad)) {
          // Read the winning vector and address back out of the SIMD
          // registers by lane index.
          new_bmv = ((const int_mv *)&v_these_mv_w)[local_best_idx];
          new_best_address = ((const uint8_t **)v_blocka)[local_best_idx];

          best_sad = local_best_sad;
        }
      }
    }

    // The step is complete: its best candidate becomes the centre for the
    // next step.
    bmv = new_bmv;
    best_address = new_best_address;

    v_bmv_w = _mm_set1_epi32(bmv.as_int);
#if ARCH_X86_64
    v_ba_q = _mm_set1_epi64x((intptr_t)best_address);
#else
    v_ba_d = _mm_set1_epi32((intptr_t)best_address);
#endif

    // Count the steps that leave the best position at the starting point.
    if (__unlikely__(best_address == in_what)) {
      (*num00)++;
    }
  }

  *best_mv = bmv.as_mv;
  return best_sad;
}
コード例 #30
0
ファイル: memcmp2.c プロジェクト: UTSASRG/DifferentTries
// @return true iff the two pages differ; false otherwise.
// @note Both pointers must be 16-byte aligned (aligned SSE loads are used)
//       and each must address a full 4096-byte page.
// @note Only SSE2 intrinsics are used; compile with -msse2 on targets where
//       SSE2 is not enabled by default.
bool pagesDifferent (const void * b1, const void * b2) {

  // PAGE_BYTES rather than PAGE_SIZE: some C libraries define PAGE_SIZE as a
  // macro, which would break a local enumerator of that name.
  enum {
    PAGE_BYTES     = 4096,
    VECS_PER_PAGE  = PAGE_BYTES / sizeof(__m128i),
    VECS_PER_CHECK = 8,       // vectors compared between two early-exit tests
    ALL_EQUAL      = 0xFFFF   // movemask value when all 16 bytes matched
  };

  const __m128i * buf1 = (const __m128i *) b1;
  const __m128i * buf2 = (const __m128i *) b2;

  for (int i = 0; i < VECS_PER_PAGE; i += VECS_PER_CHECK) {

    // AND together the comparison results of one group of vectors.
    // Any lane that differs leaves zero bits in the accumulated mask.
    __m128i mask = _mm_cmpeq_epi32 (_mm_load_si128 (&buf1[i]),
                                    _mm_load_si128 (&buf2[i]));
    for (int j = 1; j < VECS_PER_CHECK; j++) {
      const __m128i xmm1 = _mm_load_si128 (&buf1[i + j]);
      const __m128i xmm2 = _mm_load_si128 (&buf2[i + j]);
      mask = _mm_and_si128 (mask, _mm_cmpeq_epi32 (xmm1, xmm2));
    }

    // One bit per byte of the mask; anything but all ones is a difference.
    if (_mm_movemask_epi8 (mask) != ALL_EQUAL) {
      return true;
    }
  }

  // No differences found.
  return false;
}