Пример #1
0
/* -----------------------------------
 *          replace_luma_yuy2
 * -----------------------------------
 */
// Copies every even-offset byte of each `luma` row into the same position of
// the matching `src` row, leaving the odd-offset bytes of `src` untouched.
// `src` is modified in place. `pitch` / `luma_pitch` are the byte distances
// between consecutive rows; `width` is the number of bytes processed per row
// (presumably the YUY2 row size in bytes - confirm against the caller).
// The vector path uses aligned loads/stores, so both rows must be 16-byte
// aligned for the first `width / 16 * 16` bytes.
static void replace_luma_yuy2_sse2(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height)
{
  const int simd_width = (width / 16) * 16;
  const __m128i keep_even_bytes = _mm_set1_epi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  const __m128i keep_odd_bytes = _mm_set1_epi16(0xFF00);
#pragma warning(pop)

  BYTE *dst_row = src;
  const BYTE *luma_row = luma;

  for (int row = 0; row < height; ++row) {
    int col = 0;

    // 16 bytes at a time: odd bytes from src, even bytes from luma.
    for (; col < simd_width; col += 16) {
      const __m128i dst_px = _mm_load_si128(reinterpret_cast<const __m128i*>(dst_row + col));
      const __m128i luma_px = _mm_load_si128(reinterpret_cast<const __m128i*>(luma_row + col));

      const __m128i merged = _mm_or_si128(_mm_and_si128(dst_px, keep_odd_bytes),
                                          _mm_and_si128(luma_px, keep_even_bytes));

      _mm_store_si128(reinterpret_cast<__m128i*>(dst_row + col), merged);
    }

    // Scalar tail: only the even-offset bytes are replaced.
    for (; col < width; col += 2) {
      dst_row[col] = luma_row[col];
    }

    dst_row += pitch;
    luma_row += luma_pitch;
  }
}
    // Arithmetic right shift of sixteen signed 8-bit lanes by `count` bits.
    // SSE2 has no 8-bit shift, so the even and odd bytes of every 16-bit lane
    // are shifted separately with 16-bit arithmetic shifts and then recombined.
    // A `count` outside [0, 8] is treated as 8.
    SIMDValue SIMDInt8x16Operation::OpShiftRightByScalar(const SIMDValue& value, int8 count)
    {
        const _x86_SIMDValue X86_LOWBYTE_MASK  = { 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff };
        const _x86_SIMDValue X86_HIGHBYTE_MASK = { 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00 };

        const bool outOfRange = (count < 0) || (count > 8);
        if (outOfRange)
        {
            count = 8;
        }

        const X86SIMDValue source = X86SIMDValue::ToX86SIMDValue(value);

        // Low byte of each 16-bit lane: move it into the high byte so the
        // arithmetic shift sees its sign bit, then shift it back down.
        X86SIMDValue lowBytes;
        lowBytes.m128i_value = _mm_slli_epi16(source.m128i_value, 8);
        lowBytes.m128i_value = _mm_srai_epi16(lowBytes.m128i_value, count + 8);
        lowBytes.m128i_value = _mm_and_si128(lowBytes.m128i_value, X86_LOWBYTE_MASK.m128i_value);

        // High byte of each 16-bit lane: shift in place and drop the bits that
        // spilled into the low byte.
        X86SIMDValue highBytes;
        highBytes.m128i_value = _mm_srai_epi16(source.m128i_value, count);
        highBytes.m128i_value = _mm_and_si128(highBytes.m128i_value, X86_HIGHBYTE_MASK.m128i_value);

        X86SIMDValue combined;
        combined.m128i_value = _mm_or_si128(highBytes.m128i_value, lowBytes.m128i_value);

        return X86SIMDValue::ToSIMDValue(combined);
    }
Пример #3
0
// Per-byte linear interpolation between two buffers:
//     dest[i] = source2[i] + (source1[i] - source2[i]) * alpha   (8.8 fixed point, rounded)
// so alpha == 0 yields source2 and alpha == 1 yields source1.
//
// dest     - output buffer, 16-byte aligned
// source1  - first input, 16-byte aligned
// source2  - second input, 16-byte aligned
// alpha    - blend weight in [0, 1]
// size     - byte count; must be a multiple of 64 (four 16-byte vectors per iteration)
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
	static const size_t stride = sizeof(__m128i)*4;
	static const size_t PSD = 64;	// prefetch distance, in 16-byte vectors

	static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
	static const __m128i round = _mm_set1_epi16(128);

	assert(source1 != NULL && source2 != NULL && dest != NULL);
	assert(size % stride == 0);
	assert(alpha >= 0.0 && alpha <= 1.0);

	const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
	const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
	__m128i* dest128 = reinterpret_cast<__m128i*>(dest);

	// Blend weight as an integer in [0, 256]. It has to stay 16 bits wide:
	// alpha close to 1.0 rounds to 256, which does not fit in a byte (the
	// previous 8-bit conversion turned it into 0 and returned source2 instead
	// of source1). The 16-bit multiply below is modular, so 256 works as-is.
	const __m128i a = _mm_set1_epi16(static_cast<short>(alpha*256.0f+0.5f));

	for (size_t k = 0, length = size/stride; k < length; ++k)
	{
		_mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);
		_mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);
		// TODO: assembly optimization use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N) http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

		for(int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2)
		{
			// r = d + ((s-d)*alpha + 128)/256
			const __m128i s = _mm_load_si128(source128_1);	// AABBGGRR
			const __m128i d = _mm_load_si128(source128_2);	// AABBGGRR

			__m128i srb = _mm_and_si128(lomask, s);		// 00BB00RR		// unpack even bytes
			__m128i sga = _mm_srli_epi16(s, 8);			// 00AA00GG		// unpack odd bytes

			const __m128i drb = _mm_and_si128(lomask, d);	// 00BB00RR
			const __m128i dga = _mm_srli_epi16(d, 8);		// 00AA00GG

			srb = _mm_sub_epi16(srb, drb);		// BBBBRRRR		// sub
			srb = _mm_mullo_epi16(srb, a);		// BBBBRRRR		// mul (mod 2^16)
			srb = _mm_add_epi16(srb, round);

			sga = _mm_sub_epi16(sga, dga);		// AAAAGGGG		// sub
			sga = _mm_mullo_epi16(sga, a);		// AAAAGGGG		// mul (mod 2^16)
			sga = _mm_add_epi16(sga, round);

			srb = _mm_srli_epi16(srb, 8);		// 00BB00RR		// divide by 256
			sga = _mm_andnot_si128(lomask, sga);// AA00GG00		// divide by 256 and reposition in one step

			srb = _mm_or_si128(srb, sga);		// AABBGGRR		// pack

			// Byte-wise wrap-around add: the packed value is the delta modulo 256,
			// so adding d lands on the interpolated value.
			srb = _mm_add_epi8(srb, d);			// AABBGGRR

			_mm_store_si128(dest128, srb);
		}
	}
}
Пример #4
0
// Thin wrapper around _mm_or_si128 (bitwise OR of two 128-bit integer vectors).
// The comment lines inside the body are check directives describing the
// expected IR and assembly for this function; keep them byte-for-byte intact.
__m128i test_mm_or_si128(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_or_si128
  // DAG: or <2 x i64> %{{.*}}, %{{.*}}
  //
  // ASM-LABEL: test_mm_or_si128
  // ASM: orps
  return _mm_or_si128(A, B);
}
Пример #5
0
// Copies `size` bytes from a 16-byte aligned source to a destination whose
// address is `_SHIFT` bytes short of a 16-byte boundary. Every output vector is
// stitched together from two neighbouring source vectors with byte shifts, so
// the bulk of the copy uses aligned loads and aligned stores.
//
// _Dst  - destination; (_Dst + _SHIFT) must be 16-byte aligned
// _Src  - source; must be 16-byte aligned
// size  - number of bytes to copy
//
// NOTE(review): assumes _SHIFT is a compile-time constant in [0, 16], as the
// byte-shift intrinsics require - confirm at the definition of _SHIFT.
void dif_ssememcpy(void* _Dst, const void* _Src, size_t size)
{
	assert(IS_16BYTE_ALIGNMENT(_Src));

	// The vector path always moves a leading 16-byte block; anything smaller
	// is handled by a plain copy so nothing outside the buffers is touched.
	if (size < 16) {
		memcpy(_Dst, _Src, size);
		return;
	}

	char* const dst_bytes = static_cast<char*>(_Dst);
	const char* const src_bytes = static_cast<const char*>(_Src);
	const float* src = static_cast<const float*>(_Src);

	// Each iteration consumes 64 source bytes but also loads the 16 bytes that
	// follow them, and the shifted stores reach up to _SHIFT bytes further
	// than the bytes consumed. Reserving 16 bytes keeps every load and store
	// inside [0, size).
	const size_t loop_num = (size - 16) >> 6;

	// Leading block: unaligned store of the first 16 bytes.
	const __m128 head = _mm_load_ps(src);
	_mm_storeu_ps(reinterpret_cast<float*>(dst_bytes), head);

	// From here on the output pointer is 16-byte aligned. Pointer arithmetic
	// is done on char* so the address is never squeezed through an int.
	float* dst = reinterpret_cast<float*>(dst_bytes + _SHIFT);

	// Upper (16 - _SHIFT) bytes of the previous source vector, moved down to
	// the start of the next output vector.
	__m128i carry = _mm_srli_si128(_mm_castps_si128(head), _SHIFT);

	for (size_t i = 0; i < loop_num; i++) {
		const __m128i v1 = _mm_castps_si128(_mm_load_ps(src + 4));
		const __m128i v2 = _mm_castps_si128(_mm_load_ps(src + 8));
		const __m128i v3 = _mm_castps_si128(_mm_load_ps(src + 12));
		const __m128i v4 = _mm_castps_si128(_mm_load_ps(src + 16));

		// out = tail of previous vector | head of current vector
		const __m128i o0 = _mm_or_si128(_mm_slli_si128(v1, 16 - _SHIFT), carry);
		const __m128i o1 = _mm_or_si128(_mm_slli_si128(v2, 16 - _SHIFT), _mm_srli_si128(v1, _SHIFT));
		const __m128i o2 = _mm_or_si128(_mm_slli_si128(v3, 16 - _SHIFT), _mm_srli_si128(v2, _SHIFT));
		const __m128i o3 = _mm_or_si128(_mm_slli_si128(v4, 16 - _SHIFT), _mm_srli_si128(v3, _SHIFT));

		_mm_store_ps(dst + 0, _mm_castsi128_ps(o0));
		_mm_store_ps(dst + 4, _mm_castsi128_ps(o1));
		_mm_store_ps(dst + 8, _mm_castsi128_ps(o2));
		_mm_store_ps(dst + 12, _mm_castsi128_ps(o3));

		carry = _mm_srli_si128(v4, _SHIFT);

		dst += 16;
		src += 16;
	}

	// Everything past the 64-byte blocks (this re-copies the few bytes the
	// last shifted store already wrote, which is harmless).
	const size_t done = loop_num << 6;
	memcpy(dst_bytes + done, src_bytes + done, size - done);
}
// Adds the four 32-bit lanes of `a` and `b` and caps every lane whose sum is
// greater than 255 at 255. Sums of 255 or less (including negative ones) are
// returned unchanged.
static inline __m128i saturated_add_SSE2(const __m128i& a, const __m128i& b) {
    const __m128i cap = _mm_set1_epi32(255);
    const __m128i total = _mm_add_epi32(a, b);

    // All-ones in every lane that exceeded the cap.
    const __m128i exceeded = _mm_cmpgt_epi32(total, cap);

    // Per-lane select: cap where exceeded, otherwise the plain sum.
    const __m128i capped = _mm_and_si128(exceeded, cap);
    const __m128i passed = _mm_andnot_si128(exceeded, total);
    return _mm_or_si128(capped, passed);
}
// Shifts the whole 128-bit register towards its least significant end (the
// SSE "srli" direction) by `shift_num` bits, carrying bits across the 64-bit
// lane boundary.
//   shift_num == 8 : exactly one byte.
//   otherwise      : only (shift_num % 8) bits are applied.
// NOTE(review): despite the name, the register-level operation is a right
// shift; "left" presumably refers to the in-memory byte order - confirm.
__m128i shift_left_sse1(__m128i vec, int shift_num) {
	// The register moved down by one byte; used directly for the 8-bit case
	// and as the source of the cross-lane carry bits otherwise.
	const __m128i one_byte_down = _mm_srli_si128(vec, 1);
	if (shift_num == 8) {
		return one_byte_down;
	}

	const int bit_count = shift_num % 8;
	const __m128i carry_bits = _mm_slli_epi64(one_byte_down, 8 - bit_count);
	const __m128i lane_shifted = _mm_srli_epi64(vec, bit_count);
	return _mm_or_si128(lane_shifted, carry_bits);
}
Пример #8
0
 // In-place weighting of one 16-byte vector at `buffer`.
 // The even-offset and odd-offset bytes are each widened to 16 bits,
 // multiplied by the 16-bit weights at `alpha` and summed pairwise
 // (_mm_madd_epi16), then the two 32-bit results are merged: the even-byte
 // sums stay in place and the odd-byte sums are moved up by two bytes.
 // Both pointers must be 16-byte aligned.
 SIMD_INLINE void InterpolateX2(const __m128i * alpha, __m128i * buffer)
 {
     const __m128i weights = _mm_load_si128(alpha);
     const __m128i pixels = _mm_load_si128(buffer);

     const __m128i evenBytes = _mm_and_si128(pixels, K16_00FF);
     const __m128i oddBytes = _mm_and_si128(_mm_srli_si128(pixels, 1), K16_00FF);

     const __m128i evenSums = _mm_madd_epi16(evenBytes, weights);
     const __m128i oddSums = _mm_madd_epi16(oddBytes, weights);

     _mm_store_si128(buffer, _mm_or_si128(evenSums, _mm_slli_si128(oddSums, 2)));
 }
Пример #9
0
  // In-place bitwise OR with `rhs`; returns a copy of the updated board.
  // Uses a single 128-bit OR when SSE2/SSE4 is available, otherwise ORs the
  // two scalar halves.
  Bitboard operator |= (const Bitboard& rhs) {
#if defined (HAVE_SSE2) || defined (HAVE_SSE4)
    const __m128i merged = _mm_or_si128(this->m_, rhs.m_);
    _mm_store_si128(&this->m_, merged);
#else
    for (int half = 0; half < 2; ++half) {
      this->p_[half] |= rhs.p(half);
    }
#endif
    return *this;
  }
Пример #10
0
/* Soft demodulation of 16-QAM symbols into 16-bit LLRs, four values per symbol:
 *   llr[4*i+0] = -re,  llr[4*i+1] = -im,
 *   llr[4*i+2] = |re| - offset,  llr[4*i+3] = |im| - offset
 * where re/im are the symbol components scaled by SCALE_SHORT_CONV_QAM16 and
 * offset is 2*SCALE_SHORT_CONV_QAM16/sqrt(10).
 *
 * Symbols are processed four at a time with SSE; the remainder goes through
 * the scalar loop at the end. The vector path uses aligned loads and stores,
 * so `symbols` and `llr` must be 16-byte aligned.
 */
void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) {
  const float *in = (const float*) symbols;
  __m128i *out = (__m128i*) llr;

  const __m128i offset = _mm_set1_epi16(2*SCALE_SHORT_CONV_QAM16/sqrt(10));
  /* negative scale: the first two outputs per symbol are the negated components */
  const __m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV_QAM16);

  /* Byte shuffles that interleave (re, im) pairs with (|re|-off, |im|-off)
   * pairs; 0xff selects zero so the two shuffled halves can simply be OR-ed. */
  const __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0);
  const __m128i shuffle_abs_1 = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff);

  const __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8);
  const __m128i shuffle_abs_2 = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff);

  for (int blk = 0; blk < nsymbols/4; blk++) {
    /* four complex symbols = eight floats */
    const __m128 first_pair = _mm_load_ps(in);
    const __m128 second_pair = _mm_load_ps(in + 4);
    in += 8;

    const __m128i scaled_lo = _mm_cvtps_epi32(_mm_mul_ps(first_pair, scale_v));
    const __m128i scaled_hi = _mm_cvtps_epi32(_mm_mul_ps(second_pair, scale_v));
    const __m128i negated = _mm_packs_epi32(scaled_lo, scaled_hi);

    const __m128i magnitude = _mm_sub_epi16(_mm_abs_epi16(negated), offset);

    /* symbols 0 and 1 */
    const __m128i neg_a = _mm_shuffle_epi8(negated, shuffle_negated_1);
    const __m128i abs_a = _mm_shuffle_epi8(magnitude, shuffle_abs_1);

    /* symbols 2 and 3 */
    const __m128i neg_b = _mm_shuffle_epi8(negated, shuffle_negated_2);
    const __m128i abs_b = _mm_shuffle_epi8(magnitude, shuffle_abs_2);

    _mm_store_si128(out, _mm_or_si128(neg_a, abs_a));
    _mm_store_si128(out + 1, _mm_or_si128(neg_b, abs_b));
    out += 2;
  }

  /* Demodulate the symbols left over after the last full group of four */
  for (int sym = 4*(nsymbols/4); sym < nsymbols; sym++) {
    short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[sym]));
    short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[sym]));
    short *dst = &llr[4*sym];

    dst[0] = -yre;
    dst[1] = -yim;
    dst[2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
    dst[3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
  }
}
Пример #11
0
// Builds the binary descriptor from the sampled point intensities.
// For every pair in descriptionPairs the bit is set when
// pointsValue[pair.i] >= pointsValue[pair.j] (unsigned compare). Sixteen pairs
// are compared per step and eight steps fill one 128-bit word, which is
// written through *ptr; *ptr is advanced after each word.
// On return *ptr has been moved back by 8 words from where the last write left
// it. NOTE(review): that fixed rewind is tied to how the caller walks its
// output buffer - confirm before changing FREAK_NB_PAIRS.
void FREAK::extractDescriptor(uchar *pointsValue, void ** ptr) const
{
    __m128i** out = (__m128i**) ptr;

    // note that comparisons order is modified in each block (but first 128 comparisons remain globally the same-->does not affect the 128,384 bits segmented matching strategy)
    int pairIdx = 0;
    for( int block = FREAK_NB_PAIRS/128; block-- ; )
    {
        __m128i word = _mm_setzero_si128();
        for( int m = 128/16; m--; pairIdx += 16 )
        {
            // Gather the 16 value pairs. Pair (pairIdx + k) lands in byte
            // (15 - k), i.e. the first pair occupies the most significant byte.
            uchar lhs[16];
            uchar rhs[16];
            for( int k = 0; k < 16; ++k )
            {
                lhs[15 - k] = pointsValue[descriptionPairs[pairIdx + k].i];
                rhs[15 - k] = pointsValue[descriptionPairs[pairIdx + k].j];
            }
            const __m128i operand1 = _mm_loadu_si128((const __m128i*)lhs);
            const __m128i operand2 = _mm_loadu_si128((const __m128i*)rhs);

            // emulated "not less than" for 8-bit UNSIGNED integers:
            // min(a, b) == b  <=>  a >= b
            __m128i notLess = _mm_cmpeq_epi8(_mm_min_epu8(operand1, operand2), operand2);

            // keep one bit per byte (bit position depends on m) and merge it
            // into the 128-bit word
            notLess = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), notLess);
            word = _mm_or_si128(word, notLess);
        }
        (**out) = word;
        ++(*out);
    }
    (*out) -= 8;
}
    // Bitwise OR of two SIMD values (a | b).
    SIMDValue SIMDInt16x8Operation::OpOr(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        const X86SIMDValue lhs = X86SIMDValue::ToX86SIMDValue(aValue);
        const X86SIMDValue rhs = X86SIMDValue::ToX86SIMDValue(bValue);

        X86SIMDValue merged;
        merged.m128i_value = _mm_or_si128(lhs.m128i_value, rhs.m128i_value);

        return X86SIMDValue::ToSIMDValue(merged);
    }
Пример #13
0
// Packs three interpolated colour attributes into four 32-bit pixels.
// Attribute 2 goes to bits 0-7, attribute 1 to bits 8-15, attribute 0 to bits
// 16-23 and bits 24-31 are forced to 0xff. The packed integers are returned
// reinterpreted as __m128 (no numeric conversion).
INLINE __m128 shade(ColorInterp const&, const SWR_TRIANGLE_DESC &work, WideVector<ColorInterp::NUM_ATTRIBUTES, __m128> const& pAttrs, BYTE*, BYTE*, UINT*)
{
	// convert float to unorm
	const __m128i vBlue = vFloatToUnorm(get<2>(pAttrs));
	const __m128i vGreen = vFloatToUnorm(get<1>(pAttrs));
	const __m128i vRed = vFloatToUnorm(get<0>(pAttrs));
	const __m128i vOpaque = _mm_set1_epi32(0xff000000);

	// move each channel to its byte and merge
	const __m128i vGreenRed = _mm_or_si128(_mm_slli_epi32(vGreen, 8), _mm_slli_epi32(vRed, 16));
	const __m128i vColor = _mm_or_si128(vBlue, vGreenRed);
	const __m128i vPacked = _mm_or_si128(vColor, vOpaque);

	return _mm_castsi128_ps(vPacked);
}
Пример #14
0
// ORs together all SIMDBlockSize 32-bit integers starting at `begin`, four at
// a time, and hands the 128-bit accumulator to maxbitas32int for the result.
// Unaligned loads are used, so `begin` needs no particular alignment.
SIMDCOMP_PURE uint32_t maxbits(const uint32_t * begin) {
	    const __m128i* vectors = (const __m128i*)(begin);
	    __m128i ored = _mm_loadu_si128(vectors);
	    for (uint32_t idx = 1; 4*idx < SIMDBlockSize; ++idx) {
	        ored = _mm_or_si128(ored, _mm_loadu_si128(vectors + idx));
	    }
	    return maxbitas32int(ored);
}
/*
 * Derives the ol_flags of four received packets from four descriptors at once.
 * From each descriptor the low 4 bits of its first 16-bit word select an entry
 * of the rss_flags table and the VLAN-present bit of its fifth 16-bit word
 * selects vlan_flags; the two lookups are OR-ed and the resulting 16-bit
 * values are written to rx_pkts[0..3]->ol_flags.
 */
static inline void
desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
	struct rte_mbuf **rx_pkts)
{
	__m128i rss_01, rss_23, vlan_01, vlan_23;
	__m128i rss_bits, vlan_bits;
	union {
		uint16_t e[4];
		uint64_t dword;
	} flags;
	int pkt;

	/* mask everything except rss type */
	const __m128i rsstype_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x000F, 0x000F, 0x000F, 0x000F);

	/* map rss type to rss hash flag */
	const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
			0, 0, 0, PKT_RX_RSS_HASH,
			PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
			PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

	/* mask everything except vlan present bit */
	const __m128i vlan_msk = _mm_set_epi16(
			0x0000, 0x0000,
			0x0000, 0x0000,
			IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
			IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);
	/* map vlan present (0x8) to ol_flags */
	const __m128i vlan_map = _mm_set_epi8(
		0, 0, 0, 0,
		0, 0, 0, vlan_flags,
		0, 0, 0, 0,
		0, 0, 0, 0);

	/* interleave descriptor pairs: low halves carry the rss type word,
	 * high halves carry the status word */
	rss_01 = _mm_unpacklo_epi16(descs[0], descs[1]);
	rss_23 = _mm_unpacklo_epi16(descs[2], descs[3]);
	vlan_01 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vlan_23 = _mm_unpackhi_epi16(descs[2], descs[3]);

	/* gather the four rss type words into the low 64 bits and look them up */
	rss_bits = _mm_unpacklo_epi32(rss_01, rss_23);
	rss_bits = _mm_and_si128(rss_bits, rsstype_msk);
	rss_bits = _mm_shuffle_epi8(rss_flags, rss_bits);

	/* same for the vlan present bit */
	vlan_bits = _mm_unpacklo_epi32(vlan_01, vlan_23);
	vlan_bits = _mm_and_si128(vlan_bits, vlan_msk);
	vlan_bits = _mm_shuffle_epi8(vlan_map, vlan_bits);

	flags.dword = _mm_cvtsi128_si64(_mm_or_si128(rss_bits, vlan_bits));

	for (pkt = 0; pkt < 4; pkt++)
		rx_pkts[pkt]->ol_flags = flags.e[pkt];
}
Пример #16
0
// Adds the bit-vector X[shift .. shift + size_sse_ar) into a bit-sliced counter
// held in three planes: A (bit 0), B (bit 1) and C, which only accumulates
// (ORs in) the carries out of B.
// For every bit position: A ^= X, carry0 = A_old & X, B ^= carry0,
// C |= B_old & carry0.
inline void sum_offset( __m128i * X, __m128i * A, __m128i * B, __m128i * C, 
                        unsigned size_sse_ar, unsigned shift )
{
    const __m128i * addend = X + shift;
    for (unsigned idx = 0; idx != size_sse_ar; ++idx)
    {
        // carry out of plane A
        const __m128i carryA = _mm_and_si128(A[idx], addend[idx]);
        A[idx] = _mm_xor_si128(A[idx], addend[idx]);

        // carry out of plane B sticks in plane C
        const __m128i carryB = _mm_and_si128(B[idx], carryA);
        C[idx] = _mm_or_si128(C[idx], carryB);
        B[idx] = _mm_xor_si128(B[idx], carryA);
    }
}
Пример #17
0
      /*** simple union: OR every block of rhs into this filter */
      TM_INLINE
      void unionwith(const BitFilter<BITS>& rhs)
      {
#ifdef STM_USE_SSE
          // 128 bits at a time
          uint32_t blk = 0;
          while (blk < VEC_BLOCKS) {
              vec_filter[blk] = _mm_or_si128(vec_filter[blk], rhs.vec_filter[blk]);
              ++blk;
          }
#else
          // one machine word at a time
          uint32_t blk = 0;
          while (blk < WORD_BLOCKS) {
              word_filter[blk] |= rhs.word_filter[blk];
              ++blk;
          }
#endif
      }
    // Lane-wise signed 16-bit comparison a >= b. Each result lane is all ones
    // when the comparison holds and zero otherwise. SSE2 has no ">=" compare,
    // so the result is built as (a > b) | (a == b).
    SIMDValue SIMDInt16x8Operation::OpGreaterThanOrEqual(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        const X86SIMDValue lhs = X86SIMDValue::ToX86SIMDValue(aValue);
        const X86SIMDValue rhs = X86SIMDValue::ToX86SIMDValue(bValue);

        X86SIMDValue greater;
        greater.m128i_value = _mm_cmpgt_epi16(lhs.m128i_value, rhs.m128i_value);

        X86SIMDValue equal;
        equal.m128i_value = _mm_cmpeq_epi16(lhs.m128i_value, rhs.m128i_value);

        X86SIMDValue greaterOrEqual;
        greaterOrEqual.m128i_value = _mm_or_si128(greater.m128i_value, equal.m128i_value);

        return X86SIMDValue::ToSIMDValue(greaterOrEqual);
    }
Пример #19
0
 // Scans `size` rows of 16 mask bytes (rows are `stride` bytes apart) and marks
 // each of the 16 columns in which at least one row equals the corresponding
 // byte of `index`. The 16 per-column flags (0xFF / 0x00) are written to `cols`.
 // Returns the negated _mm_testz_si128 of the flags against K_INV_ZERO
 // (presumably an all-ones constant, making this "any column matched" - confirm).
 SIMD_INLINE bool ColsHasIndex(const uint8_t * mask, size_t stride, size_t size, __m128i index, uint8_t * cols)
 {
     __m128i matched = _mm_setzero_si128();
     const uint8_t * rowPtr = mask;
     for (size_t remaining = size; remaining > 0; --remaining, rowPtr += stride)
     {
         const __m128i rowValues = _mm_loadu_si128((__m128i*)rowPtr);
         matched = _mm_or_si128(matched, _mm_cmpeq_epi8(rowValues, index));
     }
     _mm_storeu_si128((__m128i*)cols, matched);
     return !_mm_testz_si128(matched, K_INV_ZERO);
 }
Пример #20
0
// One compare-exchange stage on the 16 signed bytes of `x`.
// g_shuffles[i][0] and [1] pick the two operands of every comparison, the
// per-byte minimum and maximum are computed, and g_shuffles[i][2] and [3]
// route the minima and maxima to their output positions before both halves
// are OR-ed together (positions a shuffle does not fill are expected to be
// zeroed by its control bytes).
static __m128i S(__m128i x, int i) {
  const __m128i *stage = g_shuffles[i];

  const __m128i left = _mm_shuffle_epi8(x, stage[0]);
  const __m128i right = _mm_shuffle_epi8(x, stage[1]);

  const __m128i smaller = _mm_min_epi8(left, right);
  const __m128i larger = _mm_max_epi8(left, right);

  const __m128i placed_min = _mm_shuffle_epi8(smaller, stage[2]);
  const __m128i placed_max = _mm_shuffle_epi8(larger, stage[3]);

  return _mm_or_si128(placed_min, placed_max);
}
Пример #21
0
__m64 _m_por(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_or_si128(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
Пример #22
0
// Converts `num_pixels` 32-bit BGRA pixels at `src` into 16-bit RGB565 at
// `dst`. Eight pixels are handled per iteration with SSE2: the bytes are
// de-interleaved into per-channel groups, reduced to 5/6/5 bits and
// re-interleaved (byte order within each 16-bit pixel depends on
// WEBP_SWAP_16BIT_CSP). Any remaining pixels are passed to the C fallback.
static void ConvertBGRAToRGB565_SSE2(const uint32_t* src,
                                     int num_pixels, uint8_t* dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  for (; num_pixels >= 8; num_pixels -= 8) {
    const __m128i px0123 = _mm_loadu_si128(in++);   // bgra0|bgra1|bgra2|bgra3
    const __m128i px4567 = _mm_loadu_si128(in++);   // bgra4|bgra5|bgra6|bgra7
    // three rounds of byte interleaving gather each channel together
    const __m128i mix0_lo = _mm_unpacklo_epi8(px0123, px4567);  // b0b4g0g4r0r4a0a4...
    const __m128i mix0_hi = _mm_unpackhi_epi8(px0123, px4567);  // b2b6g2g6r2r6a2a6...
    const __m128i mix1_lo = _mm_unpacklo_epi8(mix0_lo, mix0_hi);  // b0b2b4b6g0g2g4g6...
    const __m128i mix1_hi = _mm_unpackhi_epi8(mix0_lo, mix0_hi);  // b1b3b5b7g1g3g5g7...
    const __m128i blue_green = _mm_unpacklo_epi8(mix1_lo, mix1_hi);  // b0...b7 | g0...g7
    const __m128i red_alpha = _mm_unpackhi_epi8(mix1_lo, mix1_hi);   // r0...r7 | a0...a7
    const __m128i green_alpha = _mm_unpackhi_epi64(blue_green, red_alpha);  // g0...g7 | a0...a7
    const __m128i red_blue = _mm_unpacklo_epi64(red_alpha, blue_green);     // r0...r7 | b0...b7
    // top 5 bits of red and blue
    const __m128i red_blue5 = _mm_and_si128(red_blue, mask_0xf8);  // -r0..-r7|-b0..-b7
    // green is split: upper 3 bits join red, next 3 bits join blue
    const __m128i green_top = _mm_and_si128(_mm_srli_epi16(green_alpha, 5), mask_0x07);  // g0-...g7-|xx (3b)
    const __m128i green_mid = _mm_and_si128(_mm_slli_epi16(green_alpha, 3), mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i blue5 = _mm_srli_si128(red_blue5, 8);            // -b0...-b7|0
    const __m128i red_green = _mm_or_si128(red_blue5, green_top);  // gr0...gr7|xx
    const __m128i green_blue = _mm_or_si128(_mm_srli_epi16(blue5, 3), green_mid);  // bg0...bg7|xx
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i packed = _mm_unpacklo_epi8(green_blue, red_green);  // rggb0...rggb7
#else
    const __m128i packed = _mm_unpacklo_epi8(red_green, green_blue);  // bgrb0...bgrb7
#endif
    _mm_storeu_si128(out++, packed);
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
Пример #23
0
// Chooses between pixels `a` and `b` by comparing their distance to `c`.
// For each of the four bytes the absolute differences |a - c| and |b - c| are
// computed; `a` is returned when sum(|b - c|) - sum(|a - c|) <= 0, otherwise
// `b` is returned.
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i va = _mm_cvtsi32_si128(a);
  const __m128i vb = _mm_cvtsi32_si128(b);
  const __m128i vc = _mm_cvtsi32_si128(c);
  // |x - y| for unsigned bytes = sat(x - y) | sat(y - x): one side is always 0
  const __m128i abs_ac = _mm_or_si128(_mm_subs_epu8(va, vc), _mm_subs_epu8(vc, va));
  const __m128i abs_bc = _mm_or_si128(_mm_subs_epu8(vb, vc), _mm_subs_epu8(vc, vb));
  // widen to 16 bits so the subtraction can go negative
  const __m128i wide_ac = _mm_unpacklo_epi8(abs_ac, zero);  // |a - c|
  const __m128i wide_bc = _mm_unpacklo_epi8(abs_bc, zero);  // |b - c|
  const __m128i delta = _mm_sub_epi16(wide_bc, wide_ac);
  int16_t lanes[8];
  int total = 0;
  int k;
  _mm_storeu_si128((__m128i*)lanes, delta);
  // only the low four lanes hold channel data
  for (k = 0; k < 4; ++k) total += lanes[k];
  return (total <= 0) ? a : b;
}
Пример #24
0
// Color-burn blend of one colour channel for four pixels (one 32-bit lane per
// pixel). sc/dc are the source/destination channel values, sa/da the
// source/destination alphas. Three candidate results are computed for every
// lane and the right one is selected with compare masks:
//   dc == da          : clamp_div255round(sa*da + sc*(255-da) + dc*(255-sa))
//   else if sc == 0   : SkAlphaMulAlpha(dc, 255-sa)
//   otherwise         : clamp_div255round(sa*(da - min(da, (da-dc)*sa/sc))
//                                         + sc*(255-da) + dc*(255-sa))
static inline __m128i colorburn_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    const __m128i inv_da = _mm_sub_epi32(_mm_set1_epi32(255), da);
    const __m128i inv_sa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // Terms shared by the first and the last case.
    const __m128i sa_da = _mm_mullo_epi16(sa, da);
    const __m128i sc_inv_da = _mm_mullo_epi16(sc, inv_da);
    const __m128i dc_inv_sa = _mm_mullo_epi16(dc, inv_sa);

    // case 1: dc == da
    const __m128i is_dc_eq_da = _mm_cmpeq_epi32(dc, da);
    __m128i case1 = _mm_add_epi32(sa_da, sc_inv_da);
    case1 = _mm_add_epi32(case1, dc_inv_sa);
    case1 = clamp_div255round_SSE2(case1);
    case1 = _mm_and_si128(is_dc_eq_da, case1);

    // case 2: 0 == sc (and not case 1)
    const __m128i is_sc_zero = _mm_cmpeq_epi32(sc, _mm_setzero_si128());
    __m128i case2 = SkAlphaMulAlpha_SSE2(dc, inv_sa);
    const __m128i only_case2 = _mm_andnot_si128(is_dc_eq_da, is_sc_zero);
    case2 = _mm_and_si128(only_case2, case2);

    // case 3: everything else
    const __m128i handled = _mm_or_si128(is_dc_eq_da, is_sc_zero);
    __m128i quotient = _mm_sub_epi32(da, dc);
    quotient = Multiply32_SSE2(quotient, sa);
    quotient = shim_mm_div_epi32(quotient, sc);

    __m128i burned = _mm_sub_epi32(da, SkMin32_SSE2(da, quotient));
    burned = Multiply32_SSE2(sa, burned);
    __m128i case3 = _mm_add_epi32(burned, sc_inv_da);
    case3 = _mm_add_epi32(case3, dc_inv_sa);
    case3 = clamp_div255round_SSE2(case3);
    case3 = _mm_andnot_si128(handled, case3);

    // The masks are mutually exclusive, so OR-ing merges the selections.
    return _mm_or_si128(_mm_or_si128(case1, case2), case3);
}
Пример #25
0
/*
 * Bit width needed for the deltas of |length| integers: the first delta is
 * in[0] - initvalue, every following one is in[k] - in[k-1]. All deltas are
 * OR-ed together and the result is passed to bits().
 * |length| must be greater than zero.
 */
uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in,
                uint32_t length) {
    const __m128i *blocks = (const __m128i *)(in);
    __m128i current;
    __m128i previous;
    __m128i ored;
    uint32_t lanes[4];
    uint32_t blk = 1;
    uint32_t idx;
    uint32_t acc;

    assert(length > 0);

    /* First vector. With fewer than four inputs the last value is repeated,
     * so the padding lanes contribute a delta of zero. */
    if (length == 1)
        current = _mm_set1_epi32(in[0]);
    else if (length == 2)
        current = _mm_setr_epi32(in[0], in[1], in[1], in[1]);
    else if (length == 3)
        current = _mm_setr_epi32(in[0], in[1], in[2], in[2]);
    else
        current = _mm_loadu_si128(blocks);

    ored = Delta(current, _mm_set1_epi32(initvalue));
    previous = current;

    /* all remaining complete groups of four integers */
    for (; blk * 4 + 4 <= length; blk++) {
        current = _mm_loadu_si128(blocks + blk);
        ored = _mm_or_si128(ored, Delta(current, previous));
        previous = current;
    }

    /* fold the four lanes into one integer */
    _mm_storeu_si128((__m128i *)(lanes), ored);
    acc = lanes[0] | lanes[1] | lanes[2] | lanes[3];

    /* scalar tail: integers that did not fill a whole vector */
    for (idx = blk * 4; idx < length; idx++)
        acc |= in[idx] - (idx == 0 ? initvalue : in[idx - 1]);

    /* return the number of bits */
    return bits(acc);
}
Пример #26
0
// Copies a width x height x depth block of 32-bit pixels from `input` to
// `output`, exchanging byte 0 and byte 2 of every pixel (RGBA <-> BGRA) and
// leaving bytes 1 and 3 untouched. Row and slice positions are resolved with
// OffsetDataPointer from the given pitches.
// Each row is processed in three phases: scalar pixels until the destination
// is 16-byte aligned, four pixels per step with SSE2, then scalar leftovers.
void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
                           const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                           uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch)
{
#if defined(_M_ARM)
    // Ensure that this function is reported as not implemented for ARM builds because
    // the instructions below are not present for that architecture.
    UNIMPLEMENTED();
    return;
#else
    const __m128i brMask = _mm_set1_epi32(0x00ff00ff);

    for (size_t slice = 0; slice < depth; slice++)
    {
        for (size_t row = 0; row < height; row++)
        {
            const uint32_t *srcRow = OffsetDataPointer<uint32_t>(input, row, slice, inputRowPitch, inputDepthPitch);
            uint32_t *dstRow = OffsetDataPointer<uint32_t>(output, row, slice, outputRowPitch, outputDepthPitch);

            size_t col = 0;

            // Phase 1: scalar until the output address is 16-byte aligned
            while (((reinterpret_cast<intptr_t>(&dstRow[col]) & 15) != 0) && col < width)
            {
                const uint32_t pixel = srcRow[col];
                dstRow[col] = (_rotl(pixel, 16) & 0x00ff00ff) | (pixel & 0xff00ff00);
                col++;
            }

            // Phase 2: four pixels at a time (unaligned load, aligned store)
            while (col + 3 < width)
            {
                const __m128i pixels = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&srcRow[col]));
                // g and a stay where they are
                const __m128i greenAlpha = _mm_andnot_si128(brMask, pixels);
                // b and r sit in the low byte of each 16-bit word ...
                const __m128i blueRed = _mm_and_si128(pixels, brMask);
                // ... so swapping adjacent 16-bit words swaps them
                const __m128i lowSwapped = _mm_shufflelo_epi16(blueRed, _MM_SHUFFLE(2, 3, 0, 1));
                const __m128i allSwapped = _mm_shufflehi_epi16(lowSwapped, _MM_SHUFFLE(2, 3, 0, 1));
                _mm_store_si128(reinterpret_cast<__m128i*>(&dstRow[col]), _mm_or_si128(greenAlpha, allSwapped));
                col += 4;
            }

            // Phase 3: leftover pixels
            while (col < width)
            {
                const uint32_t pixel = srcRow[col];
                dstRow[col] = (_rotl(pixel, 16) & 0x00ff00ff) | (pixel & 0xff00ff00);
                col++;
            }
        }
    }
#endif
}
Пример #27
0
// Color-dodge blend of one colour channel for four pixels (one 32-bit lane per
// pixel). sc/dc are the source/destination channel values, sa/da the
// source/destination alphas. Three candidate results are computed for every
// lane and the right one is selected with compare masks:
//   dc == 0            : SkAlphaMulAlpha(sc, 255-da)
//   else if sa == sc   : clamp_div255round(sa*da + sc*(255-da) + dc*(255-sa))
//   otherwise          : clamp_div255round(sa*min(da, dc*sa/(sa-sc))
//                                          + sc*(255-da) + dc*(255-sa))
static inline __m128i colordodge_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                           const __m128i& sa, const __m128i& da) {
    const __m128i sa_minus_sc = _mm_sub_epi32(sa, sc);
    const __m128i inv_da = _mm_sub_epi32(_mm_set1_epi32(255), da);
    const __m128i inv_sa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // case 1: 0 == dc
    const __m128i is_dc_zero = _mm_cmpeq_epi32(dc, _mm_setzero_si128());
    const __m128i case1 = _mm_and_si128(is_dc_zero, SkAlphaMulAlpha_SSE2(sc, inv_da));

    // case 2: 0 == sa - sc (and not case 1)
    const __m128i is_diff_zero = _mm_cmpeq_epi32(sa_minus_sc, _mm_setzero_si128());
    const __m128i only_case2 = _mm_andnot_si128(is_dc_zero, is_diff_zero);
    const __m128i sa_da = _mm_mullo_epi16(sa, da);
    const __m128i sc_inv_da = _mm_mullo_epi16(sc, inv_da);
    const __m128i dc_inv_sa = _mm_mullo_epi16(dc, inv_sa);
    __m128i case2 = _mm_add_epi32(sa_da, sc_inv_da);
    case2 = _mm_add_epi32(case2, dc_inv_sa);
    case2 = clamp_div255round_SSE2(case2);
    case2 = _mm_and_si128(only_case2, case2);

    // case 3: everything else
    const __m128i handled = _mm_or_si128(is_dc_zero, is_diff_zero);
    const __m128i dc_sa = _mm_mullo_epi16(dc, sa);
    const __m128i quotient = shim_mm_div_epi32(dc_sa, sa_minus_sc);

    __m128i dodged = SkMin32_SSE2(da, quotient);
    dodged = Multiply32_SSE2(sa, dodged);
    __m128i case3 = _mm_add_epi32(dodged, sc_inv_da);
    case3 = _mm_add_epi32(case3, dc_inv_sa);
    case3 = clamp_div255round_SSE2(case3);
    case3 = _mm_andnot_si128(handled, case3);

    // The masks are mutually exclusive, so OR-ing merges the selections.
    return _mm_or_si128(_mm_or_si128(case1, case2), case3);
}
Пример #28
0
// Applies the inverse colour transform to `num_pixels` ARGB pixels in place.
// Alpha and green are kept as they are. Red gets ColorTransformDelta(
// green_to_red, green) added; blue gets ColorTransformDelta(green_to_blue,
// green) plus ColorTransformDelta(red_to_blue, new red) added. Both are reduced
// to 8 bits. Four pixels are processed per iteration; whatever is left is
// handled by the C implementation.
static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
                                              uint32_t* argb_data,
                                              int num_pixels) {
  // multipliers
  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);
  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
  // channel masks
  const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00);
  const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
  const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
  const __m128i lower_8bit_mask  = _mm_set1_epi32(0x000000ff);

  int i = 0;

  while (i + 4 <= num_pixels) {
    __m128i* const pixels = (__m128i*)&argb_data[i];
    const __m128i argb = _mm_loadu_si128(pixels);
    const __m128i alpha_green = _mm_and_si128(argb, alpha_green_mask);
    const __m128i red = _mm_srli_epi32(_mm_and_si128(argb, red_mask), 16);
    const __m128i green = _mm_srli_epi32(_mm_and_si128(argb, green_mask), 8);
    // blue is taken from the full pixel; the final mask drops the upper bytes
    const __m128i blue = argb;

    // red
    const __m128i red_delta = ColorTransformDelta(g_to_r, green);
    const __m128i new_red =
        _mm_and_si128(_mm_add_epi32(red, red_delta), lower_8bit_mask);

    // blue (depends on the already corrected red)
    const __m128i blue_from_green = ColorTransformDelta(g_to_b, green);
    const __m128i blue_from_red = ColorTransformDelta(r_to_b, new_red);
    const __m128i blue_delta = _mm_add_epi32(blue_from_green, blue_from_red);
    const __m128i new_blue =
        _mm_and_si128(_mm_add_epi32(blue, blue_delta), lower_8bit_mask);

    const __m128i alpha_green_red =
        _mm_or_si128(alpha_green, _mm_slli_epi32(new_red, 16));
    _mm_storeu_si128(pixels, _mm_or_si128(alpha_green_red, new_blue));
    i += 4;
  }

  // Fall-back to C-version for left-overs.
  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
}
Пример #29
0
/*
=====================
R_CopyDecalSurface
=====================
*/
// Appends the vertices and indexes of `decal` to `verts` / `indexes`, starting
// at verts[numVerts] and indexes[numIndexes].
// Vertices: the first 32 bytes of every source vertex are copied, with the
// faded colour (vertDepthFade[i] * fadeColor, converted to saturated bytes)
// OR-ed into the third 32-bit word of the second 16 bytes - the colour field
// per the assert_offsetof below. NOTE(review): because the colour is OR-ed in,
// this presumably relies on the stored decal vertices having a zero colour -
// confirm against where decal->verts is filled.
// Indexes: copied eight at a time with numVerts added to each.
// All stores are non-temporal and are fenced before returning.
static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes,
									const decal_t * decal, const float fadeColor[4] ) {
	assert_16_byte_aligned( &verts[numVerts] );
	assert_16_byte_aligned( &indexes[numIndexes] );
	assert_16_byte_aligned( decal->indexes );
	assert_16_byte_aligned( decal->verts );
	assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
	assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
	assert_16_byte_aligned( fadeColor );

	// numVerts broadcast to eight 16-bit lanes (used to rebase the indexes)
	const __m128i numVertsInt = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
	const __m128i indexBias = _mm_packs_epi32( numVertsInt, numVertsInt );
	const __m128 fade = _mm_load_ps( fadeColor );
	// selects the 32-bit word at byte offset 8 of the second 16-byte half
	const __m128i colorWordMask = _mm_set_epi32( 0, -1, 0, 0 );

	// copy vertices and apply depth/time based fading
	assert_offsetof( idDrawVert, color, 6 * 4 );
	for ( int v = 0; v < decal->numVerts; v++ ) {
		const idDrawVert &from = decal->verts[v];
		idDrawVert &to = verts[numVerts + v];

		const __m128i firstHalf = _mm_load_si128( (const __m128i *)( (byte *)&from +  0 ) );
		const __m128i secondHalf = _mm_load_si128( (const __m128i *)( (byte *)&from + 16 ) );
		const __m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + v ), 0 );

		// float colour -> 32-bit -> 16-bit -> 8-bit, saturating on the way down
		const __m128i color32 = _mm_cvtps_epi32( _mm_mul_ps( depthFade, fade ) );
		const __m128i color16 = _mm_packs_epi32( color32, color32 );
		const __m128i color8 = _mm_packus_epi16( color16, color16 );
		const __m128i colored = _mm_or_si128( secondHalf, _mm_and_si128( color8, colorWordMask ) );

		_mm_stream_si128( (__m128i *)( (byte *)&to +  0 ), firstHalf );
		_mm_stream_si128( (__m128i *)( (byte *)&to + 16 ), colored );
	}

	// copy indexes
	assert( ( decal->numIndexes & 7 ) == 0 );
	assert( sizeof( triIndex_t ) == 2 );
	for ( int first = 0; first < decal->numIndexes; first += 8 ) {
		const __m128i srcIndexes = _mm_load_si128( (const __m128i *)&decal->indexes[first] );
		const __m128i rebased = _mm_add_epi16( srcIndexes, indexBias );
		_mm_stream_si128( (__m128i *)&indexes[numIndexes + first], rebased );
	}

	// make the non-temporal stores globally visible
	_mm_sfence();

}
Пример #30
0
// Darken blend of one colour channel for four pixels (one 32-bit lane per
// pixel). sc/dc are the source/destination channel values, sa/da the
// source/destination alphas. Per lane:
//   sc*da <  dc*sa : sc + dc - SkDiv255Round(dc*sa)
//   otherwise      : sc + dc - SkDiv255Round(sc*da)
static inline __m128i darken_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                       const __m128i& sa, const __m128i& da) {
    const __m128i src_times_da = _mm_mullo_epi16(sc, da);
    const __m128i dst_times_sa = _mm_mullo_epi16(dc, sa);

    // all ones where the source term is the smaller one
    const __m128i src_is_less = _mm_cmplt_epi32(src_times_da, dst_times_sa);

    const __m128i sum = _mm_add_epi32(sc, dc);
    const __m128i when_less = _mm_sub_epi32(sum, SkDiv255Round_SSE2(dst_times_sa));
    const __m128i when_not_less = _mm_sub_epi32(sum, SkDiv255Round_SSE2(src_times_da));

    // per-lane select
    const __m128i picked_less = _mm_and_si128(src_is_less, when_less);
    const __m128i picked_other = _mm_andnot_si128(src_is_less, when_not_less);
    return _mm_or_si128(picked_less, picked_other);
}