/* -----------------------------------
 * replace_luma_yuy2
 * -----------------------------------
 */
static void replace_luma_yuy2_sse2(BYTE *src, const BYTE *luma, int pitch, int luma_pitch, int width, int height) {
  int mod16_width = width / 16 * 16;
  __m128i luma_mask = _mm_set1_epi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m128i chroma_mask = _mm_set1_epi16(0xFF00);
#pragma warning(pop)

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < mod16_width; x += 16) {
      __m128i s = _mm_load_si128(reinterpret_cast<const __m128i*>(src + x));
      __m128i l = _mm_load_si128(reinterpret_cast<const __m128i*>(luma + x));

      __m128i s_chroma = _mm_and_si128(s, chroma_mask);
      __m128i l_luma = _mm_and_si128(l, luma_mask);

      __m128i result = _mm_or_si128(s_chroma, l_luma);

      _mm_store_si128(reinterpret_cast<__m128i*>(src + x), result);
    }

    for (int x = mod16_width; x < width; x += 2) {
      src[x] = luma[x];
    }

    src += pitch;
    luma += luma_pitch;
  }
}
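/* For reference, a scalar sketch of the same YUY2 luma replacement (a
 * hypothetical helper, not part of the original source): in YUY2, even bytes
 * hold luma (Y) and odd bytes hold chroma (U/V), so only even offsets are
 * overwritten -- exactly what the tail loop above does for the full row. */
static void replace_luma_yuy2_c(BYTE *src, const BYTE *luma, int pitch, int luma_pitch, int width, int height) {
  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x += 2) {
      src[x] = luma[x];               // Y byte replaced, chroma byte untouched
    }
    src += pitch;
    luma += luma_pitch;
  }
}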
SIMDValue SIMDInt8x16Operation::OpShiftRightByScalar(const SIMDValue& value, int8 count) {
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(value);
    X86SIMDValue x86tmp1;

    const _x86_SIMDValue X86_LOWBYTE_MASK  = { 0x00ff00ff, 0x00ff00ff, 0x00ff00ff, 0x00ff00ff };
    const _x86_SIMDValue X86_HIGHBYTE_MASK = { 0xff00ff00, 0xff00ff00, 0xff00ff00, 0xff00ff00 };

    if (count < 0 || count > 8) {
        count = 8;
    }

    // SSE2 has no 8-bit arithmetic shift, so shift the low and high bytes of
    // each 16-bit lane separately and recombine.
    x86tmp1.m128i_value = _mm_slli_epi16(tmpaValue.m128i_value, 8);
    x86tmp1.m128i_value = _mm_srai_epi16(x86tmp1.m128i_value, count + 8);
    x86tmp1.m128i_value = _mm_and_si128(x86tmp1.m128i_value, X86_LOWBYTE_MASK.m128i_value);

    tmpaValue.m128i_value = _mm_srai_epi16(tmpaValue.m128i_value, count);
    tmpaValue.m128i_value = _mm_and_si128(tmpaValue.m128i_value, X86_HIGHBYTE_MASK.m128i_value);

    x86Result.m128i_value = _mm_or_si128(tmpaValue.m128i_value, x86tmp1.m128i_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
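/* A scalar sketch of the per-lane semantics above (hypothetical helper, for
 * illustration only), using the same clamp-to-8 convention: */
static int8_t shift_right_by_scalar_int8(int8_t v, int count) {
    if (count < 0 || count > 8) count = 8;
    return (int8_t)(v >> count);  /* arithmetic shift; count == 8 leaves only the sign fill */
}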
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size) {
    static const size_t stride = sizeof(__m128i) * 4;
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
    static const __m128i round = _mm_set1_epi16(128);

    assert(source1 != NULL && source2 != NULL && dest != NULL);
    assert(size % stride == 0);
    assert(alpha >= 0.0 && alpha <= 1.0);

    const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
    const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i s = _mm_setzero_si128();
    __m128i d = _mm_setzero_si128();
    // note: alpha == 1.0f makes the u8 cast overflow (256 does not fit in a
    // byte), so callers needing an exact copy at full opacity must special-case it
    const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha * 256.0f + 0.5f));

    __m128i drb, dga, srb, sga;

    for (size_t k = 0, length = size / stride; k < length; ++k) {
        _mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);

        // TODO: assembly optimization: use PSHUFD on moves before calculations,
        // lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
        for (int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2) {
            // r = d + (s-d)*alpha/256
            s = _mm_load_si128(source128_1);     // AABBGGRR
            d = _mm_load_si128(source128_2);     // AABBGGRR

            srb = _mm_and_si128(lomask, s);      // 00BB00RR   // unpack
            sga = _mm_srli_epi16(s, 8);          // AA00GG00   // unpack
            drb = _mm_and_si128(lomask, d);      // 00BB00RR   // unpack
            dga = _mm_srli_epi16(d, 8);          // AA00GG00   // unpack

            srb = _mm_sub_epi16(srb, drb);       // BBBBRRRR   // sub
            srb = _mm_mullo_epi16(srb, a);       // BBBBRRRR   // mul
            srb = _mm_add_epi16(srb, round);

            sga = _mm_sub_epi16(sga, dga);       // AAAAGGGG   // sub
            sga = _mm_mullo_epi16(sga, a);       // AAAAGGGG   // mul
            sga = _mm_add_epi16(sga, round);

            srb = _mm_srli_epi16(srb, 8);        // 00BB00RR   // prepack and div
            sga = _mm_andnot_si128(lomask, sga); // AA00GG00   // prepack and div

            srb = _mm_or_si128(srb, sga);        // AABBGGRR   // pack
            srb = _mm_add_epi8(srb, d);          // AABBGGRR   // add, there is no overflow (R.N)

            _mm_store_si128(dest128, srb);
        }
    }
}
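/* An exact scalar reference for the per-channel blend above (hypothetical
 * helper, assuming the same fixed-point alpha a in [0, 255]); the SSE2 path
 * reaches the same result through 16-bit modular arithmetic and byte-wise
 * wrap-around adds: */
static inline u8 lerp_channel(u8 d, u8 s, int a) {
    // r = d + (s - d) * a / 256, with +128 for rounding
    return static_cast<u8>(d + (((s - d) * a + 128) >> 8));
}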
__m128i test_mm_or_si128(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_or_si128
  // DAG: or <2 x i64> %{{.*}}, %{{.*}}
  //
  // ASM-LABEL: test_mm_or_si128
  // ASM: orps
  return _mm_or_si128(A, B);
}
void dif_ssememcpy(void* _Dst, const void* _Src, size_t size) {
    assert(IS_16BYTE_ALIGNMENT(_Src));

    float* dst = (float*)_Dst;
    const float* src = (const float*)_Src;
    __m128 xmm0, xmm1, xmm2, xmm3, xmm4;
    int loop_num = size >> 6;

    xmm0 = _mm_load_ps(src + 0);
    _mm_storeu_ps(dst + 0, xmm0);
    // advance dst by _SHIFT bytes; use uintptr_t, not int, so the round-trip
    // cast stays pointer-sized on 64-bit targets
    dst = (float*)((uintptr_t)dst + _SHIFT);
    __m128i xmm0i = _mm_srli_si128(_mm_castps_si128(xmm0), _SHIFT);               // xmm0 >> _SHIFT

    for (int i = 0; i < loop_num; i++) {
        xmm1 = _mm_load_ps(src + 4);
        xmm3 = _mm_load_ps(src + 8);
        xmm2 = xmm1;
        xmm4 = xmm3;

        __m128i xmm1i = _mm_slli_si128(_mm_castps_si128(xmm1), 16 - _SHIFT);      // xmm1 << (16 - _SHIFT)
        __m128i xmm2i = _mm_srli_si128(_mm_castps_si128(xmm2), _SHIFT);           // xmm2 >> _SHIFT
        __m128i xmm3i = _mm_slli_si128(_mm_castps_si128(xmm3), 16 - _SHIFT);      // xmm3 << (16 - _SHIFT)
        __m128i xmm4i = _mm_srli_si128(_mm_castps_si128(xmm4), _SHIFT);           // xmm4 >> _SHIFT

        xmm1i = _mm_or_si128(xmm1i, xmm0i);
        xmm3i = _mm_or_si128(xmm3i, xmm2i);

        _mm_store_ps(dst + 0, _mm_castsi128_ps(xmm1i));
        _mm_store_ps(dst + 4, _mm_castsi128_ps(xmm3i));

        xmm1 = _mm_load_ps(src + 12);
        xmm3 = _mm_load_ps(src + 16);
        xmm2 = xmm1;
        xmm0 = xmm3;

        xmm1i = _mm_slli_si128(_mm_castps_si128(xmm1), 16 - _SHIFT);              // xmm1 << (16 - _SHIFT)
        xmm2i = _mm_srli_si128(_mm_castps_si128(xmm2), _SHIFT);                   // xmm2 >> _SHIFT
        xmm3i = _mm_slli_si128(_mm_castps_si128(xmm3), 16 - _SHIFT);              // xmm3 << (16 - _SHIFT)
        xmm0i = _mm_srli_si128(_mm_castps_si128(xmm0), _SHIFT);                   // xmm0 >> _SHIFT

        xmm1i = _mm_or_si128(xmm1i, xmm4i);
        xmm3i = _mm_or_si128(xmm3i, xmm2i);

        _mm_store_ps(dst + 8, _mm_castsi128_ps(xmm1i));
        _mm_store_ps(dst + 12, _mm_castsi128_ps(xmm3i));

        dst += 16;
        src += 16;
    }

    memcpy((void*)((uintptr_t)dst - _SHIFT), src, size & 0x3F);
}
static inline __m128i saturated_add_SSE2(const __m128i& a, const __m128i& b) {
    __m128i sum = _mm_add_epi32(a, b);
    __m128i cmp = _mm_cmpgt_epi32(sum, _mm_set1_epi32(255));
    sum = _mm_or_si128(_mm_and_si128(cmp, _mm_set1_epi32(255)),
                       _mm_andnot_si128(cmp, sum));
    return sum;
}
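/* The and/andnot/or combination above is the standard SSE2 select idiom
 * (SSE2 has no blendv instruction). A generic sketch of it, assuming mask
 * lanes are all-ones or all-zeros as produced by the compare intrinsics: */
static inline __m128i select_si128(__m128i mask, __m128i a, __m128i b) {
    // (mask & a) | (~mask & b): a where the mask is set, b elsewhere
    return _mm_or_si128(_mm_and_si128(mask, a), _mm_andnot_si128(mask, b));
}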
__m128i shift_left_sse1(__m128i vec, int shift_num) {
    // Note: bytes move toward the low end of the register, i.e. a logical
    // 128-bit right shift by shift_num bits in little-endian terms; the name
    // suggests the caller treats bit 0 as leftmost. Expects 1 <= shift_num <= 8
    // (shift_num == 0 would corrupt the result via the carryover OR).
    if (shift_num == 8)
        return _mm_srli_si128(vec, 1);

    __m128i carryover = _mm_srli_si128(vec, 1);
    carryover = _mm_slli_epi64(carryover, 8 - (shift_num % 8));
    vec = _mm_srli_epi64(vec, shift_num % 8);
    return _mm_or_si128(vec, carryover);
}
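/* A quick self-check sketch (assumptions: little-endian target, GCC/Clang
 * unsigned __int128, <string.h> available, 1 <= n <= 8): under those
 * conditions the routine above should match a plain 128-bit right shift. */
static int shift_matches_scalar(__m128i v, int n) {
    unsigned __int128 x, expect, got;
    memcpy(&x, &v, sizeof x);
    expect = x >> n;
    __m128i r = shift_left_sse1(v, n);
    memcpy(&got, &r, sizeof got);
    return expect == got;
}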
SIMD_INLINE void InterpolateX2(const __m128i * alpha, __m128i * buffer) {
    __m128i src = _mm_load_si128(buffer);
    __m128i a = _mm_load_si128(alpha);
    __m128i u = _mm_madd_epi16(_mm_and_si128(src, K16_00FF), a);                    // even bytes, weighted and pair-summed
    __m128i v = _mm_madd_epi16(_mm_and_si128(_mm_srli_si128(src, 1), K16_00FF), a); // odd bytes, weighted and pair-summed
    _mm_store_si128(buffer, _mm_or_si128(u, _mm_slli_si128(v, 2)));                 // interleave the 16-bit results
}
Bitboard& operator |= (const Bitboard& rhs) {
#if defined (HAVE_SSE2) || defined (HAVE_SSE4)
    _mm_store_si128(&this->m_, _mm_or_si128(this->m_, rhs.m_));
#else
    this->p_[0] |= rhs.p(0);
    this->p_[1] |= rhs.p(1);
#endif
    return *this;
}
void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) {
  float *symbolsPtr = (float*) symbols;
  __m128i *resultPtr = (__m128i*) llr;
  __m128 symbol1, symbol2;
  __m128i symbol_i1, symbol_i2, symbol_i, symbol_abs;
  __m128i offset = _mm_set1_epi16(2*SCALE_SHORT_CONV_QAM16/sqrt(10));
  __m128i result11, result12, result22, result21;
  __m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV_QAM16);

  __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0);
  __m128i shuffle_abs_1     = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff);
  __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8);
  __m128i shuffle_abs_2     = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff);

  for (int i = 0; i < nsymbols/4; i++) {
    symbol1 = _mm_load_ps(symbolsPtr); symbolsPtr += 4;
    symbol2 = _mm_load_ps(symbolsPtr); symbolsPtr += 4;
    symbol_i1 = _mm_cvtps_epi32(_mm_mul_ps(symbol1, scale_v));
    symbol_i2 = _mm_cvtps_epi32(_mm_mul_ps(symbol2, scale_v));
    symbol_i = _mm_packs_epi32(symbol_i1, symbol_i2);

    symbol_abs = _mm_abs_epi16(symbol_i);
    symbol_abs = _mm_sub_epi16(symbol_abs, offset);

    result11 = _mm_shuffle_epi8(symbol_i, shuffle_negated_1);
    result12 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_1);
    result21 = _mm_shuffle_epi8(symbol_i, shuffle_negated_2);
    result22 = _mm_shuffle_epi8(symbol_abs, shuffle_abs_2);

    _mm_store_si128(resultPtr, _mm_or_si128(result11, result12)); resultPtr++;
    _mm_store_si128(resultPtr, _mm_or_si128(result21, result22)); resultPtr++;
  }
  // Demodulate last symbols
  for (int i = 4*(nsymbols/4); i < nsymbols; i++) {
    short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i]));
    short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[i]));

    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
    llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
  }
}
void FREAK::extractDescriptor(uchar *pointsValue, void ** ptr) const {
    __m128i** ptrSSE = (__m128i**) ptr;

    // note that the comparison order is modified in each block (but the first 128
    // comparisons remain globally the same --> this does not affect the 128/384-bit
    // segmented matching strategy)
    int cnt = 0;
    for( int n = FREAK_NB_PAIRS/128; n-- ; ) {
        __m128i result128 = _mm_setzero_si128();
        for( int m = 128/16; m--; cnt += 16 ) {
            __m128i operand1 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].i],
                                            pointsValue[descriptionPairs[cnt+1].i],
                                            pointsValue[descriptionPairs[cnt+2].i],
                                            pointsValue[descriptionPairs[cnt+3].i],
                                            pointsValue[descriptionPairs[cnt+4].i],
                                            pointsValue[descriptionPairs[cnt+5].i],
                                            pointsValue[descriptionPairs[cnt+6].i],
                                            pointsValue[descriptionPairs[cnt+7].i],
                                            pointsValue[descriptionPairs[cnt+8].i],
                                            pointsValue[descriptionPairs[cnt+9].i],
                                            pointsValue[descriptionPairs[cnt+10].i],
                                            pointsValue[descriptionPairs[cnt+11].i],
                                            pointsValue[descriptionPairs[cnt+12].i],
                                            pointsValue[descriptionPairs[cnt+13].i],
                                            pointsValue[descriptionPairs[cnt+14].i],
                                            pointsValue[descriptionPairs[cnt+15].i]);

            __m128i operand2 = _mm_set_epi8(pointsValue[descriptionPairs[cnt+0].j],
                                            pointsValue[descriptionPairs[cnt+1].j],
                                            pointsValue[descriptionPairs[cnt+2].j],
                                            pointsValue[descriptionPairs[cnt+3].j],
                                            pointsValue[descriptionPairs[cnt+4].j],
                                            pointsValue[descriptionPairs[cnt+5].j],
                                            pointsValue[descriptionPairs[cnt+6].j],
                                            pointsValue[descriptionPairs[cnt+7].j],
                                            pointsValue[descriptionPairs[cnt+8].j],
                                            pointsValue[descriptionPairs[cnt+9].j],
                                            pointsValue[descriptionPairs[cnt+10].j],
                                            pointsValue[descriptionPairs[cnt+11].j],
                                            pointsValue[descriptionPairs[cnt+12].j],
                                            pointsValue[descriptionPairs[cnt+13].j],
                                            pointsValue[descriptionPairs[cnt+14].j],
                                            pointsValue[descriptionPairs[cnt+15].j]);

            __m128i workReg = _mm_min_epu8(operand1, operand2);  // emulated "not less than" for 8-bit UNSIGNED integers
            workReg = _mm_cmpeq_epi8(workReg, operand2);         // emulated "not less than" for 8-bit UNSIGNED integers

            workReg = _mm_and_si128(_mm_set1_epi16(short(0x8080 >> m)), workReg); // merge the latest 16 bits into the 128-bit vector until full
            result128 = _mm_or_si128(result128, workReg);
        }
        (**ptrSSE) = result128;
        ++(*ptrSSE);
    }
    (*ptrSSE) -= 8;
}
SIMDValue SIMDInt16x8Operation::OpOr(const SIMDValue& aValue, const SIMDValue& bValue) {
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128i_value = _mm_or_si128(tmpaValue.m128i_value, tmpbValue.m128i_value); // a | b

    return X86SIMDValue::ToSIMDValue(x86Result);
}
INLINE __m128 shade(ColorInterp const&, const SWR_TRIANGLE_DESC &work,
                    WideVector<ColorInterp::NUM_ATTRIBUTES, __m128> const& pAttrs,
                    BYTE*, BYTE*, UINT*) {
    // convert float to unorm
    __m128i vBlueI  = vFloatToUnorm(get<2>(pAttrs));
    __m128i vGreenI = vFloatToUnorm(get<1>(pAttrs));
    __m128i vRedI   = vFloatToUnorm(get<0>(pAttrs));
    __m128i vAlpha  = _mm_set1_epi32(0xff000000);

    // pack: B | G<<8 | R<<16 | A<<24
    __m128i vPixel = vBlueI;
    vGreenI = _mm_slli_epi32(vGreenI, 8);
    vRedI = _mm_slli_epi32(vRedI, 16);
    vPixel = _mm_or_si128(vPixel, vGreenI);
    vPixel = _mm_or_si128(vPixel, vRedI);
    vPixel = _mm_or_si128(vPixel, vAlpha);

    return _mm_castsi128_ps(vPixel);
}
SIMDCOMP_PURE uint32_t maxbits(const uint32_t * begin) {
    const __m128i* pin = (const __m128i*)(begin);
    __m128i accumulator = _mm_loadu_si128(pin);
    uint32_t k = 1;
    for (; 4 * k < SIMDBlockSize; ++k) {
        __m128i newvec = _mm_loadu_si128(pin + k);
        accumulator = _mm_or_si128(accumulator, newvec);
    }
    return maxbitas32int(accumulator);
}
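/* For context, a reduction consistent with maxbitas32int's contract (a
 * sketch, not the library's implementation): OR the four lanes together,
 * then count how many bits the combined value needs. */
static uint32_t maxbitas32int_sketch(__m128i accumulator) {
    uint32_t tmp[4];
    uint32_t acc, bits = 0;
    _mm_storeu_si128((__m128i *)tmp, accumulator);
    acc = tmp[0] | tmp[1] | tmp[2] | tmp[3];
    while (acc != 0) {   /* position of the highest set bit */
        ++bits;
        acc >>= 1;
    }
    return bits;
}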
static inline void desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
                                     struct rte_mbuf **rx_pkts) {
    __m128i ptype0, ptype1, vtag0, vtag1;
    union {
        uint16_t e[4];
        uint64_t dword;
    } vol;

    /* mask everything except rss type */
    const __m128i rsstype_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            0x000F, 0x000F, 0x000F, 0x000F);

    /* map rss type to rss hash flag */
    const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
            0, 0, 0, PKT_RX_RSS_HASH,
            PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
            PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

    /* mask everything except vlan present bit */
    const __m128i vlan_msk = _mm_set_epi16(
            0x0000, 0x0000, 0x0000, 0x0000,
            IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
            IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);

    /* map vlan present (0x8) to ol_flags */
    const __m128i vlan_map = _mm_set_epi8(
            0, 0, 0, 0,
            0, 0, 0, vlan_flags,
            0, 0, 0, 0,
            0, 0, 0, 0);

    ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
    ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
    vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
    vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

    ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
    ptype0 = _mm_and_si128(ptype0, rsstype_msk);
    ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

    vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
    vtag1 = _mm_and_si128(vtag1, vlan_msk);
    vtag1 = _mm_shuffle_epi8(vlan_map, vtag1);

    vtag1 = _mm_or_si128(ptype0, vtag1);
    vol.dword = _mm_cvtsi128_si64(vtag1);

    rx_pkts[0]->ol_flags = vol.e[0];
    rx_pkts[1]->ol_flags = vol.e[1];
    rx_pkts[2]->ol_flags = vol.e[2];
    rx_pkts[3]->ol_flags = vol.e[3];
}
inline void sum_offset( __m128i * X, __m128i * A, __m128i * B, __m128i * C,
                        unsigned size_sse_ar, unsigned shift ) {
    // Bit-sliced accumulation: per bit position, {B,A} is a 2-bit counter that
    // is incremented by the corresponding bit of X, and C latches the carry out.
    for(unsigned i = 0; i < size_sse_ar; ++i) {
        __m128i tmp = _mm_and_si128(A[i], X[shift + i]);      // carry out of the low bit
        A[i] = _mm_xor_si128(A[i], X[shift + i]);             // low bit: A += X (mod 2)
        C[i] = _mm_or_si128(C[i], _mm_and_si128(B[i], tmp));  // latch overflow of the high bit
        B[i] = _mm_xor_si128(B[i], tmp);                      // high bit: B += carry (mod 2)
    }
}
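/* Per bit position, the loop body implements this 2-bit ripple-carry update
 * (a scalar sketch with single-bit operands, for illustration only): */
static inline void add_bit_to_2bit_counter(unsigned x, unsigned *a, unsigned *b, unsigned *c) {
    unsigned carry = *a & x;   /* carry out of the low bit          */
    *a ^= x;                   /* low bit of the counter            */
    *c |= *b & carry;          /* latch overflow (count reached 4)  */
    *b ^= carry;               /* high bit of the counter           */
}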
/*** simple union */
TM_INLINE void unionwith(const BitFilter<BITS>& rhs) {
#ifdef STM_USE_SSE
    for (uint32_t i = 0; i < VEC_BLOCKS; ++i)
        vec_filter[i] = _mm_or_si128(vec_filter[i], rhs.vec_filter[i]);
#else
    for (uint32_t i = 0; i < WORD_BLOCKS; ++i)
        word_filter[i] |= rhs.word_filter[i];
#endif
}
SIMDValue SIMDInt16x8Operation::OpGreaterThanOrEqual(const SIMDValue& aValue, const SIMDValue& bValue) {
    X86SIMDValue x86Result, x86Result1, x86Result2;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result1.m128i_value = _mm_cmpgt_epi16(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a > b?
    x86Result2.m128i_value = _mm_cmpeq_epi16(tmpaValue.m128i_value, tmpbValue.m128i_value); // compare a == b?
    x86Result.m128i_value = _mm_or_si128(x86Result1.m128i_value, x86Result2.m128i_value);   // (a > b) | (a == b), i.e. a >= b

    return X86SIMDValue::ToSIMDValue(x86Result);
}
SIMD_INLINE bool ColsHasIndex(const uint8_t * mask, size_t stride, size_t size,
                              __m128i index, uint8_t * cols) {
    __m128i _cols = _mm_setzero_si128();
    for (size_t row = 0; row < size; ++row) {
        _cols = _mm_or_si128(_cols, _mm_cmpeq_epi8(_mm_loadu_si128((__m128i*)mask), index));
        mask += stride;
    }
    _mm_storeu_si128((__m128i*)cols, _cols);
    return !_mm_testz_si128(_cols, K_INV_ZERO);
}
static __m128i S(__m128i x, int i) {
    const __m128i a0 = _mm_shuffle_epi8(x, g_shuffles[i][0]);
    const __m128i b0 = _mm_shuffle_epi8(x, g_shuffles[i][1]);
    const __m128i a1 = _mm_min_epi8(a0, b0);
    const __m128i b1 = _mm_max_epi8(a0, b0);
    const __m128i a2 = _mm_shuffle_epi8(a1, g_shuffles[i][2]);
    const __m128i b2 = _mm_shuffle_epi8(b1, g_shuffles[i][3]);
    return _mm_or_si128(a2, b2);
}
__m64 _m_por(__m64 _MM1, __m64 _MM2) {
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_or_si128(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
static void ConvertBGRAToRGB565_SSE2(const uint32_t* src, int num_pixels, uint8_t* dst) {
  const __m128i mask_0xe0 = _mm_set1_epi8(0xe0);
  const __m128i mask_0xf8 = _mm_set1_epi8(0xf8);
  const __m128i mask_0x07 = _mm_set1_epi8(0x07);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);            // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);            // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);    // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);    // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);        // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);        // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);        // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);        // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);       // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);       // r0...r7 | b0...b7
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf8);      // -r0..-r7|-b0..-b7
    const __m128i g_lo1 = _mm_srli_epi16(ga0, 5);
    const __m128i g_lo2 = _mm_and_si128(g_lo1, mask_0x07);  // g0-...g7-|xx (3b)
    const __m128i g_hi1 = _mm_slli_epi16(ga0, 3);
    const __m128i g_hi2 = _mm_and_si128(g_hi1, mask_0xe0);  // -g0...-g7|xx (3b)
    const __m128i b0 = _mm_srli_si128(rb1, 8);              // -b0...-b7|0
    const __m128i rg1 = _mm_or_si128(rb1, g_lo2);           // gr0...gr7|xx
    const __m128i b1 = _mm_srli_epi16(b0, 3);
    const __m128i gb1 = _mm_or_si128(b1, g_hi2);            // bg0...bg7|xx
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(gb1, rg1);       // rggb0...rggb7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rg1, gb1);       // bgrb0...bgrb7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGB565_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
static WEBP_INLINE uint32_t Select_SSE2(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb;
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_cvtsi32_si128(a);
  const __m128i B0 = _mm_cvtsi32_si128(b);
  const __m128i C0 = _mm_cvtsi32_si128(c);
  const __m128i AC0 = _mm_subs_epu8(A0, C0);
  const __m128i CA0 = _mm_subs_epu8(C0, A0);
  const __m128i BC0 = _mm_subs_epu8(B0, C0);
  const __m128i CB0 = _mm_subs_epu8(C0, B0);
  const __m128i AC = _mm_or_si128(AC0, CA0);
  const __m128i BC = _mm_or_si128(BC0, CB0);
  const __m128i pa = _mm_unpacklo_epi8(AC, zero);  // |a - c|
  const __m128i pb = _mm_unpacklo_epi8(BC, zero);  // |b - c|
  const __m128i diff = _mm_sub_epi16(pb, pa);
  {
    int16_t out[8];
    _mm_storeu_si128((__m128i*)out, diff);
    pa_minus_pb = out[0] + out[1] + out[2] + out[3];
  }
  return (pa_minus_pb <= 0) ? a : b;
}
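/* A scalar reference of the same selection (a sketch, assuming each uint32_t
 * packs four 8-bit channels and <stdlib.h> is available for abs): sum the
 * per-channel |b-c| - |a-c| differences exactly as the diff vector above,
 * then pick a on <= 0. */
static uint32_t Select_C_sketch(uint32_t a, uint32_t b, uint32_t c) {
  int pa_minus_pb = 0;
  for (int shift = 0; shift < 32; shift += 8) {
    const int pa = abs((int)((a >> shift) & 0xff) - (int)((c >> shift) & 0xff));
    const int pb = abs((int)((b >> shift) & 0xff) - (int)((c >> shift) & 0xff));
    pa_minus_pb += pb - pa;   /* matches _mm_sub_epi16(pb, pa), summed */
  }
  return (pa_minus_pb <= 0) ? a : b;
}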
static inline __m128i colorburn_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                          const __m128i& sa, const __m128i& da) {
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // if (dc == da)
    __m128i cmp1 = _mm_cmpeq_epi32(dc, da);
    __m128i tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = _mm_mullo_epi16(sc, ida);
    __m128i tmp3 = _mm_mullo_epi16(dc, isa);
    __m128i rc1 = _mm_add_epi32(tmp1, tmp2);
    rc1 = _mm_add_epi32(rc1, tmp3);
    rc1 = clamp_div255round_SSE2(rc1);
    rc1 = _mm_and_si128(cmp1, rc1);

    // else if (0 == sc)
    __m128i cmp2 = _mm_cmpeq_epi32(sc, _mm_setzero_si128());
    __m128i rc2 = SkAlphaMulAlpha_SSE2(dc, isa);
    __m128i cmp = _mm_andnot_si128(cmp1, cmp2);
    rc2 = _mm_and_si128(cmp, rc2);

    // else
    __m128i cmp3 = _mm_or_si128(cmp1, cmp2);
    __m128i tmp4 = _mm_sub_epi32(da, dc);
    tmp4 = Multiply32_SSE2(tmp4, sa);
    tmp4 = shim_mm_div_epi32(tmp4, sc);
    __m128i tmp5 = _mm_sub_epi32(da, SkMin32_SSE2(da, tmp4));
    tmp5 = Multiply32_SSE2(sa, tmp5);
    __m128i rc3 = _mm_add_epi32(tmp5, tmp2);
    rc3 = _mm_add_epi32(rc3, tmp3);
    rc3 = clamp_div255round_SSE2(rc3);
    rc3 = _mm_andnot_si128(cmp3, rc3);

    __m128i rc = _mm_or_si128(rc1, rc2);
    rc = _mm_or_si128(rc, rc3);
    return rc;
}
/* maxbit over |length| integers with provided initial value */
uint32_t simdmaxbitsd1_length(uint32_t initvalue, const uint32_t * in, uint32_t length) {
    __m128i newvec;
    __m128i oldvec;
    __m128i initoffset;
    __m128i accumulator;
    const __m128i *pin;
    uint32_t tmparray[4];
    uint32_t k = 1;
    uint32_t acc;

    assert(length > 0);
    pin = (const __m128i *)(in);
    initoffset = _mm_set1_epi32(initvalue);

    switch (length) {
    case 1:
        newvec = _mm_set1_epi32(in[0]);
        break;
    case 2:
        newvec = _mm_setr_epi32(in[0], in[1], in[1], in[1]);
        break;
    case 3:
        newvec = _mm_setr_epi32(in[0], in[1], in[2], in[2]);
        break;
    default:
        newvec = _mm_loadu_si128(pin);
        break;
    }
    accumulator = Delta(newvec, initoffset);
    oldvec = newvec;

    /* process 4 integers and build an accumulator */
    while (k * 4 + 4 <= length) {
        newvec = _mm_loadu_si128(pin + k);
        accumulator = _mm_or_si128(accumulator, Delta(newvec, oldvec));
        oldvec = newvec;
        k++;
    }

    /* extract the accumulator as an integer */
    _mm_storeu_si128((__m128i *)(tmparray), accumulator);
    acc = tmparray[0] | tmparray[1] | tmparray[2] | tmparray[3];

    /* now process the remaining integers */
    for (k *= 4; k < length; k++)
        acc |= in[k] - (k == 0 ? initvalue : in[k - 1]);

    /* return the number of bits */
    return bits(acc);
}
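/* Example use (a sketch, assuming <stdio.h>): the deltas of {1, 2, 4, 8, 16, 32}
 * starting from initvalue 0 are {1, 1, 2, 4, 8, 16}, whose OR is 31, so this
 * prints 5 -- the bit width needed to store every delta. */
static void simdmaxbitsd1_length_demo(void) {
    static const uint32_t data[6] = { 1, 2, 4, 8, 16, 32 };
    printf("%u\n", simdmaxbitsd1_length(0, data, 6));
}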
void LoadRGBA8ToBGRA8_SSE2(size_t width, size_t height, size_t depth,
                           const uint8_t *input, size_t inputRowPitch, size_t inputDepthPitch,
                           uint8_t *output, size_t outputRowPitch, size_t outputDepthPitch) {
#if defined(_M_ARM)
    // Ensure that this function is reported as not implemented for ARM builds because
    // the instructions below are not present for that architecture.
    UNIMPLEMENTED();
    return;
#else
    __m128i brMask = _mm_set1_epi32(0x00ff00ff);

    for (size_t z = 0; z < depth; z++) {
        for (size_t y = 0; y < height; y++) {
            const uint32_t *source = OffsetDataPointer<uint32_t>(input, y, z, inputRowPitch, inputDepthPitch);
            uint32_t *dest = OffsetDataPointer<uint32_t>(output, y, z, outputRowPitch, outputDepthPitch);

            size_t x = 0;

            // Make output writes aligned
            for (; ((reinterpret_cast<intptr_t>(&dest[x]) & 15) != 0) && x < width; x++) {
                uint32_t rgba = source[x];
                dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
            }

            for (; x + 3 < width; x += 4) {
                __m128i sourceData = _mm_loadu_si128(reinterpret_cast<const __m128i*>(&source[x]));
                // Mask out g and a, which don't change
                __m128i gaComponents = _mm_andnot_si128(brMask, sourceData);
                // Mask out b and r
                __m128i brComponents = _mm_and_si128(sourceData, brMask);
                // Swap b and r
                __m128i brSwapped = _mm_shufflehi_epi16(
                    _mm_shufflelo_epi16(brComponents, _MM_SHUFFLE(2, 3, 0, 1)),
                    _MM_SHUFFLE(2, 3, 0, 1));
                __m128i result = _mm_or_si128(gaComponents, brSwapped);
                _mm_store_si128(reinterpret_cast<__m128i*>(&dest[x]), result);
            }

            // Perform leftover writes
            for (; x < width; x++) {
                uint32_t rgba = source[x];
                dest[x] = (_rotl(rgba, 16) & 0x00ff00ff) | (rgba & 0xff00ff00);
            }
        }
    }
#endif
}
static inline __m128i colordodge_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                           const __m128i& sa, const __m128i& da) {
    __m128i diff = _mm_sub_epi32(sa, sc);
    __m128i ida = _mm_sub_epi32(_mm_set1_epi32(255), da);
    __m128i isa = _mm_sub_epi32(_mm_set1_epi32(255), sa);

    // if (0 == dc)
    __m128i cmp1 = _mm_cmpeq_epi32(dc, _mm_setzero_si128());
    __m128i rc1 = _mm_and_si128(cmp1, SkAlphaMulAlpha_SSE2(sc, ida));

    // else if (0 == diff)
    __m128i cmp2 = _mm_cmpeq_epi32(diff, _mm_setzero_si128());
    __m128i cmp = _mm_andnot_si128(cmp1, cmp2);
    __m128i tmp1 = _mm_mullo_epi16(sa, da);
    __m128i tmp2 = _mm_mullo_epi16(sc, ida);
    __m128i tmp3 = _mm_mullo_epi16(dc, isa);
    __m128i rc2 = _mm_add_epi32(tmp1, tmp2);
    rc2 = _mm_add_epi32(rc2, tmp3);
    rc2 = clamp_div255round_SSE2(rc2);
    rc2 = _mm_and_si128(cmp, rc2);

    // else
    __m128i cmp3 = _mm_or_si128(cmp1, cmp2);
    __m128i value = _mm_mullo_epi16(dc, sa);
    diff = shim_mm_div_epi32(value, diff);
    __m128i tmp4 = SkMin32_SSE2(da, diff);
    tmp4 = Multiply32_SSE2(sa, tmp4);
    __m128i rc3 = _mm_add_epi32(tmp4, tmp2);
    rc3 = _mm_add_epi32(rc3, tmp3);
    rc3 = clamp_div255round_SSE2(rc3);
    rc3 = _mm_andnot_si128(cmp3, rc3);

    __m128i rc = _mm_or_si128(rc1, rc2);
    rc = _mm_or_si128(rc, rc3);
    return rc;
}
static WEBP_INLINE void TransformColorInverse(const VP8LMultipliers* const m,
                                              uint32_t* argb_data, int num_pixels) {
  const __m128i g_to_r = _mm_set1_epi32(m->green_to_red_);       // multipliers
  const __m128i g_to_b = _mm_set1_epi32(m->green_to_blue_);
  const __m128i r_to_b = _mm_set1_epi32(m->red_to_blue_);
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((__m128i*)&argb_data[i]);
    const __m128i alpha_green_mask = _mm_set1_epi32(0xff00ff00); // masks
    const __m128i red_mask = _mm_set1_epi32(0x00ff0000);
    const __m128i green_mask = _mm_set1_epi32(0x0000ff00);
    const __m128i lower_8bit_mask = _mm_set1_epi32(0x000000ff);
    const __m128i ag = _mm_and_si128(in, alpha_green_mask);      // alpha, green
    const __m128i r = _mm_srli_epi32(_mm_and_si128(in, red_mask), 16);
    const __m128i g = _mm_srli_epi32(_mm_and_si128(in, green_mask), 8);
    const __m128i b = in;

    const __m128i r_delta = ColorTransformDelta(g_to_r, g);      // red
    const __m128i r_new = _mm_and_si128(_mm_add_epi32(r, r_delta), lower_8bit_mask);
    const __m128i r_new_shifted = _mm_slli_epi32(r_new, 16);

    const __m128i b_delta_1 = ColorTransformDelta(g_to_b, g);    // blue
    const __m128i b_delta_2 = ColorTransformDelta(r_to_b, r_new);
    const __m128i b_delta = _mm_add_epi32(b_delta_1, b_delta_2);
    const __m128i b_new = _mm_and_si128(_mm_add_epi32(b, b_delta), lower_8bit_mask);

    const __m128i out = _mm_or_si128(_mm_or_si128(ag, r_new_shifted), b_new);
    _mm_storeu_si128((__m128i*)&argb_data[i], out);
  }
  // Fall-back to C-version for left-overs.
  VP8LTransformColorInverse_C(m, argb_data + i, num_pixels - i);
}
/*
=====================
R_CopyDecalSurface
=====================
*/
static void R_CopyDecalSurface( idDrawVert * verts, int numVerts, triIndex_t * indexes, int numIndexes,
                                const decal_t * decal, const float fadeColor[4] ) {
    assert_16_byte_aligned( &verts[numVerts] );
    assert_16_byte_aligned( &indexes[numIndexes] );
    assert_16_byte_aligned( decal->indexes );
    assert_16_byte_aligned( decal->verts );
    assert( ( ( decal->numVerts * sizeof( idDrawVert ) ) & 15 ) == 0 );
    assert( ( ( decal->numIndexes * sizeof( triIndex_t ) ) & 15 ) == 0 );
    assert_16_byte_aligned( fadeColor );

    const __m128i vector_int_num_verts = _mm_shuffle_epi32( _mm_cvtsi32_si128( numVerts ), 0 );
    const __m128i vector_short_num_verts = _mm_packs_epi32( vector_int_num_verts, vector_int_num_verts );
    const __m128 vector_fade_color = _mm_load_ps( fadeColor );
    const __m128i vector_color_mask = _mm_set_epi32( 0, -1, 0, 0 );

    // copy vertices and apply depth/time based fading
    assert_offsetof( idDrawVert, color, 6 * 4 );
    for ( int i = 0; i < decal->numVerts; i++ ) {
        const idDrawVert & srcVert = decal->verts[i];
        idDrawVert & dstVert = verts[numVerts + i];

        __m128i v0 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert +  0 ) );
        __m128i v1 = _mm_load_si128( (const __m128i *)( (byte *)&srcVert + 16 ) );

        __m128 depthFade = _mm_splat_ps( _mm_load_ss( decal->vertDepthFade + i ), 0 );
        __m128 timeDepthFade = _mm_mul_ps( depthFade, vector_fade_color );
        __m128i colorInt = _mm_cvtps_epi32( timeDepthFade );
        __m128i colorShort = _mm_packs_epi32( colorInt, colorInt );
        __m128i colorByte = _mm_packus_epi16( colorShort, colorShort );
        v1 = _mm_or_si128( v1, _mm_and_si128( colorByte, vector_color_mask ) );

        _mm_stream_si128( (__m128i *)( (byte *)&dstVert +  0 ), v0 );
        _mm_stream_si128( (__m128i *)( (byte *)&dstVert + 16 ), v1 );
    }

    // copy indexes
    assert( ( decal->numIndexes & 7 ) == 0 );
    assert( sizeof( triIndex_t ) == 2 );
    for ( int i = 0; i < decal->numIndexes; i += 8 ) {
        __m128i vi = _mm_load_si128( (const __m128i *)&decal->indexes[i] );
        vi = _mm_add_epi16( vi, vector_short_num_verts );
        _mm_stream_si128( (__m128i *)&indexes[numIndexes + i], vi );
    }

    _mm_sfence();
}
static inline __m128i darken_byte_SSE2(const __m128i& sc, const __m128i& dc,
                                       const __m128i& sa, const __m128i& da) {
    __m128i sd = _mm_mullo_epi16(sc, da);
    __m128i ds = _mm_mullo_epi16(dc, sa);

    __m128i cmp = _mm_cmplt_epi32(sd, ds);

    __m128i tmp = _mm_add_epi32(sc, dc);
    __m128i ret1 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(ds));
    __m128i ret2 = _mm_sub_epi32(tmp, SkDiv255Round_SSE2(sd));
    __m128i ret = _mm_or_si128(_mm_and_si128(cmp, ret1),
                               _mm_andnot_si128(cmp, ret2));
    return ret;
}
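/* Scalar reference per channel (a sketch; div255_round is a stand-in for
 * Skia's SkDiv255Round, i.e. a rounded divide by 255): darken keeps the side
 * whose premultiplied product, sc*da vs. dc*sa, is smaller. */
static inline int div255_round(int x) {
    x += 128;
    return (x + (x >> 8)) >> 8;   /* exact round(x / 255) for 0 <= x <= 255*255 */
}
static inline int darken_byte_c(int sc, int dc, int sa, int da) {
    return (sc * da < dc * sa) ? sc + dc - div255_round(dc * sa)
                               : sc + dc - div255_round(sc * da);
}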