static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i ptype0, ptype1, vtag0, vtag1;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
	vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

	ptype1 = _mm_unpacklo_epi32(ptype0, ptype1);
	vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);

	ptype1 = _mm_slli_epi16(ptype1, PTYPE_SHIFT);
	vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);

	ptype1 = _mm_or_si128(ptype1, vtag1);
	vol.dword = _mm_cvtsi128_si64(ptype1) & OLFLAGS_MASK_V;

	rx_pkts[0]->ol_flags = vol.e[0];
	rx_pkts[1]->ol_flags = vol.e[1];
	rx_pkts[2]->ol_flags = vol.e[2];
	rx_pkts[3]->ol_flags = vol.e[3];
}
void imageFilterMean_SSE2(unsigned char *src1, unsigned char *src2, unsigned char *dst, int length)
{
    int n = length;

    // Compute first few values so we're on a 16-byte boundary in dst
    while( (((long)dst & 0xF) > 0) && (n > 0) ) {
        MEAN_PIXEL();
        --n; ++dst; ++src1; ++src2;
    }

    // Do bulk of processing using SSE2 (find the mean of 16 8-bit unsigned integers, with saturation)
    __m128i mask = _mm_set1_epi8(0x7F);
    while (n >= 16) {
        __m128i s1 = _mm_loadu_si128((__m128i*)src1);
        s1 = _mm_srli_epi16(s1, 1);   // shift right 1
        s1 = _mm_and_si128(s1, mask); // apply byte-mask
        __m128i s2 = _mm_loadu_si128((__m128i*)src2);
        s2 = _mm_srli_epi16(s2, 1);   // shift right 1
        s2 = _mm_and_si128(s2, mask); // apply byte-mask
        __m128i r = _mm_adds_epu8(s1, s2);
        _mm_store_si128((__m128i*)dst, r);

        n -= 16; src1 += 16; src2 += 16; dst += 16;
    }

    // If any bytes are left over, deal with them individually
    ++n;
    BASIC_MEAN();
}
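// The pattern above is worth calling out: SSE2 has no 8-bit shift, so the usual
// workaround is a 16-bit _mm_srli_epi16 followed by a byte mask that clears the
// bit leaked in from each lane's neighbour. A minimal sketch of the idiom on
// its own (helper name is ours, not ONScripter's):
static inline __m128i srli_epu8_by1(__m128i v)
{
    // per-byte v >> 1: shift the 16-bit lanes, then clear bit 7 of every byte
    return _mm_and_si128(_mm_srli_epi16(v, 1), _mm_set1_epi8(0x7F));
}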
//! \brief
//! Divide 8 16-bit uints by 255:
//! x := ((x + 1) + (x >> 8)) >> 8
//! See: http://www.alfredklomp.com/programming/sse-intrinsics/
//!
inline __m128i
_mm_div255_epu16(__m128i x)
{
    return _mm_srli_epi16(_mm_adds_epu16(
        _mm_adds_epu16(x, _mm_set1_epi16(1)),
        _mm_srli_epi16(x, 8)), 8);
}
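//! A typical use of the helper above (our sketch, not from the original source):
//! scale eight 8-bit-range values by eight 8-bit alphas held in 16-bit lanes.
//! The product is at most 255 * 255 = 65025, so the saturating adds inside
//! _mm_div255_epu16 never clip and the result equals x / 255 exactly.
inline __m128i
mul_div255_epu16(__m128i color16, __m128i alpha16)
{
    return _mm_div255_epu16(_mm_mullo_epi16(color16, alpha16));
}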
/* @note: When this function is changed, make corresponding change to
 * fm10k_dev_supported_ptypes_get().
 */
static inline void
fm10k_desc_to_pktype_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
	__m128i l3l4type0, l3l4type1, l3type, l4type;
	union {
		uint16_t e[4];
		uint64_t dword;
	} vol;

	/* L3 pkt type mask  Bit4 to Bit6 */
	const __m128i l3type_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x0070, 0x0070, 0x0070, 0x0070);

	/* L4 pkt type mask  Bit7 to Bit9 */
	const __m128i l4type_msk = _mm_set_epi16(
			0x0000, 0x0000, 0x0000, 0x0000,
			0x0380, 0x0380, 0x0380, 0x0380);

	/* convert RRC l3 type to mbuf format */
	const __m128i l3type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0,
			0, 0, 0, RTE_PTYPE_L3_IPV6_EXT, RTE_PTYPE_L3_IPV6,
			RTE_PTYPE_L3_IPV4_EXT, RTE_PTYPE_L3_IPV4, 0);

	/* Convert RRC l4 type to mbuf format. The l4type_flags entries are
	 * stored shifted right by 8 bits so that each value fits in 8 bits.
	 */
	const __m128i l4type_flags = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0,
			RTE_PTYPE_TUNNEL_GENEVE >> 8,
			RTE_PTYPE_TUNNEL_NVGRE >> 8,
			RTE_PTYPE_TUNNEL_VXLAN >> 8,
			RTE_PTYPE_TUNNEL_GRE >> 8,
			RTE_PTYPE_L4_UDP >> 8,
			RTE_PTYPE_L4_TCP >> 8,
			0);

	l3l4type0 = _mm_unpacklo_epi16(descs[0], descs[1]);
	l3l4type1 = _mm_unpacklo_epi16(descs[2], descs[3]);
	l3l4type0 = _mm_unpacklo_epi32(l3l4type0, l3l4type1);

	l3type = _mm_and_si128(l3l4type0, l3type_msk);
	l4type = _mm_and_si128(l3l4type0, l4type_msk);

	l3type = _mm_srli_epi16(l3type, L3TYPE_SHIFT);
	l4type = _mm_srli_epi16(l4type, L4TYPE_SHIFT);

	l3type = _mm_shuffle_epi8(l3type_flags, l3type);
	/* l4type_flags entries were pre-shifted right by 8 bits; shift back */
	l4type = _mm_shuffle_epi8(l4type_flags, l4type);
	l4type = _mm_slli_epi16(l4type, 8);

	l3l4type0 = _mm_or_si128(l3type, l4type);
	vol.dword = _mm_cvtsi128_si64(l3l4type0);

	rx_pkts[0]->packet_type = vol.e[0];
	rx_pkts[1]->packet_type = vol.e[1];
	rx_pkts[2]->packet_type = vol.e[2];
	rx_pkts[3]->packet_type = vol.e[3];
}
// STACK16 selects 16-bit stacked output; assumed here to be a compile-time
// template parameter, since it gates _mm_set1_ps and the branchless layout.
template <bool STACK16>
static inline void packed_shader_to_yuv_3(uint8_t** dstp, const uint8_t** srcp,
    const int dpitch, const int spitch, const int width, const int height,
    void* _buff) noexcept
{
    const uint8_t* s = srcp[0];
    uint8_t* dr = dstp[0];
    uint8_t* dg = dstp[1];
    uint8_t* db = dstp[2];
    float* buff = reinterpret_cast<float*>(_buff);
    uint8_t *lr, *lg, *lb;

    const __m128 coef = _mm_set1_ps(STACK16 ? 65535.0f : 255.0f);
    const __m128i mask = _mm_set1_epi16(0x00FF);
    if (STACK16) {
        lr = dr + height * dpitch;
        lg = dg + height * dpitch;
        lb = db + height * dpitch;
    }

    for (int y = 0; y < height; ++y) {
        convert_half_to_float(buff, s, width * 4);
        for (int x = 0; x < width; x += 4) {
            __m128i s0 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 0)));  // R0,G0,B0,A0
            __m128i s1 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 4)));  // R1,G1,B1,A1
            __m128i s2 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 8)));  // R2,G2,B2,A2
            __m128i s3 = _mm_cvtps_epi32(_mm_mul_ps(coef, _mm_load_ps(buff + 4 * x + 12))); // R3,G3,B3,A3

            s0 = _mm_or_si128(s0, _mm_slli_epi32(s1, 16)); // R0,R1,G0,G1,B0,B1,A0,A1
            s1 = _mm_or_si128(s2, _mm_slli_epi32(s3, 16)); // R2,R3,G2,G3,B2,B3,A2,A3
            s2 = _mm_unpacklo_epi32(s0, s1);               // R0,R1,R2,R3,G0,G1,G2,G3
            s3 = _mm_unpackhi_epi32(s0, s1);               // B0,B1,B2,B3,A0,A1,A2,A3

            if (!STACK16) {
                s0 = _mm_packus_epi16(s2, s3);
                *(reinterpret_cast<int32_t*>(dr + x)) = _mm_cvtsi128_si32(s0);
                *(reinterpret_cast<int32_t*>(dg + x)) = _mm_cvtsi128_si32(_mm_srli_si128(s0, 4));
                *(reinterpret_cast<int32_t*>(db + x)) = _mm_cvtsi128_si32(_mm_srli_si128(s0, 8));
            } else {
                __m128i rgbamsb = _mm_packus_epi16(_mm_srli_epi16(s2, 8), _mm_srli_epi16(s3, 8));
                __m128i rgbalsb = _mm_packus_epi16(_mm_and_si128(s2, mask), _mm_and_si128(s3, mask));
                *(reinterpret_cast<int32_t*>(dr + x)) = _mm_cvtsi128_si32(rgbamsb);
                *(reinterpret_cast<int32_t*>(lr + x)) = _mm_cvtsi128_si32(rgbalsb);
                *(reinterpret_cast<int32_t*>(dg + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbamsb, 4));
                *(reinterpret_cast<int32_t*>(lg + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbalsb, 4));
                *(reinterpret_cast<int32_t*>(db + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbamsb, 8));
                *(reinterpret_cast<int32_t*>(lb + x)) = _mm_cvtsi128_si32(_mm_srli_si128(rgbalsb, 8));
            }
        }
        s += spitch;
        dr += dpitch;
        dg += dpitch;
        db += dpitch;
        if (STACK16) {
            lr += dpitch;
            lg += dpitch;
            lb += dpitch;
        }
    }
}
SIMD_INLINE void Average16(__m128i a[2][2])
{
    a[0][0] = _mm_srli_epi16(_mm_add_epi16(a[0][0], K16_0001), 1);
    a[0][1] = _mm_srli_epi16(_mm_add_epi16(a[0][1], K16_0001), 1);
    a[1][0] = _mm_srli_epi16(_mm_add_epi16(a[1][0], K16_0001), 1);
    a[1][1] = _mm_srli_epi16(_mm_add_epi16(a[1][1], K16_0001), 1);
}
void Lerp_SSE2(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
    static const size_t stride = sizeof(__m128i) * 4;
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);
    static const __m128i round = _mm_set1_epi16(128);

    assert(source1 != NULL && source2 != NULL && dest != NULL);
    assert(size % stride == 0);
    assert(alpha >= 0.0 && alpha <= 1.0);

    const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
    const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i s = _mm_setzero_si128();
    __m128i d = _mm_setzero_si128();
    const __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f + 0.5f));

    __m128i drb, dga, srb, sga;

    for (size_t k = 0, length = size/stride; k < length; ++k) {
        _mm_prefetch(reinterpret_cast<const char*>(source128_1 + PSD), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const char*>(source128_2 + PSD), _MM_HINT_NTA);
        // TODO: assembly optimization: use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N)
        // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/

        for (int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2) {
            // r = d + (s-d)*alpha/256
            s = _mm_load_si128(source128_1);     // AABBGGRR
            d = _mm_load_si128(source128_2);     // AABBGGRR

            srb = _mm_and_si128(lomask, s);      // 00BB00RR // unpack
            sga = _mm_srli_epi16(s, 8);          // 00AA00GG // unpack
            drb = _mm_and_si128(lomask, d);      // 00BB00RR // unpack
            dga = _mm_srli_epi16(d, 8);          // 00AA00GG // unpack

            srb = _mm_sub_epi16(srb, drb);       // BBBBRRRR // sub
            srb = _mm_mullo_epi16(srb, a);       // BBBBRRRR // mul
            srb = _mm_add_epi16(srb, round);

            sga = _mm_sub_epi16(sga, dga);       // AAAAGGGG // sub
            sga = _mm_mullo_epi16(sga, a);       // AAAAGGGG // mul
            sga = _mm_add_epi16(sga, round);

            srb = _mm_srli_epi16(srb, 8);        // 00BB00RR // prepack and div
            sga = _mm_andnot_si128(lomask, sga); // AA00GG00 // prepack and div

            srb = _mm_or_si128(srb, sga);        // AABBGGRR // pack
            srb = _mm_add_epi8(srb, d);          // AABBGGRR // add, there is no overflow (R.N)

            _mm_store_si128(dest128, srb);
        }
    }
}
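// Calling sketch (ours, not part of the original): Lerp_SSE2 uses aligned
// loads/stores and walks the buffers four __m128i at a time, so all three
// pointers must be 16-byte aligned and size a multiple of 64 bytes.
void lerp_example()
{
    alignas(16) static u8 a[256], b[256], out[256];
    Lerp_SSE2(out, a, b, 0.5f, sizeof(out)); // 50/50 blend of a and b
}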
void PreOver_FastSSE2(void* dest, const void* source1, const void* source2, size_t size)
{
    static const size_t stride = sizeof(__m128i) * 4;
    static const u32 PSD = 64;

    static const __m128i lomask = _mm_set1_epi32(0x00FF00FF);

    assert(source1 != NULL && source2 != NULL && dest != NULL);
    assert(size % stride == 0);

    const __m128i* source128_1 = reinterpret_cast<const __m128i*>(source1);
    const __m128i* source128_2 = reinterpret_cast<const __m128i*>(source2);
    __m128i* dest128 = reinterpret_cast<__m128i*>(dest);

    __m128i d, s, a, rb, ag;

    // TODO: dynamic prefetch scheduling distance? needs to be optimized (R.N)
    for (int k = 0, length = size/stride; k < length; ++k) {
        // TODO: put prefetch between calculations? (R.N)
        _mm_prefetch(reinterpret_cast<const s8*>(source128_1 + PSD), _MM_HINT_NTA);
        _mm_prefetch(reinterpret_cast<const s8*>(source128_2 + PSD), _MM_HINT_NTA);

        // work on entire cacheline before next prefetch
        for (int n = 0; n < 4; ++n, ++dest128, ++source128_1, ++source128_2) {
            // TODO: assembly optimization: use PSHUFD on moves before calculations, lower latency than MOVDQA (R.N)
            // http://software.intel.com/en-us/articles/fast-simd-integer-move-for-the-intel-pentiumr-4-processor/
            s = _mm_load_si128(source128_1);     // AABBGGRR
            d = _mm_load_si128(source128_2);     // AABBGGRR

            // set alpha to lo16 from dest
            rb = _mm_srli_epi32(d, 24);          // 000000AA
            a = _mm_slli_epi32(rb, 16);          // 00AA0000
            a = _mm_or_si128(rb, a);             // 00AA00AA

            // fix alpha: a = a > 127 ? a+1 : a
            // NOTE: if removed, an *overflow* will occur with large values (R.N)
            rb = _mm_srli_epi16(a, 7);
            a = _mm_add_epi16(a, rb);

            rb = _mm_and_si128(lomask, s);       // 00BB00RR    unpack
            rb = _mm_mullo_epi16(rb, a);         // BBBBRRRR    mul (D[A]*S)
            rb = _mm_srli_epi16(rb, 8);          // 00BB00RR    prepack and div [(D[A]*S)]/255

            ag = _mm_srli_epi16(s, 8);           // 00AA00GG    unpack
            ag = _mm_mullo_epi16(ag, a);         // AAAAGGGG    mul (D[A]*S)
            ag = _mm_andnot_si128(lomask, ag);   // AA00GG00    prepack and div [(D[A]*S)]/255

            rb = _mm_or_si128(rb, ag);           // AABBGGRR    pack
            rb = _mm_sub_epi8(s, rb);            // sub S-[(D[A]*S)/255]
            d = _mm_add_epi8(d, rb);             // add D+[S-(D[A]*S)/255]

            _mm_store_si128(dest128, d);
        }
    }
}
__m64 interpolvline_1(unsigned char* image, int PicWidthInPix)
{
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    __m64 ret;

    xmm7 = _mm_setzero_si128();

    xmm0 = _mm_movpi64_epi64(*((__m64*)(image - 2*PicWidthInPix)));
    xmm0 = _mm_unpacklo_epi8(xmm0, xmm7);
    xmm1 = _mm_movpi64_epi64(*((__m64*)(image - 1*PicWidthInPix)));
    xmm1 = _mm_unpacklo_epi8(xmm1, xmm7);
    xmm2 = _mm_movpi64_epi64(*((__m64*)(image - 0*PicWidthInPix)));
    xmm2 = _mm_unpacklo_epi8(xmm2, xmm7);
    xmm3 = _mm_movpi64_epi64(*((__m64*)(image + 1*PicWidthInPix)));
    xmm3 = _mm_unpacklo_epi8(xmm3, xmm7);
    xmm4 = _mm_movpi64_epi64(*((__m64*)(image + 2*PicWidthInPix)));
    xmm4 = _mm_unpacklo_epi8(xmm4, xmm7);
    xmm5 = _mm_movpi64_epi64(*((__m64*)(image + 3*PicWidthInPix)));
    xmm5 = _mm_unpacklo_epi8(xmm5, xmm7);

    // filter on 8 values
    xmm6 = _mm_add_epi16(xmm2, xmm3);
    xmm6 = _mm_slli_epi16(xmm6, 2);
    xmm6 = _mm_sub_epi16(xmm6, xmm1);
    xmm6 = _mm_sub_epi16(xmm6, xmm4);

    xmm1 = _mm_set_epi32(0x00050005, 0x00050005, 0x00050005, 0x00050005);
    xmm6 = _mm_mullo_epi16(xmm6, xmm1);
    xmm6 = _mm_add_epi16(xmm6, xmm0);
    xmm6 = _mm_add_epi16(xmm6, xmm5);
    xmm6 = _mm_add_epi16(xmm6, _mm_set_epi32(0x00100010, 0x00100010, 0x00100010, 0x00100010));

    xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values
    xmm6 = _mm_srli_epi16(xmm6, 5);

    xmm2 = _mm_packus_epi16(xmm2, xmm7);
    xmm3 = _mm_packus_epi16(xmm3, xmm7);
    xmm6 = _mm_packus_epi16(xmm6, xmm7);

    xmm5 = _mm_unpacklo_epi8(xmm2, xmm6);
    xmm4 = _mm_unpacklo_epi8(xmm6, xmm3);
    xmm6 = _mm_avg_epu8(xmm4, xmm5);

    xmm6 = _mm_slli_epi16(xmm6, 8);
    xmm6 = _mm_srli_epi16(xmm6, 8);
    xmm6 = _mm_packus_epi16(xmm6, xmm7);

    ret = _mm_movepi64_pi64(xmm6);
    _mm_empty();

    return ret;
}
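// (Our note) the closing _mm_slli_epi16/_mm_srli_epi16 pair by 8 zeroes the
// high byte of every 16-bit lane, keeping only the even-indexed bytes of the
// interleaved average so the final _mm_packus_epi16 can gather them into the
// low 8 result bytes.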
__m64 interpolhline_1(unsigned char* image)
{
    __m128i xmm0, xmm1, xmm2, xmm3, xmm4, xmm5, xmm6, xmm7;
    unsigned char* imagetmp = image - 2;
    __m64 ret;

    xmm7 = _mm_setzero_si128();

    xmm6 = _mm_loadu_si128((__m128i*)imagetmp);
    xmm0 = _mm_unpacklo_epi8(xmm6, xmm7);
    xmm6 = _mm_srli_si128(xmm6, 1);
    xmm1 = _mm_unpacklo_epi8(xmm6, xmm7);
    xmm6 = _mm_srli_si128(xmm6, 1);
    xmm2 = _mm_unpacklo_epi8(xmm6, xmm7);
    xmm6 = _mm_srli_si128(xmm6, 1);
    xmm3 = _mm_unpacklo_epi8(xmm6, xmm7);
    xmm6 = _mm_srli_si128(xmm6, 1);
    xmm4 = _mm_unpacklo_epi8(xmm6, xmm7);
    xmm6 = _mm_srli_si128(xmm6, 1);
    xmm5 = _mm_unpacklo_epi8(xmm6, xmm7);

    // filter on 8 values
    xmm6 = _mm_add_epi16(xmm2, xmm3);   // (C + D)
    xmm6 = _mm_slli_epi16(xmm6, 2);     // (C + D) << 2
    xmm6 = _mm_sub_epi16(xmm6, xmm1);   // ((C + D) << 2) - B
    xmm6 = _mm_sub_epi16(xmm6, xmm4);   // ((C + D) << 2) - B - E

    xmm1 = _mm_set_epi32(0x00050005, 0x00050005, 0x00050005, 0x00050005);
    xmm6 = _mm_mullo_epi16(xmm6, xmm1); // (((C + D) << 2) - B - E) * 5
    xmm6 = _mm_add_epi16(xmm6, xmm0);   // ((((C + D) << 2) - B - E) * 5) + A
    xmm6 = _mm_add_epi16(xmm6, xmm5);   // ((((C + D) << 2) - B - E) * 5) + A + F
    xmm6 = _mm_add_epi16(xmm6, _mm_set_epi32(0x00100010, 0x00100010, 0x00100010, 0x00100010));
                                        // ((((C + D) << 2) - B - E) * 5) + A + F + 16

    xmm6 = _mm_max_epi16(xmm6, xmm7);   // preventing negative values, Clip255_16
    xmm6 = _mm_srli_epi16(xmm6, 5);     // result0 >> 5

    xmm2 = _mm_packus_epi16(xmm2, xmm7);
    xmm3 = _mm_packus_epi16(xmm3, xmm7);
    xmm6 = _mm_packus_epi16(xmm6, xmm7);

    xmm5 = _mm_unpacklo_epi8(xmm2, xmm6);
    xmm4 = _mm_unpacklo_epi8(xmm6, xmm3);
    xmm6 = _mm_avg_epu8(xmm4, xmm5);

    xmm6 = _mm_slli_epi16(xmm6, 8);
    xmm6 = _mm_srli_epi16(xmm6, 8);
    xmm6 = _mm_packus_epi16(xmm6, xmm7);

    ret = _mm_movepi64_pi64(xmm6);
    _mm_empty();

    return ret;
}
void ColorModelView::paintEvent(QPaintEvent *)
{
    QPainter p(this);

    auto mainBounds = mainAreaBounds();
    auto sideBounds = sideAreaBounds();

    if (mainImage_.isNull()) {
        // FIXME: support other color model?
        QImage img(256, 256, QImage::Format_RGB32);
        auto *pixels = reinterpret_cast<quint32 *>(img.bits());
        auto basecolor = QColor::fromHsv(value_.hsvHue(), 255, 255);
        auto basecolorMM = _mm_setr_epi32(basecolor.blue(), basecolor.green(), basecolor.red(), 0);
        basecolorMM = _mm_add_epi32(basecolorMM, _mm_srli_epi32(basecolorMM, 7)); // map [0, 255] to [0, 256]
        auto white = _mm_set1_epi32(256 * 255);
        auto dX = _mm_sub_epi32(basecolorMM, _mm_set1_epi32(256));

        for (int y = 0; y < 256; ++y) {
            auto brightness = _mm_set1_epi32(256 - y - (y >> 7));
            auto col = white; // [0, 256 * 255]
            for (int x = 0; x < 256; ++x) {
                auto c = _mm_mullo_epi16(_mm_srli_epi32(col, 8), brightness);
                c = _mm_srli_epi16(c, 8); // [0, 255]
                c = _mm_packs_epi32(c, c);
                c = _mm_packus_epi16(c, c);
                _mm_store_ss(reinterpret_cast<float *>(&pixels[x + y * 256]),
                             _mm_castsi128_ps(c));
                col = _mm_add_epi32(col, dX);
            }
        }

        mainImage_ = QPixmap::fromImage(img);
    }
    // ... (remainder of paintEvent not shown in this excerpt)
}
static INLINE void SIGNED_CLAMP_SUB(pi16 VD, pi16 VS, pi16 VT)
{
    v16 dst, src, vco;
    v16 dif, res, xmm;

    src = _mm_load_si128((v16 *)VS);
    dst = _mm_load_si128((v16 *)VT);
    vco = _mm_load_si128((v16 *)cf_co);

    res = _mm_subs_epi16(src, dst);

    /*
     * Due to premature clamps in-between subtracting two of the three operands,
     * we must be careful not to offset the result accidentally when subtracting
     * the corresponding VCO flag AFTER the saturation from doing (VS - VT).
     */
    dif = _mm_add_epi16(res, vco);
    dif = _mm_xor_si128(dif, res);    /* Adding one suddenly inverts the sign? */
    dif = _mm_and_si128(dif, dst);    /* Sign change due to subtracting a neg. */
    xmm = _mm_sub_epi16(src, dst);
    src = _mm_andnot_si128(src, dif); /* VS must be >= 0x0000 for overflow. */
    xmm = _mm_and_si128(xmm, src);    /* VS + VT != INT16_MIN; VS + VT >= +32768 */
    xmm = _mm_srli_epi16(xmm, 15);    /* src = (INT16_MAX + 1 === INT16_MIN) ? */
    xmm = _mm_andnot_si128(xmm, vco); /* If it's NOT overflow, keep flag. */
    res = _mm_subs_epi16(res, xmm);

    _mm_store_si128((v16 *)VD, res);
    return;
}
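/* (Our note) _mm_srli_epi16(xmm, 15) above isolates each lane's sign bit as
 * 0 or 1, giving a per-lane boolean that then gates whether the VCO carry
 * flag is applied to the saturated result.
 */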
static void ConvertBGRAToRGBA4444_SSE2(const uint32_t* src,
                                       int num_pixels, uint8_t* dst) {
  const __m128i mask_0x0f = _mm_set1_epi8(0x0f);
  const __m128i mask_0xf0 = _mm_set1_epi8(0xf0);
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);      // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);      // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);      // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);      // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);     // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);     // r0...r7 | b0...b7
    const __m128i ga1 = _mm_srli_epi16(ga0, 4);           // g0-|g1-|...|a6-|a7-
    const __m128i rb1 = _mm_and_si128(rb0, mask_0xf0);    // -r0|-r1|...|-b6|-b7
    const __m128i ga2 = _mm_and_si128(ga1, mask_0x0f);    // g0-|g1-|...|a6-|a7-
    const __m128i rgba0 = _mm_or_si128(ga2, rb1);         // rg0..rg7 | ba0..ba7
    const __m128i rgba1 = _mm_srli_si128(rgba0, 8);       // ba0..ba7 | 0
#if (WEBP_SWAP_16BIT_CSP == 1)
    const __m128i rgba = _mm_unpacklo_epi8(rgba1, rgba0); // barg0...barg7
#else
    const __m128i rgba = _mm_unpacklo_epi8(rgba0, rgba1); // rgba0...rgba7
#endif
    _mm_storeu_si128(out++, rgba);
    num_pixels -= 8;
  }
  // left-overs
  if (num_pixels > 0) {
    VP8LConvertBGRAToRGBA4444_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
  }
}
static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
                    int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 8;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound = _mm_set1_epi16(1 << 7);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
      const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
      const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
      const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
      const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
      const __m128i v4 = _mm_adds_epu16(v2, v3);
      const __m128i v5 = _mm_adds_epu16(v4, kRound);
      const __m128i v6 = _mm_srli_epi16(v5, 8);
      const __m128i v7 = _mm_packus_epi16(v6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], v7);
    }
  }
  width -= x;
  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
}
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
static WEBP_INLINE __m128i Average2_uint32_16_SSE2(uint32_t a0, uint32_t a1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i A0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a0), zero);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(A1, A0);
  return _mm_srli_epi16(sum, 1);
}
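// (Our note) unpacking to 16-bit lanes gives the per-channel sum headroom, and
// _mm_srli_epi16(sum, 1) truncates, i.e. this is floor((a0 + a1) / 2). The
// byte-wise _mm_avg_epu8 computes (a0 + a1 + 1) >> 1 instead, so the two round
// differently and are not interchangeable here.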
// Predictors13: ClampedAddSubtractHalf
static void PredictorSub13_SSE2(const uint32_t* in, const uint32_t* upper,
                                int num_pixels, uint32_t* out) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  for (i = 0; i + 2 <= num_pixels; i += 2) {
    // we can only process two pixels at a time
    const __m128i L = _mm_loadl_epi64((const __m128i*)&in[i - 1]);
    const __m128i src = _mm_loadl_epi64((const __m128i*)&in[i]);
    const __m128i T = _mm_loadl_epi64((const __m128i*)&upper[i]);
    const __m128i TL = _mm_loadl_epi64((const __m128i*)&upper[i - 1]);
    const __m128i L_lo = _mm_unpacklo_epi8(L, zero);
    const __m128i T_lo = _mm_unpacklo_epi8(T, zero);
    const __m128i TL_lo = _mm_unpacklo_epi8(TL, zero);
    const __m128i sum = _mm_add_epi16(T_lo, L_lo);
    const __m128i avg = _mm_srli_epi16(sum, 1);
    const __m128i A1 = _mm_sub_epi16(avg, TL_lo);
    const __m128i bit_fix = _mm_cmpgt_epi16(TL_lo, avg);
    const __m128i A2 = _mm_sub_epi16(A1, bit_fix);
    const __m128i A3 = _mm_srai_epi16(A2, 1);
    const __m128i A4 = _mm_add_epi16(avg, A3);
    const __m128i pred = _mm_packus_epi16(A4, A4);
    const __m128i res = _mm_sub_epi8(src, pred);
    _mm_storel_epi64((__m128i*)&out[i], res);
  }
  if (i != num_pixels) {
    VP8LPredictorsSub_C[13](in + i, upper + i, num_pixels - i, out + i);
  }
}
static void TransformColorInverse_SSE2(const VP8LMultipliers* const m,
                                       const uint32_t* const src,
                                       int num_pixels, uint32_t* dst) {
// sign-extended multiplying constants, pre-shifted by 5.
#define CST(X)  (((int16_t)(m->X << 8)) >> 5)   // sign-extend
#define MK_CST_16(HI, LO) \
  _mm_set1_epi32((int)(((uint32_t)(HI) << 16) | ((LO) & 0xffff)))
  const __m128i mults_rb = MK_CST_16(CST(green_to_red_), CST(green_to_blue_));
  const __m128i mults_b2 = MK_CST_16(CST(red_to_blue_), 0);
#undef MK_CST_16
#undef CST
  const __m128i mask_ag = _mm_set1_epi32(0xff00ff00);  // alpha-green masks
  int i;
  for (i = 0; i + 4 <= num_pixels; i += 4) {
    const __m128i in = _mm_loadu_si128((const __m128i*)&src[i]);  // argb
    const __m128i A = _mm_and_si128(in, mask_ag);                 // a 0 g 0
    const __m128i B = _mm_shufflelo_epi16(A, _MM_SHUFFLE(2, 2, 0, 0));
    const __m128i C = _mm_shufflehi_epi16(B, _MM_SHUFFLE(2, 2, 0, 0));  // g0g0
    const __m128i D = _mm_mulhi_epi16(C, mults_rb);               // x dr  x db1
    const __m128i E = _mm_add_epi8(in, D);                        // x r'  x  b'
    const __m128i F = _mm_slli_epi16(E, 8);                       // r' 0  b' 0
    const __m128i G = _mm_mulhi_epi16(F, mults_b2);               // x db2 0  0
    const __m128i H = _mm_srli_epi32(G, 8);                       // 0 x db2  0
    const __m128i I = _mm_add_epi8(H, F);                         // r' x b'' 0
    const __m128i J = _mm_srli_epi16(I, 8);                       // 0 r' 0 b''
    const __m128i out = _mm_or_si128(J, A);
    _mm_storeu_si128((__m128i*)&dst[i], out);
  }
  // Fall-back to C-version for left-overs.
  if (i != num_pixels) {
    VP8LTransformColorInverse_C(m, src + i, num_pixels - i, dst + i);
  }
}
inline COLORREF MakeColor2(COLORREF a, COLORREF b, int alpha)
{
#ifdef USE_SSE2
    // (a * alpha + b * (256 - alpha)) / 256 -> ((a - b) * alpha) / 256 + b
    __m128i xmm0, xmm1, xmm2, xmm3;
    COLORREF color;

    xmm0 = _mm_setzero_si128();
    xmm1 = _mm_cvtsi32_si128( a );
    xmm2 = _mm_cvtsi32_si128( b );
    xmm3 = _mm_cvtsi32_si128( alpha );

    xmm1 = _mm_unpacklo_epi8( xmm1, xmm0 );  // a:a:a:a
    xmm2 = _mm_unpacklo_epi8( xmm2, xmm0 );  // b:b:b:b
    xmm3 = _mm_shufflelo_epi16( xmm3, 0 );   // alpha:alpha:alpha:alpha

    xmm1 = _mm_sub_epi16( xmm1, xmm2 );      // (a - b)
    xmm1 = _mm_mullo_epi16( xmm1, xmm3 );    // (a - b) * alpha
    xmm1 = _mm_srli_epi16( xmm1, 8 );        // ((a - b) * alpha) / 256
    xmm1 = _mm_add_epi8( xmm1, xmm2 );       // ((a - b) * alpha) / 256 + b

    xmm1 = _mm_packus_epi16( xmm1, xmm0 );
    color = _mm_cvtsi128_si32( xmm1 );

    return color;
#else
    const int ap = alpha;
    const int bp = 256 - ap;
    BYTE valR = (BYTE)((GetRValue(a) * ap + GetRValue(b) * bp) / 256);
    BYTE valG = (BYTE)((GetGValue(a) * ap + GetGValue(b) * bp) / 256);
    BYTE valB = (BYTE)((GetBValue(a) * ap + GetBValue(b) * bp) / 256);
    return RGB(valR, valG, valB);
#endif
}
__m128i test_mm_srli_epi16(__m128i A) {
  // DAG-LABEL: test_mm_srli_epi16
  // DAG: call <8 x i16> @llvm.x86.sse2.psrli.w
  //
  // ASM-LABEL: test_mm_srli_epi16
  // ASM: psrlw
  return _mm_srli_epi16(A, 1);
}
/* -----------------------------------
 *      weighted_merge_chroma_yuy2
 * -----------------------------------
 */
static void weighted_merge_chroma_yuy2_sse2(BYTE *src, const BYTE *chroma,
                                            int pitch, int chroma_pitch,
                                            int width, int height,
                                            int weight, int invweight)
{
  __m128i round_mask = _mm_set1_epi32(0x4000);
  __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight,
                               weight, invweight, weight, invweight);
  __m128i luma_mask = _mm_set1_epi16(0x00FF);

  int wMod16 = (width / 16) * 16;

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < wMod16; x += 16) {
      __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src + x));
      __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(chroma + x));

      __m128i src_lo = _mm_unpacklo_epi16(px1, px2);
      __m128i src_hi = _mm_unpackhi_epi16(px1, px2);

      src_lo = _mm_srli_epi16(src_lo, 8);
      src_hi = _mm_srli_epi16(src_hi, 8);

      src_lo = _mm_madd_epi16(src_lo, mask);
      src_hi = _mm_madd_epi16(src_hi, mask);

      src_lo = _mm_add_epi32(src_lo, round_mask);
      src_hi = _mm_add_epi32(src_hi, round_mask);

      src_lo = _mm_srli_epi32(src_lo, 15);
      src_hi = _mm_srli_epi32(src_hi, 15);

      __m128i result_chroma = _mm_packs_epi32(src_lo, src_hi);
      result_chroma = _mm_slli_epi16(result_chroma, 8);

      __m128i result_luma = _mm_and_si128(px1, luma_mask);
      __m128i result = _mm_or_si128(result_chroma, result_luma);

      _mm_store_si128(reinterpret_cast<__m128i*>(src + x), result);
    }

    for (int x = wMod16; x < width; x += 2) {
      src[x+1] = (chroma[x+1] * weight + src[x+1] * invweight + 16384) >> 15;
    }

    src += pitch;
    chroma += chroma_pitch;
  }
}
// Author: Niclas P Andersson
void Lerp_OLD(void* dest, const void* source1, const void* source2, float alpha, size_t size)
{
    __m128i ps1, ps2, pd1, pd2, m0, m1, pr1, pr2;
    __m128i* pSource = (__m128i*)source1;
    __m128i* pDest = (__m128i*)source2;
    __m128i* pResult = (__m128i*)dest;
    __m128i a = _mm_set1_epi16(static_cast<u8>(alpha*256.0f + 0.5f));

    m0 = _mm_setzero_si128();
    int count = size/4;
    for ( int i = 0; i < count; i += 4 )
    {
        ps1 = _mm_load_si128(pSource);      // load 4 pixels from source
        pd1 = _mm_load_si128(pDest);        // load 4 pixels from dest
        ps2 = _mm_unpackhi_epi64(ps1, m0);  // move the 2 high pixels from source
        pd2 = _mm_unpackhi_epi64(pd1, m0);  // move the 2 high pixels from dest

        // compute the 2 "lower" pixels
        ps1 = _mm_unpacklo_epi8(ps1, m0);   // unpack the 2 low pixels from source (bytes -> words)
        pd1 = _mm_unpacklo_epi8(pd1, m0);   // unpack the 2 low pixels from dest (bytes -> words)

        pr1 = _mm_sub_epi16(ps1, pd1);      // x = src - dest
        pr1 = _mm_mullo_epi16(pr1, a);      // y = x*alpha
        pr1 = _mm_srli_epi16(pr1, 8);       // w = y/256
        pr1 = _mm_add_epi8(pr1, pd1);       // z = w + dest

        // same thing for the 2 "high" pixels
        ps2 = _mm_unpacklo_epi8(ps2, m0);
        pd2 = _mm_unpacklo_epi8(pd2, m0);

        pr2 = _mm_sub_epi16(ps2, pd2);      // x = src - dest
        pr2 = _mm_mullo_epi16(pr2, a);      // y = x*alpha
        pr2 = _mm_srli_epi16(pr2, 8);       // w = y/256
        pr2 = _mm_add_epi8(pr2, pd2);       // z = w + dest

        m1 = _mm_packus_epi16(pr1, pr2);    // pack all 4 together again (words -> bytes)
        _mm_store_si128(pResult, m1);

        pSource++;
        pDest++;
        pResult++;
    }
}
SIMDValue SIMDUint16x8Operation::OpShiftRightByScalar(const SIMDValue& value, int count)
{
    X86SIMDValue x86Result = { { 0, 0, 0, 0 } };
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(value);

    x86Result.m128i_value =
        _mm_srli_epi16(tmpaValue.m128i_value, count & SIMDUtils::SIMDGetShiftAmountMask(2));

    return X86SIMDValue::ToSIMDValue(x86Result);
}
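// Why mask the count (our note): _mm_srli_epi16 does not reduce the shift
// amount modulo the lane width -- any count >= 16 zeroes every lane. The SIMD
// type here wants wrap-around semantics, so SIMDGetShiftAmountMask(2)
// presumably yields 15 for 2-byte lanes:
//   _mm_srli_epi16(v, 17)       // all lanes become 0
//   _mm_srli_epi16(v, 17 & 15)  // v >> 1 per lane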
void blend_sse2(const Uint8* alpha, const Uint32 size, const Uint8* source0,
    const Uint8* source1, Uint8* dest)
{
    __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    Uint32 i;

    for (i = 0; i < (size / 4); i++)
    {
        t0 = _mm_load_si128((__m128i*)&source0[i * 16]);
        t1 = _mm_load_si128((__m128i*)&source1[i * 16]);
        t2 = _mm_castps_si128(_mm_load_ss((float*)&alpha[i * 4]));
        t2 = _mm_unpacklo_epi8(t2, t2);
        t2 = _mm_unpacklo_epi16(t2, t2);
        t3 = _mm_unpacklo_epi8(t0, t0);
        t4 = _mm_unpacklo_epi8(t1, t1);
        t5 = _mm_unpacklo_epi32(t2, t2);
        t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);
        t7 = _mm_mulhi_epu16(t3, t6);
        t8 = _mm_mulhi_epu16(t4, t5);
        t9 = _mm_adds_epu16(t7, t8);
        t9 = _mm_srli_epi16(t9, 8);
        t3 = _mm_unpackhi_epi8(t0, t0);
        t4 = _mm_unpackhi_epi8(t1, t1);
        t5 = _mm_unpackhi_epi32(t2, t2);
        t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);
        t7 = _mm_mulhi_epu16(t3, t6);
        t8 = _mm_mulhi_epu16(t4, t5);
        t10 = _mm_adds_epu16(t7, t8);
        t10 = _mm_srli_epi16(t10, 8);
        t10 = _mm_packus_epi16(t9, t10);
        _mm_stream_si128((__m128i*)&dest[i * 16], t10);
    }
}
__m64 _m_psrlwi(__m64 _M, int _Count)
{
    __m128i lhs = {0};

    lhs.m128i_i64[0] = _M.m64_i64;
    lhs = _mm_srli_epi16(lhs, _Count);

    _M.m64_i64 = lhs.m128i_i64[0];
    return _M;
}
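// The m128i_i64 fields above are MSVC-specific. A portable sketch of the same
// emulation (our code, hypothetical name) goes through the MMX/SSE2 move
// intrinsics instead:
__m64 _m_psrlwi_portable(__m64 _M, int _Count)
{
    __m128i lhs = _mm_movpi64_epi64(_M); /* zero-extend the __m64 into an xmm */
    lhs = _mm_srli_epi16(lhs, _Count);   /* 16-bit logical right shift */
    return _mm_movepi64_pi64(lhs);       /* low 64 bits back to an __m64 */
}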
static WEBP_INLINE uint32_t Average4_SSE2(uint32_t a0, uint32_t a1,
                                          uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_uint32_16_SSE2(a0, a1);
  const __m128i avg2 = Average2_uint32_16_SSE2(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  const uint32_t output = _mm_cvtsi128_si32(A0);
  return output;
}
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_128i(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  const uint32_t output = _mm_cvtsi128_si32(A2);
  return output;
}
void png_read_filter_row_paeth3_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_size_t i;
   png_bytep rp = row;
   png_const_bytep prp = prev_row;
   __m128i npix = _mm_cvtsi32_si128(*(uint32_t*)rp);
   __m128i ppix = _mm_setzero_si128();   // Same as 'a' in C version.
   __m128i prppix = _mm_setzero_si128(); // Same as 'c' in C version.
   const __m128i zero = _mm_setzero_si128();

   for (i = 0; i < row_info->rowbytes; i += 3, rp += 3, prp += 3)
   {
      __m128i prpix = _mm_cvtsi32_si128(*(uint32_t*)prp); // Same as 'b' in C ver.
      __m128i pix, pa, pb, pc, temp;

      prpix = _mm_unpacklo_epi8(prpix, zero);
      temp = _mm_sub_epi16(prpix, prppix); // p = b - c
      pc = _mm_sub_epi16(ppix, prppix);    // pc = a - c

#ifndef __SSSE3__
      pa = _mm_max_epi16(temp, _mm_sub_epi16(prppix, prpix));
      pb = _mm_max_epi16(pc, _mm_sub_epi16(prppix, ppix));
      temp = _mm_add_epi16(temp, pc);
      pc = _mm_max_epi16(temp, _mm_sub_epi16(zero, temp));
#else
      pa = _mm_abs_epi16(temp);            // pa = abs(p)
      pb = _mm_abs_epi16(pc);              // pb = abs(pc)
      temp = _mm_add_epi16(temp, pc);
      pc = _mm_abs_epi16(temp);            // pc = abs(p + pc)
#endif

      temp = _mm_cmplt_epi16(pb, pa);      // if (pb < pa) pa = pb, a = b
      pa = _mm_andnot_si128(temp, pa);
      pa = _mm_or_si128(pa, _mm_and_si128(temp, pb));
      ppix = _mm_andnot_si128(temp, ppix);
      ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prpix));

      pix = npix;
      npix = _mm_cvtsi32_si128(*(uint32_t*)(rp + 3));

      temp = _mm_cmplt_epi16(pc, pa);      // if (pc < pa) a = c
      ppix = _mm_andnot_si128(temp, ppix);
      ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prppix));

      pix = _mm_unpacklo_epi8(pix, zero);
      prppix = prpix;
      ppix = _mm_add_epi16(ppix, pix);
      ppix = _mm_slli_epi16(ppix, 8);
      ppix = _mm_srli_epi16(ppix, 8);
      pix = _mm_packus_epi16(ppix, zero);
      *(uint32_t*)rp = _mm_cvtsi128_si32(pix);
   }
}
template<int shift, int active_bits>
void Haar_invtransform_H_final_1_sse4_2_int32_t(void *_idata,
                                                const int istride,
                                                const char *odata,
                                                const int ostride,
                                                const int iwidth,
                                                const int iheight,
                                                const int ooffset_x,
                                                const int ooffset_y,
                                                const int owidth,
                                                const int oheight)
{
  int32_t *idata = (int32_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi32(1);
  const __m128i OFFSET = _mm_set1_epi32(1 << (active_bits - 1));

  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y += skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 8) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D4 = _mm_load_si128((__m128i *)&idata[y*istride + x + 4]);

      __m128i A0 = _mm_unpacklo_epi32(D0, D4);
      __m128i A2 = _mm_unpackhi_epi32(D0, D4);

      __m128i E0 = _mm_unpacklo_epi32(A0, A2);
      __m128i O1 = _mm_unpackhi_epi32(A0, A2);

      __m128i X0 = _mm_sub_epi32(E0, _mm_srai_epi32(_mm_add_epi32(O1, ONE), 1));
      __m128i X1 = _mm_add_epi32(O1, X0);

      __m128i Z0 = _mm_unpacklo_epi32(X0, X1);
      __m128i Z4 = _mm_unpackhi_epi32(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi32(Z0, ONE);
        Z4 = _mm_add_epi32(Z4, ONE);
        Z0 = _mm_srai_epi32(Z0, shift);
        Z4 = _mm_srai_epi32(Z4, shift);
      }

      Z0 = _mm_add_epi32(Z0, OFFSET);
      Z4 = _mm_add_epi32(Z4, OFFSET);

      Z0 = _mm_slli_epi32(Z0, (16 - active_bits));
      Z4 = _mm_slli_epi32(Z4, (16 - active_bits));

      __m128i R = _mm_packus_epi32(Z0, Z4);
      R = _mm_srli_epi16(R, (16 - active_bits));

      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x - ooffset_x)], R);
    }
  }
}
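// (Our note) the slli/packus/srli sequence is a saturating clamp: shifting
// left by 16 - active_bits makes _mm_packus_epi32 saturate exactly at the
// active_bits ceiling, and the final _mm_srli_epi16 shifts the clamped values
// back down into range.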
void Coefs(unsigned char *current_part_ptr, int current_part_stride,
           unsigned char *ref_part_ptr, int ref_part_stride,
           unsigned char *coef_buf, int n)
{
    static const unsigned short c_32[8] = {32, 32, 32, 32, 32, 32, 32, 32};
    int i;
    __m128i v_row0_0, v_row0_1;
    __m128i v_temp_0, v_temp_1;
    __m128i v_result;
    __m128i vZero;

    vZero = _mm_setzero_si128();
    __m128i v_32 = _mm_loadu_si128((__m128i*)c_32);
    __m128i* coef_ptr = (__m128i*)coef_buf;

    v_row0_0 = _mm_loadl_epi64((__m128i*)ref_part_ptr);
    v_row0_1 = _mm_shufflelo_epi16(v_row0_0, 0xf9);
    v_row0_1 = _mm_insert_epi16(v_row0_1, *(unsigned short*)(ref_part_ptr + 8), 3);
    ref_part_ptr += ref_part_stride;

    // row0: 0 1 2 3 4 5 6 7
    // row1: 2 3 4 5 6 7 8 9
    v_row0_0 = _mm_unpacklo_epi8(v_row0_0, vZero);
    v_row0_1 = _mm_unpacklo_epi8(v_row0_1, vZero);

    for ( i = 0; i < n; i++ )
    {
        v_row0_0 = _mm_mullo_epi16(v_row0_0, coef_ptr[0]);
        v_row0_1 = _mm_mullo_epi16(v_row0_1, coef_ptr[1]);

        v_result = v_32;
        v_result = _mm_add_epi16(v_result, v_row0_0);
        v_result = _mm_add_epi16(v_result, v_row0_1);

        v_row0_0 = _mm_loadl_epi64((__m128i*)ref_part_ptr);
        v_row0_1 = _mm_shufflelo_epi16(v_row0_0, 0xf9);
        v_row0_1 = _mm_insert_epi16(v_row0_1, *(unsigned short*)(ref_part_ptr + 8), 3);
        ref_part_ptr += ref_part_stride;

        v_row0_0 = _mm_unpacklo_epi8(v_row0_0, vZero);
        v_row0_1 = _mm_unpacklo_epi8(v_row0_1, vZero);

        v_temp_0 = _mm_mullo_epi16(v_row0_0, coef_ptr[2]);
        v_temp_1 = _mm_mullo_epi16(v_row0_1, coef_ptr[3]);
        v_result = _mm_add_epi16(v_result, v_temp_0);
        v_result = _mm_add_epi16(v_result, v_temp_1);

        v_result = _mm_srli_epi16(v_result, 6);
        _mm_store_si128((__m128i*)(current_part_ptr), v_result);
        current_part_ptr += current_part_stride;
    }
}