static inline void
desc_to_olflags_v(__m128i descs[4], struct rte_mbuf **rx_pkts)
{
  __m128i ptype0, ptype1, vtag0, vtag1;
  union {
    uint16_t e[4];
    uint64_t dword;
  } vol;

  ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
  ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
  vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
  vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

  ptype1 = _mm_unpacklo_epi32(ptype0, ptype1);
  vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);

  ptype1 = _mm_slli_epi16(ptype1, PTYPE_SHIFT);
  vtag1 = _mm_srli_epi16(vtag1, VTAG_SHIFT);

  ptype1 = _mm_or_si128(ptype1, vtag1);
  vol.dword = _mm_cvtsi128_si64(ptype1) & OLFLAGS_MASK_V;

  rx_pkts[0]->ol_flags = vol.e[0];
  rx_pkts[1]->ol_flags = vol.e[1];
  rx_pkts[2]->ol_flags = vol.e[2];
  rx_pkts[3]->ol_flags = vol.e[3];
}
void vpx_highbd_d207_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_load_si128((const __m128i *)left);
  const __m128i A1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i LR0 = _mm_shufflehi_epi16(A1, 0xff);
  const __m128i LR = _mm_unpackhi_epi64(LR0, LR0);
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_alignr_epi8(LR, A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_alignr_epi8(LR, A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i avg2_0 = _mm_avg_epu16(A0, B0);
  const __m128i avg2_1 = _mm_avg_epu16(A1, B1);
  const __m128i out_a = _mm_unpacklo_epi16(avg2_0, avg3_0);
  const __m128i out_b = _mm_unpackhi_epi16(avg2_0, avg3_0);
  const __m128i out_c = _mm_unpacklo_epi16(avg2_1, avg3_1);
  const __m128i out_d = _mm_unpackhi_epi16(avg2_1, avg3_1);
  (void)above;
  (void)bd;
  d207_store_4x16(&dst, stride, &out_a, &out_b, &out_c);
  d207_store_4x16(&dst, stride, &out_b, &out_c, &out_d);
  d207_store_4x16(&dst, stride, &out_c, &out_d, &LR);
  d207_store_4x16(&dst, stride, &out_d, &LR, &LR);
}
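// NOTE: the d207/d153 predictors in this collection call an avg3_epu16()
// helper that is not reproduced here. The sketch below is a plausible
// definition, assuming the usual rounded three-tap average
// (x + 2*y + z + 2) >> 2 that libvpx builds from two _mm_avg_epu16 calls
// plus a carry correction.
static INLINE __m128i avg3_epu16(const __m128i *x, const __m128i *y,
                                 const __m128i *z) {
  const __m128i one = _mm_set1_epi16(1);
  const __m128i a = _mm_avg_epu16(*x, *z);
  // _mm_avg_epu16 rounds up; subtract the carry bit so the final result
  // equals (x + 2*y + z + 2) >> 2 exactly.
  const __m128i b =
      _mm_subs_epu16(a, _mm_and_si128(_mm_xor_si128(*x, *z), one));
  return _mm_avg_epu16(b, *y);
}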
void ConvertInt8ToFloat(__m128i in_input, __m128 *out_output) {
  __m128i input_16, input_32;
  // convert first half to 16-bit integers
  input_16 = _mm_unpacklo_epi8(in_input, ZERO);
  // convert first quarter to 32-bit integers
  input_32 = _mm_unpacklo_epi16(input_16, ZERO);
  // convert first quarter to 32-bit floating point values
  out_output[0] = _mm_cvtepi32_ps(input_32);
  // convert second quarter to 32-bit integers
  input_32 = _mm_unpackhi_epi16(input_16, ZERO);
  // convert second quarter to 32-bit floating point values
  out_output[1] = _mm_cvtepi32_ps(input_32);
  // convert second half to 16-bit integers
  input_16 = _mm_unpackhi_epi8(in_input, ZERO);
  // convert third quarter to 32-bit integers
  input_32 = _mm_unpacklo_epi16(input_16, ZERO);
  // convert third quarter to 32-bit floating point values
  out_output[2] = _mm_cvtepi32_ps(input_32);
  // convert fourth quarter to 32-bit integers
  input_32 = _mm_unpackhi_epi16(input_16, ZERO);
  // convert fourth quarter to 32-bit floating point values
  out_output[3] = _mm_cvtepi32_ps(input_32);
}
inline FORCE_INLINE void transpose8_epi16(__m128i &x0, __m128i &x1, __m128i &x2,
                                          __m128i &x3, __m128i &x4, __m128i &x5,
                                          __m128i &x6, __m128i &x7) {
  __m128i t0, t1, t2, t3, t4, t5, t6, t7;
  __m128i tt0, tt1, tt2, tt3, tt4, tt5, tt6, tt7;

  // Stage 1: interleave 16-bit lanes of adjacent rows.
  t0 = _mm_unpacklo_epi16(x0, x1);
  t1 = _mm_unpacklo_epi16(x2, x3);
  t2 = _mm_unpacklo_epi16(x4, x5);
  t3 = _mm_unpacklo_epi16(x6, x7);
  t4 = _mm_unpackhi_epi16(x0, x1);
  t5 = _mm_unpackhi_epi16(x2, x3);
  t6 = _mm_unpackhi_epi16(x4, x5);
  t7 = _mm_unpackhi_epi16(x6, x7);

  // Stage 2: interleave 32-bit pairs.
  tt0 = _mm_unpacklo_epi32(t0, t1);
  tt1 = _mm_unpackhi_epi32(t0, t1);
  tt2 = _mm_unpacklo_epi32(t2, t3);
  tt3 = _mm_unpackhi_epi32(t2, t3);
  tt4 = _mm_unpacklo_epi32(t4, t5);
  tt5 = _mm_unpackhi_epi32(t4, t5);
  tt6 = _mm_unpacklo_epi32(t6, t7);
  tt7 = _mm_unpackhi_epi32(t6, t7);

  // Stage 3: interleave 64-bit halves to finish the transpose.
  x0 = _mm_unpacklo_epi64(tt0, tt2);
  x1 = _mm_unpackhi_epi64(tt0, tt2);
  x2 = _mm_unpacklo_epi64(tt1, tt3);
  x3 = _mm_unpackhi_epi64(tt1, tt3);
  x4 = _mm_unpacklo_epi64(tt4, tt6);
  x5 = _mm_unpackhi_epi64(tt4, tt6);
  x6 = _mm_unpacklo_epi64(tt5, tt7);
  x7 = _mm_unpackhi_epi64(tt5, tt7);
}
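// For reference, a scalar model of the 8x8 16-bit transpose above
// (hypothetical test helper, not part of the original source): element
// (r, c) of the input block ends up at (c, r) of the output.
static void transpose8x8_epi16_scalar(const uint16_t in[8][8],
                                      uint16_t out[8][8]) {
  for (int r = 0; r < 8; ++r)
    for (int c = 0; c < 8; ++c)
      out[c][r] = in[r][c];  // swap row and column indices
}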
static void DispatchAlphaToGreen(const uint8_t* alpha, int alpha_stride,
                                 int width, int height,
                                 uint32_t* dst, int dst_stride) {
  int i, j;
  const __m128i zero = _mm_setzero_si128();
  const int limit = width & ~15;
  for (j = 0; j < height; ++j) {
    for (i = 0; i < limit; i += 16) {   // process 16 alpha bytes
      const __m128i a0 = _mm_loadu_si128((const __m128i*)&alpha[i]);
      const __m128i a1 = _mm_unpacklo_epi8(zero, a0);  // note the 'zero' first!
      const __m128i b1 = _mm_unpackhi_epi8(zero, a0);
      const __m128i a2_lo = _mm_unpacklo_epi16(a1, zero);
      const __m128i b2_lo = _mm_unpacklo_epi16(b1, zero);
      const __m128i a2_hi = _mm_unpackhi_epi16(a1, zero);
      const __m128i b2_hi = _mm_unpackhi_epi16(b1, zero);
      _mm_storeu_si128((__m128i*)&dst[i +  0], a2_lo);
      _mm_storeu_si128((__m128i*)&dst[i +  4], a2_hi);
      _mm_storeu_si128((__m128i*)&dst[i +  8], b2_lo);
      _mm_storeu_si128((__m128i*)&dst[i + 12], b2_hi);
    }
    for (; i < width; ++i) dst[i] = alpha[i] << 8;
    alpha += alpha_stride;
    dst += dst_stride;
  }
}
static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
  __m128i v0, v1;
  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));
  u[0] = _mm_loadu_si128((__m128i const *)src);
  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));

  u[0] = _mm_add_epi32(u[0], rnd);
  u[1] = _mm_add_epi32(u[1], rnd);
  u[2] = _mm_add_epi32(u[2], rnd);
  u[3] = _mm_add_epi32(u[3], rnd);

  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
  u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
  u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
  u[3] = _mm_srai_epi32(u[3], FILTER_BITS);

  u[0] = _mm_packus_epi32(u[0], u[1]);
  u[1] = _mm_packus_epi32(u[2], u[3]);

  highbd_clip(u, 2, bd);

  v0 = _mm_unpacklo_epi16(u[0], u[1]);
  v1 = _mm_unpackhi_epi16(u[0], u[1]);

  u[0] = _mm_unpacklo_epi16(v0, v1);
  u[2] = _mm_unpackhi_epi16(v0, v1);

  u[1] = _mm_srli_si128(u[0], 8);
  u[3] = _mm_srli_si128(u[2], 8);
}
static void GF_FUNC_ALIGN VS_CC
convert_to_float_8bit(int radius, float *kernel, const uint8_t *srcp,
                      float *buff, float *dstp, int width, int height,
                      int src_stride, int dst_stride)
{
  __m128i zero = _mm_setzero_si128();

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x += 16) {
      __m128i xmm0 = _mm_load_si128((__m128i *)(srcp + x));
      __m128i xmm1 = _mm_unpackhi_epi8(xmm0, zero);
      xmm0 = _mm_unpacklo_epi8(xmm0, zero);
      __m128 f0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
      __m128 f1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
      __m128 f2 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm1, zero));
      __m128 f3 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm1, zero));
      _mm_store_ps(dstp + x,      f0);
      _mm_store_ps(dstp + x +  4, f1);
      _mm_store_ps(dstp + x +  8, f2);
      _mm_store_ps(dstp + x + 12, f3);
    }
    srcp += src_stride;
    dstp += dst_stride;
  }
}
void vpx_highbd_d153_predictor_16x16_ssse3(uint16_t *dst, ptrdiff_t stride,
                                           const uint16_t *above,
                                           const uint16_t *left, int bd) {
  const __m128i A0 = _mm_loadu_si128((const __m128i *)(above - 1));
  const __m128i A1 = _mm_loadu_si128((const __m128i *)(above + 7));
  const __m128i B0 = _mm_alignr_epi8(A1, A0, 2);
  const __m128i B1 = _mm_srli_si128(A1, 2);
  const __m128i C0 = _mm_alignr_epi8(A1, A0, 4);
  const __m128i C1 = _mm_srli_si128(A1, 4);
  const __m128i avg3_0 = avg3_epu16(&A0, &B0, &C0);
  const __m128i avg3_1 = avg3_epu16(&A1, &B1, &C1);
  const __m128i L0 = _mm_load_si128((const __m128i *)left);
  const __m128i L1 = _mm_load_si128((const __m128i *)(left + 8));
  const __m128i XL0 = _mm_alignr_epi8(L0, _mm_slli_si128(A0, 14), 14);
  const __m128i AXL0 = _mm_alignr_epi8(XL0, _mm_slli_si128(A0, 12), 14);
  const __m128i XL1 = _mm_alignr_epi8(L1, L0, 14);
  const __m128i AXL1 = _mm_alignr_epi8(L1, L0, 12);
  const __m128i avg3_left_0 = avg3_epu16(&L0, &XL0, &AXL0);
  const __m128i avg2_left_0 = _mm_avg_epu16(L0, XL0);
  const __m128i avg3_left_1 = avg3_epu16(&L1, &XL1, &AXL1);
  const __m128i avg2_left_1 = _mm_avg_epu16(L1, XL1);
  __m128i row_0 = avg3_0;
  __m128i row_1 = avg3_1;
  __m128i avg2_avg3_left[2][2];
  int i, j;
  (void)bd;

  avg2_avg3_left[0][0] = _mm_unpacklo_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[0][1] = _mm_unpackhi_epi16(avg2_left_0, avg3_left_0);
  avg2_avg3_left[1][0] = _mm_unpacklo_epi16(avg2_left_1, avg3_left_1);
  avg2_avg3_left[1][1] = _mm_unpackhi_epi16(avg2_left_1, avg3_left_1);

  for (j = 0; j < 2; ++j) {
    for (i = 0; i < 2; ++i) {
      const __m128i avg2_avg3 = avg2_avg3_left[j][i];
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 12), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 8), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, _mm_slli_si128(avg2_avg3, 4), 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
      row_1 = _mm_alignr_epi8(row_1, row_0, 12);
      row_0 = _mm_alignr_epi8(row_0, avg2_avg3, 12);
      _mm_store_si128((__m128i *)dst, row_0);
      _mm_store_si128((__m128i *)(dst + 8), row_1);
      dst += stride;
    }
  }
}
void __stdcall rgb32_to_planar_shader_3_f16c(uint8_t** dstp, const uint8_t** srcp,
    const int dpitch, const int spitch, const int width, const int height,
    void* buff) noexcept {
  const uint8_t* s = srcp[0] + (height - 1) * spitch;
  // map r to Y, g to U, b to V
  uint8_t* dr = dstp[0];
  uint8_t* dg = dstp[1];
  uint8_t* db = dstp[2];
  float* bb = reinterpret_cast<float*>(buff);
  float* bg = bb + ((width + 7) & ~7);  // must be 32-byte aligned
  float* br = bg + ((width + 7) & ~7);  // must be 32-byte aligned
  const __m128 rcp = _mm_set1_ps(1.0f / 255);
  const __m128i zero = _mm_setzero_si128();

  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; x += 8) {
      __m128i s0 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s + 4 * x + 0));
      __m128i s1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s + 4 * x + 8));
      __m128i s2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s + 4 * x + 16));
      __m128i s3 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(s + 4 * x + 24));
      s0 = _mm_unpacklo_epi8(s0, s1);
      s1 = _mm_unpacklo_epi8(s2, s3);
      s2 = _mm_unpacklo_epi16(s0, s1);
      s3 = _mm_unpackhi_epi16(s0, s1);
      s0 = _mm_unpacklo_epi8(s2, s3);
      s1 = _mm_unpackhi_epi8(s2, s3);

      __m128i r = _mm_unpacklo_epi8(s1, zero);
      s3 = _mm_unpacklo_epi16(r, zero);
      _mm_store_ps(br + x + 0, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));
      s3 = _mm_unpackhi_epi16(r, zero);
      _mm_store_ps(br + x + 4, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));

      __m128i b = _mm_unpacklo_epi8(s0, zero);
      __m128i g = _mm_unpackhi_epi8(s0, zero);
      s3 = _mm_unpacklo_epi16(b, zero);
      _mm_store_ps(bb + x + 0, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));
      s3 = _mm_unpackhi_epi16(b, zero);
      _mm_store_ps(bb + x + 4, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));
      s3 = _mm_unpacklo_epi16(g, zero);
      _mm_store_ps(bg + x + 0, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));
      s3 = _mm_unpackhi_epi16(g, zero);
      _mm_store_ps(bg + x + 4, _mm_mul_ps(rcp, _mm_cvtepi32_ps(s3)));
    }
    convert_float_to_half(dr, br, width);
    convert_float_to_half(dg, bg, width);
    convert_float_to_half(db, bb, width);
    s -= spitch;
    dr += dpitch;
    dg += dpitch;
    db += dpitch;
  }
}
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }

    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
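// A scalar model of the two masked-SAD kernels above (hypothetical
// reference, assuming AOM_BLEND_A64_ROUND_BITS == 6, i.e. mask values in
// [0, 64]); it mirrors the vector code, including the final
// (sad + 31) >> 6 normalization.
static unsigned int highbd_masked_sad_model(
    const uint16_t *src, int src_stride, const uint16_t *a, int a_stride,
    const uint16_t *b, int b_stride, const uint8_t *m, int m_stride,
    int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      // Blend the two predictors with a 6-bit mask, rounding to nearest.
      const int pred = (a[x] * m[x] + b[x] * (64 - m[x]) + 32) >> 6;
      const int diff = pred - src[x];
      sad += (diff < 0) ? -diff : diff;
    }
    src += src_stride;
    a += a_stride;
    b += b_stride;
    m += m_stride;
  }
  return (sad + 31) >> 6;
}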
SIMD_INLINE void UnpackMask32i(const uint8_t * src, uint32_t * dst, const __m128i & mask)
{
  __m128i s = _mm_and_si128(mask, _mm_loadu_si128((__m128i*)src));
  __m128i lo = _mm_unpacklo_epi8(s, _mm_setzero_si128());
  _mm_storeu_si128((__m128i*)dst + 0, _mm_unpacklo_epi16(lo, _mm_setzero_si128()));
  _mm_storeu_si128((__m128i*)dst + 1, _mm_unpackhi_epi16(lo, _mm_setzero_si128()));
  __m128i hi = _mm_unpackhi_epi8(s, _mm_setzero_si128());
  _mm_storeu_si128((__m128i*)dst + 2, _mm_unpacklo_epi16(hi, _mm_setzero_si128()));
  _mm_storeu_si128((__m128i*)dst + 3, _mm_unpackhi_epi16(hi, _mm_setzero_si128()));
}
static inline void
desc_to_olflags_v(__m128i descs[4], uint8_t vlan_flags,
                  struct rte_mbuf **rx_pkts)
{
  __m128i ptype0, ptype1, vtag0, vtag1;
  union {
    uint16_t e[4];
    uint64_t dword;
  } vol;

  /* mask everything except rss type */
  const __m128i rsstype_msk = _mm_set_epi16(
      0x0000, 0x0000, 0x0000, 0x0000,
      0x000F, 0x000F, 0x000F, 0x000F);

  /* map rss type to rss hash flag */
  const __m128i rss_flags = _mm_set_epi8(PKT_RX_FDIR, 0, 0, 0,
      0, 0, 0, PKT_RX_RSS_HASH,
      PKT_RX_RSS_HASH, 0, PKT_RX_RSS_HASH, 0,
      PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, PKT_RX_RSS_HASH, 0);

  /* mask everything except vlan present bit */
  const __m128i vlan_msk = _mm_set_epi16(
      0x0000, 0x0000, 0x0000, 0x0000,
      IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP,
      IXGBE_RXD_STAT_VP, IXGBE_RXD_STAT_VP);

  /* map vlan present (0x8) to ol_flags */
  const __m128i vlan_map = _mm_set_epi8(
      0, 0, 0, 0,
      0, 0, 0, vlan_flags,
      0, 0, 0, 0,
      0, 0, 0, 0);

  ptype0 = _mm_unpacklo_epi16(descs[0], descs[1]);
  ptype1 = _mm_unpacklo_epi16(descs[2], descs[3]);
  vtag0 = _mm_unpackhi_epi16(descs[0], descs[1]);
  vtag1 = _mm_unpackhi_epi16(descs[2], descs[3]);

  ptype0 = _mm_unpacklo_epi32(ptype0, ptype1);
  ptype0 = _mm_and_si128(ptype0, rsstype_msk);
  ptype0 = _mm_shuffle_epi8(rss_flags, ptype0);

  vtag1 = _mm_unpacklo_epi32(vtag0, vtag1);
  vtag1 = _mm_and_si128(vtag1, vlan_msk);
  vtag1 = _mm_shuffle_epi8(vlan_map, vtag1);

  vtag1 = _mm_or_si128(ptype0, vtag1);
  vol.dword = _mm_cvtsi128_si64(vtag1);

  rx_pkts[0]->ol_flags = vol.e[0];
  rx_pkts[1]->ol_flags = vol.e[1];
  rx_pkts[2]->ol_flags = vol.e[2];
  rx_pkts[3]->ol_flags = vol.e[3];
}
template <bool align> SIMD_INLINE void HogDirectionHistograms(
    const __m128i & t, const __m128i & l, const __m128i & r, const __m128i & b,
    Buffer & buffer, size_t col)
{
  HogDirectionHistograms<align>(
      _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpacklo_epi16(r, K_ZERO), _mm_unpacklo_epi16(l, K_ZERO))),
      _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpacklo_epi16(b, K_ZERO), _mm_unpacklo_epi16(t, K_ZERO))),
      buffer, col + 0);
  HogDirectionHistograms<align>(
      _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpackhi_epi16(r, K_ZERO), _mm_unpackhi_epi16(l, K_ZERO))),
      _mm_cvtepi32_ps(_mm_sub_epi32(_mm_unpackhi_epi16(b, K_ZERO), _mm_unpackhi_epi16(t, K_ZERO))),
      buffer, col + 4);
}
template <bool align> SIMD_INLINE void GrayToBgra(uint8_t * bgra, __m128i gray, __m128i alpha)
{
  __m128i bgLo = _mm_unpacklo_epi8(gray, gray);
  __m128i bgHi = _mm_unpackhi_epi8(gray, gray);
  __m128i raLo = _mm_unpacklo_epi8(gray, alpha);
  __m128i raHi = _mm_unpackhi_epi8(gray, alpha);

  Store<align>((__m128i*)bgra + 0, _mm_unpacklo_epi16(bgLo, raLo));
  Store<align>((__m128i*)bgra + 1, _mm_unpackhi_epi16(bgLo, raLo));
  Store<align>((__m128i*)bgra + 2, _mm_unpacklo_epi16(bgHi, raHi));
  Store<align>((__m128i*)bgra + 3, _mm_unpackhi_epi16(bgHi, raHi));
}
/*Transpose 8 vectors with 8 16-bit values.*/
OD_SIMD_INLINE void od_transpose16x8(__m128i *t0, __m128i *t1,
 __m128i *t2, __m128i *t3, __m128i *t4, __m128i *t5, __m128i *t6,
 __m128i *t7) {
  __m128i a0;
  __m128i b0;
  __m128i c0;
  __m128i d0;
  __m128i e0;
  __m128i f0;
  __m128i g0;
  __m128i h0;
  __m128i a1;
  __m128i b1;
  __m128i c1;
  __m128i d1;
  __m128i e1;
  __m128i f1;
  __m128i g1;
  __m128i h1;
  /*00112233*/
  a0 = _mm_unpacklo_epi16(*t0, *t1);
  b0 = _mm_unpacklo_epi16(*t2, *t3);
  c0 = _mm_unpacklo_epi16(*t4, *t5);
  d0 = _mm_unpacklo_epi16(*t6, *t7);
  /*44556677*/
  e0 = _mm_unpackhi_epi16(*t0, *t1);
  f0 = _mm_unpackhi_epi16(*t2, *t3);
  g0 = _mm_unpackhi_epi16(*t4, *t5);
  h0 = _mm_unpackhi_epi16(*t6, *t7);
  /*00001111*/
  a1 = _mm_unpacklo_epi32(a0, b0);
  b1 = _mm_unpacklo_epi32(c0, d0);
  /*22223333*/
  c1 = _mm_unpackhi_epi32(a0, b0);
  d1 = _mm_unpackhi_epi32(c0, d0);
  /*44445555*/
  e1 = _mm_unpacklo_epi32(e0, f0);
  f1 = _mm_unpacklo_epi32(g0, h0);
  /*66667777*/
  g1 = _mm_unpackhi_epi32(e0, f0);
  h1 = _mm_unpackhi_epi32(g0, h0);
  *t0 = _mm_unpacklo_epi64(a1, b1);
  *t1 = _mm_unpackhi_epi64(a1, b1);
  *t2 = _mm_unpacklo_epi64(c1, d1);
  *t3 = _mm_unpackhi_epi64(c1, d1);
  *t4 = _mm_unpacklo_epi64(e1, f1);
  *t5 = _mm_unpackhi_epi64(e1, f1);
  *t6 = _mm_unpacklo_epi64(g1, h1);
  *t7 = _mm_unpackhi_epi64(g1, h1);
}
static WEBP_INLINE void ConvertRGBToY_SSE41(const __m128i* const R,
                                            const __m128i* const G,
                                            const __m128i* const B,
                                            __m128i* const Y) {
  const __m128i kRG_y = MK_CST_16(16839, 33059 - 16384);
  const __m128i kGB_y = MK_CST_16(16384, 6420);
  const __m128i kHALF_Y = _mm_set1_epi32((16 << YUV_FIX) + YUV_HALF);

  const __m128i RG_lo = _mm_unpacklo_epi16(*R, *G);
  const __m128i RG_hi = _mm_unpackhi_epi16(*R, *G);
  const __m128i GB_lo = _mm_unpacklo_epi16(*G, *B);
  const __m128i GB_hi = _mm_unpackhi_epi16(*G, *B);
  TRANSFORM(RG_lo, RG_hi, GB_lo, GB_hi, kRG_y, kGB_y, kHALF_Y, YUV_FIX, *Y);
}
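// ConvertRGBToY_SSE41 depends on libwebp's MK_CST_16 and TRANSFORM helpers,
// which are not reproduced in this collection. The sketch below is an
// assumption consistent with how they are used, not the verbatim upstream
// macros: MK_CST_16 packs an (A, B) coefficient pair for _mm_madd_epi16,
// and TRANSFORM multiply-adds the two interleaved channel pairs, adds the
// rounding term, descales, and packs back to 16 bits.
#define MK_CST_16(A, B) _mm_set_epi16((B), (A), (B), (A), (B), (A), (B), (A))
#define TRANSFORM(RG_LO, RG_HI, GB_LO, GB_HI, MULT_RG, MULT_GB, ROUND_TERM, \
                  DESCALE_FIX, OUT)                                         \
  do {                                                                      \
    const __m128i lo = _mm_add_epi32(                                       \
        _mm_add_epi32(_mm_madd_epi16((RG_LO), (MULT_RG)),                   \
                      _mm_madd_epi16((GB_LO), (MULT_GB))),                  \
        (ROUND_TERM));                                                      \
    const __m128i hi = _mm_add_epi32(                                       \
        _mm_add_epi32(_mm_madd_epi16((RG_HI), (MULT_RG)),                   \
                      _mm_madd_epi16((GB_HI), (MULT_GB))),                  \
        (ROUND_TERM));                                                      \
    (OUT) = _mm_packs_epi32(_mm_srai_epi32(lo, (DESCALE_FIX)),              \
                            _mm_srai_epi32(hi, (DESCALE_FIX)));             \
  } while (0)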
static void GF_FUNC_ALIGN VS_CC
proc_8bit(int radius, float *kernel, const uint8_t *srcp, float *buff,
          float *dstp, int width, int height, int src_stride, int dst_stride)
{
  int length = radius * 2 + 1;
  const uint8_t *p[17];

  for (int i = -radius; i <= radius; i++) {
    p[i + radius] = srcp + abs(i) * src_stride;
  }

  __m128i zero = _mm_setzero_si128();

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < width; x += 16) {
      __m128 sum[4];
      sum[0] = _mm_setzero_ps();
      sum[1] = _mm_setzero_ps();
      sum[2] = _mm_setzero_ps();
      sum[3] = _mm_setzero_ps();

      for (int i = 0; i < length; i++) {
        __m128 f[4];
        __m128i xmm0 = _mm_load_si128((__m128i *)(p[i] + x));
        __m128i xmm1 = _mm_unpackhi_epi8(xmm0, zero);
        xmm0 = _mm_unpacklo_epi8(xmm0, zero);
        f[0] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
        f[1] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
        f[2] = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm1, zero));
        f[3] = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm1, zero));

        __m128 k = _mm_set1_ps(kernel[i]);
        for (int j = 0; j < 4; j++) {
          sum[j] = _mm_add_ps(sum[j], _mm_mul_ps(k, f[j]));
        }
      }

      _mm_store_ps(buff + x, sum[0]);
      _mm_store_ps(buff + x + 4, sum[1]);
      _mm_store_ps(buff + x + 8, sum[2]);
      _mm_store_ps(buff + x + 12, sum[3]);
    }
    proc_horizontal(buff, radius, length, width, kernel, dstp);

    for (int i = 0; i < length - 1; i++) {
      p[i] = p[i + 1];
    }
    p[length - 1] += (y < height - radius - 1 ? 1 : -1) * src_stride;
    dstp += dst_stride;
  }
}
void conv_Short1ToFloat2(void* dst, const void* s, s32 numSamples)
{
  LSfloat* d = reinterpret_cast<LSfloat*>(dst);
  const LSshort* src = reinterpret_cast<const LSshort*>(s);

  s32 num = numSamples >> 3;  // process 8 shorts at a time
  s32 offset = num << 3;
  s32 rem = numSamples - offset;

  const __m128i izero = _mm_setzero_si128();
  const __m128 fcoff = _mm_set1_ps(1.0f / 32767.0f);

  const LSshort* p = src;
  LSfloat* q = d;
  for (s32 i = 0; i < num; ++i) {
    __m128i t = _mm_loadu_si128((const __m128i*)p);

    // duplicate each sample (mono -> stereo)
    __m128i s16_0 = _mm_unpackhi_epi16(t, t);
    __m128i s16_1 = _mm_unpacklo_epi16(t, t);

    // sign-extend to 32 bits via a compare-generated sign mask
    __m128i t1 = _mm_cmpgt_epi16(izero, s16_0);
    __m128i t2 = _mm_cmpgt_epi16(izero, s16_1);
    __m128i s32_0 = _mm_unpackhi_epi16(s16_0, t1);
    __m128i s32_1 = _mm_unpacklo_epi16(s16_0, t1);
    __m128i s32_2 = _mm_unpackhi_epi16(s16_1, t2);
    __m128i s32_3 = _mm_unpacklo_epi16(s16_1, t2);

    // convert to 32-bit floating point
    __m128 f32_0 = _mm_mul_ps(_mm_cvtepi32_ps(s32_0), fcoff);
    __m128 f32_1 = _mm_mul_ps(_mm_cvtepi32_ps(s32_1), fcoff);
    __m128 f32_2 = _mm_mul_ps(_mm_cvtepi32_ps(s32_2), fcoff);
    __m128 f32_3 = _mm_mul_ps(_mm_cvtepi32_ps(s32_3), fcoff);

    _mm_storeu_ps((q + 0), f32_3);
    _mm_storeu_ps((q + 4), f32_2);
    _mm_storeu_ps((q + 8), f32_1);
    _mm_storeu_ps((q + 12), f32_0);
    p += 8;
    q += 16;
  }

  // scalar tail: duplicate each remaining sample into both channels
  for (s32 i = 0; i < rem; ++i) {
    s32 j = i << 1;
    q[j + 0] = toFloat(p[i]);
    q[j + 1] = toFloat(p[i]);
  }
}
inline static short
sse3_dot_prod (const uint16_t *p1, const uint16_t *p2, size_t size)
{
  /* Four 32-bit partial sums; uint32_t rather than unsigned long, which is
     8 bytes on LP64 platforms and would leave half the array uninitialized
     after the 16-byte _mm_storeu_si128 below.  */
  uint32_t res[4];
  unsigned int i;
  __m128i* mp1 = (__m128i *)p1;
  __m128i* mp2 = (__m128i *)p2;
  __m128i mres = _mm_set_epi32 (0, 0, 0, 0);

  for (i = 0; i < size; i += 8)
    {
      __m128i mreg1 = _mm_loadu_si128 (mp1);
      __m128i mreg2 = _mm_loadu_si128 (mp2);

      /* Zero-extend the low four 16-bit elements and multiply.
         Note: _mm_mullo_epi32 is SSE4.1, despite the function's name.  */
      __m128i xlo1 = _mm_unpacklo_epi16 (mreg1, _mm_set1_epi16 (0));
      __m128i xlo2 = _mm_unpacklo_epi16 (mreg2, _mm_set1_epi16 (0));
      __m128i mtmp = _mm_mullo_epi32 (xlo1, xlo2);
      mres = _mm_add_epi32 (mres, mtmp);

      /* Same for the high four elements.  */
      __m128i xhi1 = _mm_unpackhi_epi16 (mreg1, _mm_set1_epi16 (0));
      __m128i xhi2 = _mm_unpackhi_epi16 (mreg2, _mm_set1_epi16 (0));
      mtmp = _mm_mullo_epi32 (xhi1, xhi2);
      mres = _mm_add_epi32 (mres, mtmp);

      mp1++;
      mp2++;
    }

  __m128i* pmres = (__m128i *)res;
  _mm_storeu_si128 (pmres, mres);

  return res[0] + res[1] + res[2] + res[3];
}
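/* Scalar reference for the dot product above (hypothetical test helper).
   Note that the vector version requires size to be a multiple of 8. */
inline static short
scalar_dot_prod (const uint16_t *p1, const uint16_t *p2, size_t size)
{
  uint32_t acc = 0;
  size_t i;
  for (i = 0; i < size; i++)
    acc += (uint32_t)p1[i] * p2[i];  /* widen before multiplying, as the
                                        SIMD path does via unpack */
  return (short)acc;  /* truncated to match the SIMD routine's return type */
}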
// NOTE: STACK16 is assumed to be a template parameter selecting stacked
// 16-bit input (MSB plane followed by LSB plane); the original one-line
// snippet used it without declaring it.
template <bool STACK16>
static inline void yuv_to_planar_shader_3(uint8_t** dstp, const uint8_t** srcp,
    const int dpitch, const int spitch, const int width, const int height,
    void* _buff) noexcept {
  const __m128i zero = _mm_setzero_si128();
  const __m128 rcp = _mm_set1_ps(1.0f / (STACK16 ? 65535 : 255));
  float* buff = reinterpret_cast<float*>(_buff);

  for (int p = 0; p < 3; ++p) {
    const uint8_t* s = srcp[p];
    const uint8_t* lsb = s + height * spitch;
    uint8_t* d = dstp[p];

    for (int y = 0; y < height; ++y) {
      for (int x = 0; x < width; x += 16) {
        __m128i msbx = _mm_load_si128(reinterpret_cast<const __m128i*>(s + x));
        __m128i d0, lsbx;
        if (!STACK16) {
          d0 = _mm_unpacklo_epi8(msbx, zero);
        } else {
          lsbx = _mm_load_si128(reinterpret_cast<const __m128i*>(lsb + x));
          d0 = _mm_unpacklo_epi8(lsbx, msbx);
        }
        __m128 f0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(d0, zero));
        _mm_store_ps(buff + x + 0, _mm_mul_ps(rcp, f0));
        f0 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(d0, zero));
        _mm_store_ps(buff + x + 4, _mm_mul_ps(rcp, f0));
        if (!STACK16) {
          d0 = _mm_unpackhi_epi8(msbx, zero);
        } else {
          d0 = _mm_unpackhi_epi8(lsbx, msbx);
        }
        f0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(d0, zero));
        _mm_store_ps(buff + x + 8, _mm_mul_ps(rcp, f0));
        f0 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(d0, zero));
        _mm_store_ps(buff + x + 12, _mm_mul_ps(rcp, f0));
      }
      convert_float_to_half(d, buff, width);
      s += spitch;
      d += dpitch;
      if (STACK16) {
        lsb += spitch;
      }
    }
  }
}
//----------------------------------------------------------------------------
void conv_Short1ToFloat1(void* dst, const void* s, s32 numSamples)
{
  LSfloat* d = reinterpret_cast<LSfloat*>(dst);
  const LSshort* src = reinterpret_cast<const LSshort*>(s);

  s32 num = numSamples >> 3;  // process 8 shorts at a time
  s32 offset = num << 3;
  s32 rem = numSamples - offset;

  const __m128i izero = _mm_setzero_si128();
  const __m128 fcoff = _mm_set1_ps(1.0f / 32767.0f);

  const LSshort* p = src;
  LSfloat* q = d;
  for (s32 i = 0; i < num; ++i) {
    // sign-extend and convert to 32-bit floats r0, r1
    __m128i t0 = _mm_loadu_si128((const __m128i*)p);
    __m128i t1 = _mm_cmpgt_epi16(izero, t0);
    __m128 r0 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(t0, t1));
    __m128 r1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(t0, t1));
    r0 = _mm_mul_ps(r0, fcoff);
    r1 = _mm_mul_ps(r1, fcoff);
    _mm_storeu_ps((q + 0), r1);
    _mm_storeu_ps((q + 4), r0);
    p += 8;
    q += 8;
  }

  for (s32 i = 0; i < rem; ++i) {
    q[i] = toFloat(p[i]);
  }
}
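//----------------------------------------------------------------------------
// Both conv_Short1To* routines above fall back to a scalar toFloat() for the
// tail samples. A minimal sketch of a matching definition, assuming the same
// 1/32767 scale used in the SIMD paths:
inline LSfloat toFloat(LSshort v)
{
  return v * (1.0f / 32767.0f);  // map [-32767, 32767] to roughly [-1, 1]
}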
template<int shift, int active_bits>
void Haar_invtransform_H_final_1_sse4_2_int16_t(void *_idata,
                                                const int istride,
                                                const char *odata,
                                                const int ostride,
                                                const int iwidth,
                                                const int iheight,
                                                const int ooffset_x,
                                                const int ooffset_y,
                                                const int owidth,
                                                const int oheight) {
  int16_t *idata = (int16_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi16(1);
  const __m128i OFFSET = _mm_set1_epi16(1 << (active_bits - 1));
  const __m128i SHUF = _mm_set_epi8(15, 14, 11, 10, 7, 6, 3, 2,
                                    13, 12, 9, 8, 5, 4, 1, 0);
  const __m128i CLIP = _mm_set1_epi16((1 << active_bits) - 1);
  const __m128i ZERO = _mm_set1_epi16(0);
  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y += skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 16) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y * istride + x + 0]);
      __m128i D8 = _mm_load_si128((__m128i *)&idata[y * istride + x + 8]);

      D0 = _mm_shuffle_epi8(D0, SHUF);
      D8 = _mm_shuffle_epi8(D8, SHUF);
      __m128i E0 = _mm_unpacklo_epi64(D0, D8);
      __m128i O1 = _mm_unpackhi_epi64(D0, D8);

      __m128i X0 = _mm_sub_epi16(E0, _mm_srai_epi16(_mm_add_epi16(O1, ONE), 1));
      __m128i X1 = _mm_add_epi16(O1, X0);

      __m128i Z0 = _mm_unpacklo_epi16(X0, X1);
      __m128i Z8 = _mm_unpackhi_epi16(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi16(Z0, ONE);
        Z8 = _mm_add_epi16(Z8, ONE);
        Z0 = _mm_srai_epi16(Z0, shift);
        Z8 = _mm_srai_epi16(Z8, shift);
      }

      Z0 = _mm_add_epi16(Z0, OFFSET);
      Z8 = _mm_add_epi16(Z8, OFFSET);
      Z0 = _mm_min_epi16(Z0, CLIP);
      Z8 = _mm_min_epi16(Z8, CLIP);
      Z0 = _mm_max_epi16(Z0, ZERO);
      Z8 = _mm_max_epi16(Z8, ZERO);

      _mm_store_si128((__m128i *)&odata[2 * ((y - ooffset_y) * ostride + x + 0 - ooffset_x)], Z0);
      _mm_store_si128((__m128i *)&odata[2 * ((y - ooffset_y) * ostride + x + 8 - ooffset_x)], Z8);
    }
  }
}
static void ConvertBGRAToRGBA(const uint32_t* src, int num_pixels,
                              uint8_t* dst) {
  const __m128i* in = (const __m128i*)src;
  __m128i* out = (__m128i*)dst;
  while (num_pixels >= 8) {
    const __m128i bgra0 = _mm_loadu_si128(in++);     // bgra0|bgra1|bgra2|bgra3
    const __m128i bgra4 = _mm_loadu_si128(in++);     // bgra4|bgra5|bgra6|bgra7
    const __m128i v0l = _mm_unpacklo_epi8(bgra0, bgra4);  // b0b4g0g4r0r4a0a4...
    const __m128i v0h = _mm_unpackhi_epi8(bgra0, bgra4);  // b2b6g2g6r2r6a2a6...
    const __m128i v1l = _mm_unpacklo_epi8(v0l, v0h);   // b0b2b4b6g0g2g4g6...
    const __m128i v1h = _mm_unpackhi_epi8(v0l, v0h);   // b1b3b5b7g1g3g5g7...
    const __m128i v2l = _mm_unpacklo_epi8(v1l, v1h);   // b0...b7 | g0...g7
    const __m128i v2h = _mm_unpackhi_epi8(v1l, v1h);   // r0...r7 | a0...a7
    const __m128i ga0 = _mm_unpackhi_epi64(v2l, v2h);  // g0...g7 | a0...a7
    const __m128i rb0 = _mm_unpacklo_epi64(v2h, v2l);  // r0...r7 | b0...b7
    const __m128i rg0 = _mm_unpacklo_epi8(rb0, ga0);   // r0g0r1g1 ... r6g6r7g7
    const __m128i ba0 = _mm_unpackhi_epi8(rb0, ga0);   // b0a0b1a1 ... b6a6b7a7
    const __m128i rgba0 = _mm_unpacklo_epi16(rg0, ba0);  // rgba0|rgba1...
    const __m128i rgba4 = _mm_unpackhi_epi16(rg0, ba0);  // rgba4|rgba5...
    _mm_storeu_si128(out++, rgba0);
    _mm_storeu_si128(out++, rgba4);
    num_pixels -= 8;
  }
  // left-overs
  VP8LConvertBGRAToRGBA_C((const uint32_t*)in, num_pixels, (uint8_t*)out);
}
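// The left-over loop above delegates to libwebp's scalar fallback. A minimal
// sketch of what such a per-pixel fallback does (an assumption, not the
// upstream implementation; assumes little-endian byte order, so a BGRA pixel
// loads as 0xAARRGGBB):
static void ConvertBGRAToRGBA_Scalar(const uint32_t* src, int num_pixels,
                                     uint8_t* dst) {
  int i;
  for (i = 0; i < num_pixels; ++i) {
    const uint32_t bgra = src[i];
    *dst++ = (bgra >> 16) & 0xff;  // R
    *dst++ = (bgra >>  8) & 0xff;  // G
    *dst++ = (bgra >>  0) & 0xff;  // B
    *dst++ = (bgra >> 24) & 0xff;  // A
  }
}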
static long
conv_rgba16_rgbaF (const uint16_t *src, float *dst, long samples)
{
  long i = 0;

  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
    {
      long n = (samples / 2) * 2;
      const __m128i *s = (const __m128i*) src;
      __v4sf *d = (__v4sf*) dst;

      for (; i < n / 2; i++)
        {
          /* Expand shorts to ints by loading zero in the high bits */
          const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
          const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());

          /* Convert to float */
          const __m128 u0 = _mm_cvtepi32_ps (t0);
          const __m128 u1 = _mm_cvtepi32_ps (t1);

          const __v4sf rgba0 = u0 * u16_float;
          const __v4sf rgba1 = u1 * u16_float;

          d[2 * i + 0] = rgba0;
          d[2 * i + 1] = rgba1;
        }
      _mm_empty();
    }

  for (i *= 2 * 4; i != 4 * samples; i++)
    dst[i] = src[i] * (1.f / 65535);

  return samples;
}
/* Routine optimized for unshuffling a buffer for a type size of 4 bytes. */
static void unshuffle4_sse2(uint8_t* const dest, const uint8_t* const src,
                            const size_t vectorizable_elements,
                            const size_t total_elements) {
  static const size_t bytesoftype = 4;
  size_t i;
  int j;
  __m128i xmm0[4], xmm1[4];

  for (i = 0; i < vectorizable_elements; i += sizeof(__m128i)) {
    /* Load 16 elements (64 bytes) into 4 XMM registers. */
    const uint8_t* const src_for_ith_element = src + i;
    for (j = 0; j < 4; j++) {
      xmm0[j] = _mm_loadu_si128((__m128i*)(src_for_ith_element + (j * total_elements)));
    }
    /* Shuffle bytes */
    for (j = 0; j < 2; j++) {
      /* Compute the low 16 bytes */
      xmm1[j] = _mm_unpacklo_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
      /* Compute the hi 16 bytes */
      xmm1[2 + j] = _mm_unpackhi_epi8(xmm0[j * 2], xmm0[j * 2 + 1]);
    }
    /* Shuffle 2-byte words */
    for (j = 0; j < 2; j++) {
      /* Compute the low 16 bytes */
      xmm0[j] = _mm_unpacklo_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
      /* Compute the hi 16 bytes */
      xmm0[2 + j] = _mm_unpackhi_epi16(xmm1[j * 2], xmm1[j * 2 + 1]);
    }
    /* Store the result vectors in proper order */
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (0 * sizeof(__m128i))), xmm0[0]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (1 * sizeof(__m128i))), xmm0[2]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (2 * sizeof(__m128i))), xmm0[1]);
    _mm_storeu_si128((__m128i*)(dest + (i * bytesoftype) + (3 * sizeof(__m128i))), xmm0[3]);
  }
}
int aom_satd_sse2(const tran_low_t *coeff, int length) {
  int i;
  const __m128i zero = _mm_setzero_si128();
  __m128i accum = zero;

  for (i = 0; i < length; i += 8) {
    const __m128i src_line = load_tran_low(coeff);
    const __m128i inv = _mm_sub_epi16(zero, src_line);
    const __m128i abs = _mm_max_epi16(src_line, inv);  // abs(src_line)
    const __m128i abs_lo = _mm_unpacklo_epi16(abs, zero);
    const __m128i abs_hi = _mm_unpackhi_epi16(abs, zero);
    const __m128i sum = _mm_add_epi32(abs_lo, abs_hi);
    accum = _mm_add_epi32(accum, sum);
    coeff += 8;
  }

  {  // cascading summation of accum
    __m128i hi = _mm_srli_si128(accum, 8);
    accum = _mm_add_epi32(accum, hi);
    hi = _mm_srli_epi64(accum, 32);
    accum = _mm_add_epi32(accum, hi);
  }

  return _mm_cvtsi128_si32(accum);
}
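// Scalar model of aom_satd_sse2 (hypothetical reference): the "SATD" here is
// simply the sum of absolute values of the already-transformed coefficients.
static int satd_scalar(const tran_low_t *coeff, int length) {
  int sum = 0;
  int i;
  for (i = 0; i < length; ++i) {
    const int c = (int)coeff[i];
    sum += (c < 0) ? -c : c;
  }
  return sum;
}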
void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0, in1;
  __m128i tmp;
  const __m128i zero = _mm_setzero_si128();
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(
      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(
      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));

  tmp = _mm_add_epi16(in0, in1);
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

  in1 = _mm_add_epi32(tmp, in0);
  in0 = _mm_slli_epi32(in1, 1);
  store_output(&in0, output);
}
void SoundSSE::unpack_16bit_stereo(short *input, int size, float *output[2])
{
#ifndef CL_DISABLE_SSE2
  int sse_size = (size / 8) * 8;

  __m128i zero = _mm_setzero_si128();
  __m128 constant1 = _mm_set1_ps(1.0f / 32768.0f);
  for (int i = 0; i < sse_size; i += 8)
  {
    __m128i isamples = _mm_loadu_si128((__m128i*)(input + i));
    // Sign-extend by unpacking into the high half, then shifting back down.
    __m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples), 16));
    __m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples), 16));
    samples0 = _mm_mul_ps(samples0, constant1);
    samples1 = _mm_mul_ps(samples1, constant1);

    // De-interleave left/right channels.
    __m128 tmp0, tmp1;
    tmp0 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(2, 0, 2, 0));
    tmp1 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(3, 1, 3, 1));
    _mm_storeu_ps(output[0] + i / 2, tmp0);
    _mm_storeu_ps(output[1] + i / 2, tmp1);
  }
#else
  const int sse_size = 0;
#endif

  // unpack remaining, using the same 1/32768 scale as the SSE path above
  // (the original tail divided by 32767, which was inconsistent)
  for (int i = sse_size; i < size; i += 2)
  {
    output[0][i / 2] = ((float)input[i]) / 32768.0f;
    output[1][i / 2] = ((float)input[i + 1]) / 32768.0f;
  }
}
void SoundSSE::unpack_16bit_mono(short *input, int size, float *output)
{
#ifndef CL_DISABLE_SSE2
  int sse_size = (size / 8) * 8;

  __m128i zero = _mm_setzero_si128();
  __m128 constant1 = _mm_set1_ps(1.0f / 32767.0f);
  for (int i = 0; i < sse_size; i += 8)
  {
    __m128i isamples = _mm_loadu_si128((__m128i*)(input + i));
    __m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples), 16));
    __m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples), 16));
    samples0 = _mm_mul_ps(samples0, constant1);
    samples1 = _mm_mul_ps(samples1, constant1);
    _mm_storeu_ps(output + i + 0, samples0);
    _mm_storeu_ps(output + i + 4, samples1);
  }
#else
  const int sse_size = 0;
#endif

  // unpack remaining
  for (int i = sse_size; i < size; i++)
  {
    output[i] = ((float)input[i]) / 32767.0f;
  }
}