// Branchless absolute value of four packed int32s:
// mask = x >> 31 (all ones if x is negative), |x| = (x ^ mask) - mask.
static inline __m128i local_abs_epi32(__m128i val)
{
    __m128i mask = _mm_srai_epi32(val, 31);
    val = _mm_xor_si128(val, mask);
    val = _mm_sub_epi32(val, mask);
    return val;
}
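A minimal, hypothetical self-test for the sign-mask trick above (the harness is an assumption, not part of the original source; it only relies on the local_abs_epi32 definition shown):

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    // Note: INT32_MIN has no positive counterpart, so it is excluded here.
    __m128i v = _mm_set_epi32(-7, 0, 42, -2147483647);
    __m128i r = local_abs_epi32(v);
    int out[4];
    _mm_storeu_si128((__m128i *)out, r);
    printf("%d %d %d %d\n", out[0], out[1], out[2], out[3]); // 2147483647 42 0 7
    return 0;
}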
void SoundSSE::unpack_16bit_mono(short *input, int size, float *output)
{
#ifndef CL_DISABLE_SSE2
    int sse_size = (size/8)*8;

    __m128i zero = _mm_setzero_si128();
    __m128 constant1 = _mm_set1_ps(1.0f/32767.0f);
    for (int i = 0; i < sse_size; i+=8)
    {
        __m128i isamples = _mm_loadu_si128((__m128i*)(input+i));
        // Widen 16-bit samples to 32 bits: unpack them into the high
        // halves, then arithmetic-shift right by 16 to sign-extend.
        __m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples), 16));
        __m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples), 16));
        samples0 = _mm_mul_ps(samples0, constant1);
        samples1 = _mm_mul_ps(samples1, constant1);
        _mm_storeu_ps(output+i+0, samples0);
        _mm_storeu_ps(output+i+4, samples1);
    }
#else
    const int sse_size = 0;
#endif

    // unpack remaining
    for (int i = sse_size; i < size; i++)
    {
        output[i] = ((float) input[i]) / 32767.0f;
    }
}
void SoundSSE::unpack_16bit_stereo(short *input, int size, float *output[2])
{
#ifndef CL_DISABLE_SSE2
    int sse_size = (size/8)*8;

    __m128i zero = _mm_setzero_si128();
    // Use the same normalization factor as the scalar tail below
    // (the original mixed 1/32768 here with 1/32767 in the tail).
    __m128 constant1 = _mm_set1_ps(1.0f/32767.0f);
    for (int i = 0; i < sse_size; i+=8)
    {
        __m128i isamples = _mm_loadu_si128((__m128i*)(input+i));
        __m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples), 16));
        __m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples), 16));
        samples0 = _mm_mul_ps(samples0, constant1);
        samples1 = _mm_mul_ps(samples1, constant1);

        // De-interleave L/R pairs into the two output channels.
        __m128 tmp0, tmp1;
        tmp0 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(2,0,2,0));
        tmp1 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(3,1,3,1));
        _mm_storeu_ps(output[0]+i/2, tmp0);
        _mm_storeu_ps(output[1]+i/2, tmp1);
    }
#else
    const int sse_size = 0;
#endif

    // unpack remaining
    for (int i = sse_size; i < size; i+=2)
    {
        output[0][i/2] = ((float) input[i]) / 32767.0f;
        output[1][i/2] = ((float) input[i+1]) / 32767.0f;
    }
}
void av1_fdct4x4_1_sse2(const int16_t *input, tran_low_t *output, int stride) {
  __m128i in0, in1;
  __m128i tmp;
  const __m128i zero = _mm_setzero_si128();
  in0 = _mm_loadl_epi64((const __m128i *)(input + 0 * stride));
  in1 = _mm_loadl_epi64((const __m128i *)(input + 1 * stride));
  in1 = _mm_unpacklo_epi64(
      in1, _mm_loadl_epi64((const __m128i *)(input + 2 * stride)));
  in0 = _mm_unpacklo_epi64(
      in0, _mm_loadl_epi64((const __m128i *)(input + 3 * stride)));

  tmp = _mm_add_epi16(in0, in1);
  // Sign-extend the 16-bit sums to 32 bits: unpack into the high
  // halves, then arithmetic-shift right by 16.
  in0 = _mm_unpacklo_epi16(zero, tmp);
  in1 = _mm_unpackhi_epi16(zero, tmp);
  in0 = _mm_srai_epi32(in0, 16);
  in1 = _mm_srai_epi32(in1, 16);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_unpacklo_epi32(tmp, zero);
  in1 = _mm_unpackhi_epi32(tmp, zero);

  tmp = _mm_add_epi32(in0, in1);
  in0 = _mm_srli_si128(tmp, 8);

  in1 = _mm_add_epi32(tmp, in0);
  in0 = _mm_slli_epi32(in1, 1);
  store_output(&in0, output);
}
void
interpolate_gint16_linear_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, t1, t2;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) ic));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride)
  };

  /* Broadcast the two 16-bit taps to all four 32-bit lanes. */
  f = _mm_unpacklo_epi32 (f, f);
  f = _mm_unpacklo_epi64 (f, f);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    t1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f);
    t2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f);

    /* Round and shift the fixed-point products back to 16 bits. */
    t1 = _mm_add_epi32 (t1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    t2 = _mm_add_epi32 (t2, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    t1 = _mm_srai_epi32 (t1, PRECISION_S16);
    t2 = _mm_srai_epi32 (t2, PRECISION_S16);

    t1 = _mm_packs_epi32 (t1, t2);
    _mm_store_si128 ((__m128i *) (o + i), t1);
  }
}
static void FTransformWHT(const int16_t* in, int16_t* out) {
  int32_t tmp[16];
  int i;
  for (i = 0; i < 4; ++i, in += 64) {
    const int a0 = (in[0 * 16] + in[2 * 16]);
    const int a1 = (in[1 * 16] + in[3 * 16]);
    const int a2 = (in[1 * 16] - in[3 * 16]);
    const int a3 = (in[0 * 16] - in[2 * 16]);
    tmp[0 + i * 4] = a0 + a1;
    tmp[1 + i * 4] = a3 + a2;
    tmp[2 + i * 4] = a3 - a2;
    tmp[3 + i * 4] = a0 - a1;
  }
  {
    const __m128i src0 = _mm_loadu_si128((__m128i*)&tmp[0]);
    const __m128i src1 = _mm_loadu_si128((__m128i*)&tmp[4]);
    const __m128i src2 = _mm_loadu_si128((__m128i*)&tmp[8]);
    const __m128i src3 = _mm_loadu_si128((__m128i*)&tmp[12]);
    const __m128i a0 = _mm_add_epi32(src0, src2);
    const __m128i a1 = _mm_add_epi32(src1, src3);
    const __m128i a2 = _mm_sub_epi32(src1, src3);
    const __m128i a3 = _mm_sub_epi32(src0, src2);
    const __m128i b0 = _mm_srai_epi32(_mm_add_epi32(a0, a1), 1);
    const __m128i b1 = _mm_srai_epi32(_mm_add_epi32(a3, a2), 1);
    const __m128i b2 = _mm_srai_epi32(_mm_sub_epi32(a3, a2), 1);
    const __m128i b3 = _mm_srai_epi32(_mm_sub_epi32(a0, a1), 1);
    const __m128i out0 = _mm_packs_epi32(b0, b1);
    const __m128i out1 = _mm_packs_epi32(b2, b3);
    _mm_storeu_si128((__m128i*)&out[0], out0);
    _mm_storeu_si128((__m128i*)&out[8], out1);
  }
}
static void transClipPixel(uint32_t *src, int src_stride, __m128i *u, int bd) {
  __m128i v0, v1;
  __m128i rnd = _mm_set1_epi32(1 << (FILTER_BITS - 1));

  u[0] = _mm_loadu_si128((__m128i const *)src);
  u[1] = _mm_loadu_si128((__m128i const *)(src + src_stride));
  u[2] = _mm_loadu_si128((__m128i const *)(src + 2 * src_stride));
  u[3] = _mm_loadu_si128((__m128i const *)(src + 3 * src_stride));

  // Round to nearest, then drop the filter precision bits.
  u[0] = _mm_add_epi32(u[0], rnd);
  u[1] = _mm_add_epi32(u[1], rnd);
  u[2] = _mm_add_epi32(u[2], rnd);
  u[3] = _mm_add_epi32(u[3], rnd);

  u[0] = _mm_srai_epi32(u[0], FILTER_BITS);
  u[1] = _mm_srai_epi32(u[1], FILTER_BITS);
  u[2] = _mm_srai_epi32(u[2], FILTER_BITS);
  u[3] = _mm_srai_epi32(u[3], FILTER_BITS);

  u[0] = _mm_packus_epi32(u[0], u[1]);
  u[1] = _mm_packus_epi32(u[2], u[3]);

  highbd_clip(u, 2, bd);

  // Transpose the 4x4 block of 16-bit pixels.
  v0 = _mm_unpacklo_epi16(u[0], u[1]);
  v1 = _mm_unpackhi_epi16(u[0], u[1]);

  u[0] = _mm_unpacklo_epi16(v0, v1);
  u[2] = _mm_unpackhi_epi16(v0, v1);

  u[1] = _mm_srli_si128(u[0], 8);
  u[3] = _mm_srli_si128(u[2], 8);
}
static inline void
inner_product_gint16_cubic_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[4], t[4];
  __m128i f = _mm_set_epi64x (0, *((long long *) icoeff));
  const gint16 *c[4] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride),
    (gint16 *) ((gint8 *) b + 2 * bstride),
    (gint16 *) ((gint8 *) b + 3 * bstride)
  };

  sum[0] = sum[1] = sum[2] = sum[3] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 8) {
    t[0] = _mm_loadu_si128 ((__m128i *) (a + i));
    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[0] + i))));
    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[1] + i))));
    sum[2] = _mm_add_epi32 (sum[2], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[2] + i))));
    sum[3] = _mm_add_epi32 (sum[3], _mm_madd_epi16 (t[0],
            _mm_load_si128 ((__m128i *) (c[3] + i))));
  }
  t[0] = _mm_unpacklo_epi32 (sum[0], sum[1]);
  t[1] = _mm_unpacklo_epi32 (sum[2], sum[3]);
  t[2] = _mm_unpackhi_epi32 (sum[0], sum[1]);
  t[3] = _mm_unpackhi_epi32 (sum[2], sum[3]);
  sum[0] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[0], t[1]),
      _mm_unpackhi_epi64 (t[0], t[1]));
  sum[2] =
      _mm_add_epi32 (_mm_unpacklo_epi64 (t[2], t[3]),
      _mm_unpackhi_epi64 (t[2], t[3]));
  sum[0] = _mm_add_epi32 (sum[0], sum[2]);

  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_madd_epi16 (sum[0], f);

  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0],
          _MM_SHUFFLE (2, 3, 2, 3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0],
          _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
static INLINE unsigned int highbd_masked_sad4xh_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)a_ptr),
                           _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)b_ptr),
                           _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]));
    // Zero-extend mask to 16 bits
    const __m128i m = _mm_unpacklo_epi8(
        _mm_unpacklo_epi32(
            _mm_cvtsi32_si128(*(const uint32_t *)m_ptr),
            _mm_cvtsi32_si128(*(const uint32_t *)&m_ptr[m_stride])),
        _mm_setzero_si128());
    const __m128i m_inv = _mm_sub_epi16(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi16(a, b);
    const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
    __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
    pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpackhi_epi16(a, b);
    const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
    __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
    pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                            AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
    const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
    res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
static INLINE unsigned int highbd_masked_sad_ssse3(
    const uint8_t *src8, int src_stride, const uint8_t *a8, int a_stride,
    const uint8_t *b8, int b_stride, const uint8_t *m_ptr, int m_stride,
    int width, int height) {
  const uint16_t *src_ptr = CONVERT_TO_SHORTPTR(src8);
  const uint16_t *a_ptr = CONVERT_TO_SHORTPTR(a8);
  const uint16_t *b_ptr = CONVERT_TO_SHORTPTR(b8);
  int x, y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi16((1 << AOM_BLEND_A64_ROUND_BITS));
  const __m128i round_const =
      _mm_set1_epi32((1 << AOM_BLEND_A64_ROUND_BITS) >> 1);
  const __m128i one = _mm_set1_epi16(1);

  for (y = 0; y < height; y++) {
    for (x = 0; x < width; x += 8) {
      const __m128i src = _mm_loadu_si128((const __m128i *)&src_ptr[x]);
      const __m128i a = _mm_loadu_si128((const __m128i *)&a_ptr[x]);
      const __m128i b = _mm_loadu_si128((const __m128i *)&b_ptr[x]);
      // Zero-extend mask to 16 bits
      const __m128i m = _mm_unpacklo_epi8(
          _mm_loadl_epi64((const __m128i *)&m_ptr[x]), _mm_setzero_si128());
      const __m128i m_inv = _mm_sub_epi16(mask_max, m);

      const __m128i data_l = _mm_unpacklo_epi16(a, b);
      const __m128i mask_l = _mm_unpacklo_epi16(m, m_inv);
      __m128i pred_l = _mm_madd_epi16(data_l, mask_l);
      pred_l = _mm_srai_epi32(_mm_add_epi32(pred_l, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      const __m128i data_r = _mm_unpackhi_epi16(a, b);
      const __m128i mask_r = _mm_unpackhi_epi16(m, m_inv);
      __m128i pred_r = _mm_madd_epi16(data_r, mask_r);
      pred_r = _mm_srai_epi32(_mm_add_epi32(pred_r, round_const),
                              AOM_BLEND_A64_ROUND_BITS);

      // Note: the maximum value in pred_l/r is (2^bd)-1 < 2^15,
      // so it is safe to do signed saturation here.
      const __m128i pred = _mm_packs_epi32(pred_l, pred_r);
      // There is no 16-bit SAD instruction, so we have to synthesize
      // an 8-element SAD. We do this by storing 4 32-bit partial SADs,
      // and accumulating them at the end
      const __m128i diff = _mm_abs_epi16(_mm_sub_epi16(pred, src));
      res = _mm_add_epi32(res, _mm_madd_epi16(diff, one));
    }
    src_ptr += src_stride;
    a_ptr += a_stride;
    b_ptr += b_stride;
    m_ptr += m_stride;
  }
  // At this point, we have four 32-bit partial SADs stored in 'res'.
  res = _mm_hadd_epi32(res, res);
  res = _mm_hadd_epi32(res, res);
  int sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
template<> void cvtScale_<short, int, float>( const short* src, size_t sstep,
                                              int* dst, size_t dstep, Size size,
                                              float scale, float shift )
{
    sstep /= sizeof(src[0]);
    dstep /= sizeof(dst[0]);

    for( ; size.height--; src += sstep, dst += dstep )
    {
        int x = 0;
        #if CV_SSE2
        if(USE_SSE2)//~5X
        {
            __m128 scale128 = _mm_set1_ps (scale);
            __m128 shift128 = _mm_set1_ps (shift);
            for(; x <= size.width - 8; x += 8 )
            {
                __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x));
                __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4));
                // Sign-extend 16-bit values to 32 bits via unpack + arithmetic shift.
                __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16));
                __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16));
                rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128);
                rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128);
                r0 = _mm_cvtps_epi32(rf0);
                r1 = _mm_cvtps_epi32(rf1);

                _mm_storeu_si128((__m128i*)(dst + x), r0);
                _mm_storeu_si128((__m128i*)(dst + x + 4), r1);
            }
        }
        #endif

        // We will wait for Haswell
        /*
        #if CV_AVX
        if(USE_AVX)//2X - bad variant
        {
            ////TODO: AVX implementation (optimization?) required
            __m256 scale256 = _mm256_set1_ps (scale);
            __m256 shift256 = _mm256_set1_ps (shift);
            for(; x <= size.width - 8; x += 8 )
            {
                __m256i buf = _mm256_set_epi32((int)(*(src+x+7)),(int)(*(src+x+6)),(int)(*(src+x+5)),(int)(*(src+x+4)),(int)(*(src+x+3)),(int)(*(src+x+2)),(int)(*(src+x+1)),(int)(*(src+x)));
                __m256 r0 = _mm256_add_ps( _mm256_mul_ps(_mm256_cvtepi32_ps (buf), scale256), shift256);
                __m256i res = _mm256_cvtps_epi32(r0);
                _mm256_storeu_si256 ((__m256i*)(dst+x), res);
            }
        }
        #endif
        */

        for(; x < size.width; x++ )
            dst[x] = saturate_cast<int>(src[x]*scale + shift);
    }
}
template<int shift, int active_bits> void Haar_invtransform_H_final_1_sse4_2_int32_t
  (void *_idata,
   const int istride,
   const char *odata,
   const int ostride,
   const int iwidth,
   const int iheight,
   const int ooffset_x,
   const int ooffset_y,
   const int owidth,
   const int oheight)
{
  int32_t *idata = (int32_t *)_idata;
  const int skip = 1;
  const __m128i ONE = _mm_set1_epi32(1);
  const __m128i OFFSET = _mm_set1_epi32(1 << (active_bits - 1));

  (void)iwidth;
  (void)iheight;

  for (int y = ooffset_y; y < ooffset_y + oheight; y+=skip) {
    for (int x = ooffset_x; x < ooffset_x + owidth; x += 8) {
      __m128i D0 = _mm_load_si128((__m128i *)&idata[y*istride + x + 0]);
      __m128i D4 = _mm_load_si128((__m128i *)&idata[y*istride + x + 4]);

      // De-interleave into even (E0) and odd (O1) samples.
      __m128i A0 = _mm_unpacklo_epi32(D0, D4);
      __m128i A2 = _mm_unpackhi_epi32(D0, D4);
      __m128i E0 = _mm_unpacklo_epi32(A0, A2);
      __m128i O1 = _mm_unpackhi_epi32(A0, A2);

      // Inverse Haar step: X0 = E0 - (O1 + 1)/2, X1 = O1 + X0.
      __m128i X0 = _mm_sub_epi32(E0, _mm_srai_epi32(_mm_add_epi32(O1, ONE), 1));
      __m128i X1 = _mm_add_epi32(O1, X0);

      // Re-interleave the reconstructed samples.
      __m128i Z0 = _mm_unpacklo_epi32(X0, X1);
      __m128i Z4 = _mm_unpackhi_epi32(X0, X1);

      if (shift != 0) {
        Z0 = _mm_add_epi32(Z0, ONE);
        Z4 = _mm_add_epi32(Z4, ONE);
        Z0 = _mm_srai_epi32(Z0, shift);
        Z4 = _mm_srai_epi32(Z4, shift);
      }

      Z0 = _mm_add_epi32(Z0, OFFSET);
      Z4 = _mm_add_epi32(Z4, OFFSET);

      Z0 = _mm_slli_epi32(Z0, (16 - active_bits));
      Z4 = _mm_slli_epi32(Z4, (16 - active_bits));

      __m128i R = _mm_packus_epi32(Z0, Z4);
      R = _mm_srli_epi16(R, (16 - active_bits));

      _mm_store_si128((__m128i *)&odata[2*((y - ooffset_y)*ostride + x - ooffset_x)], R);
    }
  }
}
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
                uint8_t *d, const uint8_t *s, int th)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *orig = p0, *end = p2;

    line_copy16(p0, srcp + stride, width, 1);
    line_copy16(p1, srcp, width, 1);

    int16_t threshold = (int16_t)th;
    __m128i zero = _mm_setzero_si128();
    __m128i xth = _mm_set1_epi16(threshold);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy16(p2, srcp, width, 1);
        uint16_t *coordinates[] = COORDINATES;

        for (int x = 0; x < width; x += 8) {
            __m128i sumlo = zero;
            __m128i sumhi = zero;

            // Sum the eight neighbouring pixels in 32-bit precision.
            for (int i = 0; i < 8; i++) {
                __m128i target = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                sumlo = _mm_add_epi32(sumlo, _mm_unpacklo_epi16(target, zero));
                sumhi = _mm_add_epi32(sumhi, _mm_unpackhi_epi16(target, zero));
            }

            // Divide by 8 and pack back down to 16 bits.
            sumlo = _mm_srai_epi32(sumlo, 3);
            sumhi = _mm_srai_epi32(sumhi, 3);
            sumlo = mm_cast_epi32(sumlo, sumhi);

            // Clamp the result to [src, src + threshold].
            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i limit = _mm_adds_epu16(src, xth);

            sumlo = MM_MAX_EPU16(sumlo, src);
            sumlo = MM_MIN_EPU16(sumlo, limit);

            _mm_store_si128((__m128i *)(dstp + x), sumlo);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
static inline void
inner_product_gint16_linear_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i = 0;
  __m128i sum[2], t;
  __m128i f = _mm_set_epi64x (0, *((gint64 *) icoeff));
  const gint16 *c[2] = { (gint16 *) ((gint8 *) b + 0 * bstride),
    (gint16 *) ((gint8 *) b + 1 * bstride)
  };

  sum[0] = sum[1] = _mm_setzero_si128 ();
  f = _mm_unpacklo_epi16 (f, sum[0]);

  for (; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i + 0));
    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[0] + i + 0))));
    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[1] + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum[0] = _mm_add_epi32 (sum[0], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[0] + i + 8))));
    sum[1] = _mm_add_epi32 (sum[1], _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (c[1] + i + 8))));
  }
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[1] = _mm_srai_epi32 (sum[1], PRECISION_S16);
  sum[0] =
      _mm_madd_epi16 (sum[0], _mm_shuffle_epi32 (f, _MM_SHUFFLE (0, 0, 0, 0)));
  sum[1] =
      _mm_madd_epi16 (sum[1], _mm_shuffle_epi32 (f, _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], sum[1]);

  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0],
          _MM_SHUFFLE (2, 3, 2, 3)));
  sum[0] =
      _mm_add_epi32 (sum[0], _mm_shuffle_epi32 (sum[0],
          _MM_SHUFFLE (1, 1, 1, 1)));
  sum[0] = _mm_add_epi32 (sum[0], _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum[0] = _mm_srai_epi32 (sum[0], PRECISION_S16);
  sum[0] = _mm_packs_epi32 (sum[0], sum[0]);
  *o = _mm_extract_epi16 (sum[0], 0);
}
mlib_status
__mlib_VectorConvert_S16_S32_Mod(
    mlib_s16 *z,
    const mlib_s32 *x,
    mlib_s32 n)
{
	if (n < 1)
		return (MLIB_FAILURE);

	mlib_s32 i, ax, az, nstep, n1, n2, n3;
	mlib_s32 *px = (mlib_s32 *)x;
	mlib_s16 *pz = (mlib_s16 *)z;
	__m128i zbuf, xlo, xhi;

	ax = (mlib_addr)x & 15;
	az = (mlib_addr)z & 15;
	nstep = 16 / sizeof (mlib_s16);
	n1 = ((16 - ax) & 15) / sizeof (mlib_s32);
	n2 = (n - n1) / nstep;
	n3 = n - n1 - n2 * nstep;

	if (n2 < 1) {
		for (i = 0; i < n; i++) {
			*pz++ = *px++;
		}
	} else {
		for (i = 0; i < n1; i++) {
			*pz++ = *px++;
		}
		for (i = 0; i < n2; i++) {
			xlo = _mm_load_si128((__m128i *)px);
			xhi = _mm_load_si128((__m128i *)px + 1);
			/*
			 * Truncate to the low 16 bits (mod 2^16), then
			 * sign-extend so the saturating pack cannot clip.
			 */
			xlo = _mm_slli_epi32(xlo, 16);
			xhi = _mm_slli_epi32(xhi, 16);
			xlo = _mm_srai_epi32(xlo, 16);
			xhi = _mm_srai_epi32(xhi, 16);
			zbuf = _mm_packs_epi32(xlo, xhi);
			_mm_storeu_si128((__m128i *)pz, zbuf);
			px += nstep;
			pz += nstep;
		}
		for (i = 0; i < n3; i++) {
			*pz++ = *px++;
		}
	}
	return (MLIB_SUCCESS);
}
inline FORCE_INLINE __m128i export_i30_u16(__m128i lo, __m128i hi, uint16_t limit)
{
	const __m128i round = _mm_set1_epi32(1 << 13);

	// Round to nearest, drop the 14 fractional bits, and pack with
	// signed saturation. (The "limit" parameter is unused in this variant.)
	lo = _mm_add_epi32(lo, round);
	hi = _mm_add_epi32(hi, round);

	lo = _mm_srai_epi32(lo, 14);
	hi = _mm_srai_epi32(hi, 14);

	lo = _mm_packs_epi32(lo, hi);

	return lo;
}
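A hypothetical scalar reference for the rounding narrow above (the helper name and harness are assumptions; it mirrors the add-(1<<13), shift-right-14, saturate-on-pack sequence for one lane):

#include <stdint.h>
#include <limits.h>

// Scalar model of export_i30_u16's per-lane math: round a fixed-point
// value with 14 fractional bits to nearest, then saturate into the
// int16 range the way _mm_packs_epi32 would.
static int16_t round_shift_sat_i16(int32_t v)
{
    int32_t r = (v + (1 << 13)) >> 14; // arithmetic shift, like _mm_srai_epi32
    if (r > SHRT_MAX) return SHRT_MAX;
    if (r < SHRT_MIN) return SHRT_MIN;
    return (int16_t)r;
}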
// Unary Ops
SIMDValue SIMDInt32x4Operation::OpAbs(const SIMDValue& value)
{
    SIMDValue result;

    X86SIMDValue x86Result;
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);

    if (AutoSystemInfo::Data.SSE3Available())
    {
        x86Result.m128i_value = _mm_abs_epi32(v.m128i_value); // PABSD; requires SSSE3
        result = X86SIMDValue::ToSIMDValue(x86Result);
    }
    else if (AutoSystemInfo::Data.SSE2Available())
    {
        X86SIMDValue temp, SIGNMASK;
        SIGNMASK.m128i_value = _mm_srai_epi32(v.m128i_value, 31);              // mask = value >> 31
        temp.m128i_value = _mm_xor_si128(v.m128i_value, SIGNMASK.m128i_value); // temp = value ^ mask
        x86Result.m128i_value = _mm_sub_epi32(temp.m128i_value, SIGNMASK.m128i_value); // temp - mask
        result = X86SIMDValue::ToSIMDValue(x86Result);
    }
    else
    {
        result.i32[SIMD_X] = (value.i32[SIMD_X] < 0) ? -1 * value.i32[SIMD_X] : value.i32[SIMD_X];
        result.i32[SIMD_Y] = (value.i32[SIMD_Y] < 0) ? -1 * value.i32[SIMD_Y] : value.i32[SIMD_Y];
        result.i32[SIMD_Z] = (value.i32[SIMD_Z] < 0) ? -1 * value.i32[SIMD_Z] : value.i32[SIMD_Z];
        result.i32[SIMD_W] = (value.i32[SIMD_W] < 0) ? -1 * value.i32[SIMD_W] : value.i32[SIMD_W];
    }

    return result;
}
static inline void
inner_product_gint16_full_1_sse2 (gint16 * o, const gint16 * a,
    const gint16 * b, gint len, const gint16 * icoeff, gint bstride)
{
  gint i;
  __m128i sum, t;

  sum = _mm_setzero_si128 ();

  for (i = 0; i < len; i += 16) {
    t = _mm_loadu_si128 ((__m128i *) (a + i));
    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 0))));

    t = _mm_loadu_si128 ((__m128i *) (a + i + 8));
    sum = _mm_add_epi32 (sum, _mm_madd_epi16 (t,
            _mm_load_si128 ((__m128i *) (b + i + 8))));
  }
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (2, 3, 2, 3)));
  sum = _mm_add_epi32 (sum, _mm_shuffle_epi32 (sum, _MM_SHUFFLE (1, 1, 1, 1)));
  sum = _mm_add_epi32 (sum, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
  sum = _mm_srai_epi32 (sum, PRECISION_S16);
  sum = _mm_packs_epi32 (sum, sum);
  *o = _mm_extract_epi16 (sum, 0);
}
// Doubles a 128-bit block twice (multiply by x^2) in GF(2^128) with the
// reduction polynomial x^128 + x^7 + x^2 + x + 1 (0x87 = 135). The byte
// shuffle converts to/from big-endian lane order; _mm_srai_epi32(..., 31)
// broadcasts each lane's carry-out bit across the lane so it can be
// folded into the neighbouring lane (or reduced) via the AND + shuffle.
__inline__ static void mul4(__m128i in, __m128i *out)
{
    const __m128i shuf = _mm_set_epi8(0, 1, 2, 3, 4, 5, 6, 7,
                                      8, 9, 10, 11, 12, 13, 14, 15);
    const __m128i mask = _mm_set_epi32(135, 1, 1, 1);
    block intmp = _mm_shuffle_epi8(in, shuf);

    block tmp = _mm_srai_epi32(intmp, 31);
    tmp = _mm_and_si128(tmp, mask);
    tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 1, 0, 3));
    *out = _mm_slli_epi32(intmp, 1);
    *out = _mm_xor_si128(*out, tmp);

    tmp = _mm_srai_epi32(*out, 31);
    tmp = _mm_and_si128(tmp, mask);
    tmp = _mm_shuffle_epi32(tmp, _MM_SHUFFLE(2, 1, 0, 3));
    *out = _mm_slli_epi32(*out, 1);
    *out = _mm_xor_si128(*out, tmp);

    *out = _mm_shuffle_epi8(*out, shuf);
}
__m128i test_mm_srai_epi32(__m128i A) {
  // DAG-LABEL: test_mm_srai_epi32
  // DAG: call <4 x i32> @llvm.x86.sse2.psrai.d
  //
  // ASM-LABEL: test_mm_srai_epi32
  // ASM: psrad
  return _mm_srai_epi32(A, 1);
}
void SoundSSE::unpack_8bit_stereo(unsigned char *input, int size, float *output[2])
{
#ifndef CL_DISABLE_SSE2
    int sse_size = (size/16)*16;

    __m128i zero = _mm_setzero_si128();
    __m128 constant1 = _mm_set1_ps(1.0f/128.0f);
    __m128i constant2 = _mm_set1_epi16(128);
    for (int i = 0; i < sse_size; i+=16)
    {
        __m128i isamples = _mm_loadu_si128((__m128i*)(input+i));
        // Widen the unsigned 8-bit samples to 16 bits and re-center around zero.
        __m128i isamples0 = _mm_sub_epi16(_mm_unpacklo_epi8(isamples, zero), constant2);
        __m128i isamples1 = _mm_sub_epi16(_mm_unpackhi_epi8(isamples, zero), constant2);
        // Sign-extend to 32 bits (unpack into the high half, then arithmetic shift).
        __m128 samples0 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples0), 16));
        __m128 samples1 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples0), 16));
        __m128 samples2 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(zero, isamples1), 16));
        __m128 samples3 = _mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpackhi_epi16(zero, isamples1), 16));
        samples0 = _mm_mul_ps(samples0, constant1);
        samples1 = _mm_mul_ps(samples1, constant1);
        samples2 = _mm_mul_ps(samples2, constant1);
        samples3 = _mm_mul_ps(samples3, constant1);

        // De-interleave L/R pairs into the two output channels.
        __m128 tmp0, tmp1, tmp2, tmp3;
        tmp0 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(2,0,2,0));
        tmp1 = _mm_shuffle_ps(samples0, samples1, _MM_SHUFFLE(3,1,3,1));
        tmp2 = _mm_shuffle_ps(samples2, samples3, _MM_SHUFFLE(2,0,2,0));
        tmp3 = _mm_shuffle_ps(samples2, samples3, _MM_SHUFFLE(3,1,3,1));

        _mm_storeu_ps(output[0]+i/2, tmp0);
        _mm_storeu_ps(output[1]+i/2, tmp1);
        _mm_storeu_ps(output[0]+i/2+4, tmp2);
        _mm_storeu_ps(output[1]+i/2+4, tmp3);
    }
#else
    const int sse_size = 0;
#endif

    // unpack remaining
    for (int i = sse_size; i < size; i+=2)
    {
        int value = input[i];
        output[0][i/2] = ((float) (value - 128)) / 128.0f;
        value = input[i+1];
        output[1][i/2] = ((float) (value - 128)) / 128.0f;
    }
}
static WEBP_INLINE __m128i VP8GetRGBA32b(int y, int u, int v) {
  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
  return rgba2;
}
SIMDValue SIMDInt32x4Operation::OpShiftRightArithmetic(const SIMDValue& value, int count)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpValue = X86SIMDValue::ToX86SIMDValue(value);

    // Shifts the 4 signed 32-bit integers right by count bits while shifting in the sign bit
    x86Result.m128i_value = _mm_srai_epi32(tmpValue.m128i_value, count);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
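For contrast, a small hypothetical demo (the harness is an assumption) of how the arithmetic shift used above differs from a logical shift on negative lanes:

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128i v = _mm_set1_epi32(-8); // 0xFFFFFFF8 in each lane
    int a[4], l[4];
    // Arithmetic shift replicates the sign bit; logical shift inserts zeros.
    _mm_storeu_si128((__m128i *)a, _mm_srai_epi32(v, 2)); // -2 per lane
    _mm_storeu_si128((__m128i *)l, _mm_srli_epi32(v, 2)); // 0x3FFFFFFE per lane
    printf("srai: %d  srli: %u\n", a[0], (unsigned)l[0]);
    return 0;
}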
static float Atan(float y, float x) // #include <emmintrin.h>, #include <xmmintrin.h>
{
    const __m128 _ps_atan_p0 = _mm_set1_ps(-0.0464964749f);
    const __m128 _ps_atan_p1 = _mm_set1_ps(0.15931422f);
    const __m128 _ps_atan_p2 = _mm_set1_ps(0.327622764f);
    const __m128 _ps_pi = _mm_set1_ps(pi);
    const __m128 _ps_pi0p5 = _mm_set1_ps(pi0p5);
    const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
    const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

    __m128 mm1, mm2, mm3;
    __m128 axm, aym;

    __m128 xm = _mm_set1_ps(x);
    __m128 ym = _mm_set1_ps(y);

    axm = _mm_and_ps(xm, _mask_sign_inv);
    aym = _mm_and_ps(ym, _mask_sign_inv);

    mm1 = _mm_min_ps(axm, aym);
    mm2 = _mm_max_ps(axm, aym);
    mm1 = _mm_div_ps(mm1, mm2);
    mm2 = _mm_mul_ps(mm1, mm1);

    mm3 = _mm_mul_ps(mm2, _ps_atan_p0);
    mm3 = _mm_add_ps(mm3, _ps_atan_p1);
    mm3 = _mm_mul_ps(mm3, mm2);
    mm3 = _mm_sub_ps(mm3, _ps_atan_p2);
    mm3 = _mm_mul_ps(mm3, mm2);
    mm3 = _mm_mul_ps(mm3, mm1);
    mm3 = _mm_add_ps(mm3, mm1);

    __m128 mask;

    /* |y| > |x| */
    mask = _mm_cmpgt_ss(aym, axm);
    mm2 = _mm_and_ps(_ps_pi0p5, mask);
    mm1 = _mm_and_ps(_mask_sign_raw, mask);
    mm3 = _mm_xor_ps(mm3, mm1);
    mm3 = _mm_add_ps(mm3, mm2);

    /* x < 0 */
    mask = _mm_and_ps(xm, _mask_sign_raw);
    mm3 = _mm_xor_ps(mm3, mask);
    mm1 = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mm3), 30));
    mm1 = _mm_and_ps(_ps_pi, mm1);
    mm3 = _mm_add_ps(mm3, mm1);

    /* y < 0 */
    mm1 = _mm_and_ps(ym, _mask_sign_raw);
    mm3 = _mm_xor_ps(mm3, mm1);

    return _mm_cvtss_f32(mm3);
}
void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint i = 0;
  gint16 *o = op, *a = ap, *ic = icp;
  __m128i ta, tb, tl1, tl2, th1, th2;
  __m128i f[2];
  const gint16 *c[4] = { (gint16 *) ((gint8 *) a + 0 * astride),
    (gint16 *) ((gint8 *) a + 1 * astride),
    (gint16 *) ((gint8 *) a + 2 * astride),
    (gint16 *) ((gint8 *) a + 3 * astride)
  };

  f[0] = _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
  f[1] = _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);

  for (; i < len; i += 8) {
    ta = _mm_load_si128 ((__m128i *) (c[0] + i));
    tb = _mm_load_si128 ((__m128i *) (c[1] + i));

    tl1 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[0]);
    th1 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[0]);

    ta = _mm_load_si128 ((__m128i *) (c[2] + i));
    tb = _mm_load_si128 ((__m128i *) (c[3] + i));

    tl2 = _mm_madd_epi16 (_mm_unpacklo_epi16 (ta, tb), f[1]);
    th2 = _mm_madd_epi16 (_mm_unpackhi_epi16 (ta, tb), f[1]);

    tl1 = _mm_add_epi32 (tl1, tl2);
    th1 = _mm_add_epi32 (th1, th2);

    tl1 = _mm_add_epi32 (tl1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));
    th1 = _mm_add_epi32 (th1, _mm_set1_epi32 (1 << (PRECISION_S16 - 1)));

    tl1 = _mm_srai_epi32 (tl1, PRECISION_S16);
    th1 = _mm_srai_epi32 (th1, PRECISION_S16);

    tl1 = _mm_packs_epi32 (tl1, th1);
    _mm_store_si128 ((__m128i *) (o + i), tl1);
  }
}
// Emulates the MMX PSRADI instruction (packed 32-bit arithmetic right
// shift) by operating on the low 64 bits of an SSE2 register.
__m64 _m_psradi(__m64 _M, int _Count)
{
    __m128i lhs = {0};
    lhs.m128i_i64[0] = _M.m64_i64;
    lhs = _mm_srai_epi32(lhs, _Count);
    _M.m64_i64 = lhs.m128i_i64[0];
    return _M;
}
void conv_Short2ToShort1(void* dst, const void* s, s32 numSamples)
{
    LSshort* d = reinterpret_cast<LSshort*>(dst);
    const LSshort* src = reinterpret_cast<const LSshort*>(s);

    s32 num = numSamples >> 2; // process 8 shorts (4 stereo pairs) at a time
    s32 offset = num << 2;
    s32 rem = numSamples - offset;

    const __m128i izero = _mm_setzero_si128();
    __declspec(align(16)) LSshort tmp[8];

    const LSshort* p = src;
    LSshort* q = d;
    for(s32 i=0; i<num; ++i){
        // Sign-extend to 32-bit integers r0, r1: the compare result
        // supplies the sign bits for the unpack.
        __m128i t0 = _mm_loadu_si128((const __m128i*)p);
        __m128i t1 = _mm_cmpgt_epi16(izero, t0);
        __m128i r0 = _mm_unpackhi_epi16(t0, t1);
        __m128i r1 = _mm_unpacklo_epi16(t0, t1);

        // Average each L/R pair: (L + R) >> 1.
        __m128i r2 = _mm_add_epi32(r0, _mm_shuffle_epi32(r0, _MM_SHUFFLE(2, 3, 0, 1)));
        __m128i r3 = _mm_add_epi32(r1, _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 3, 0, 1)));
        r2 = _mm_srai_epi32(r2, 1);
        r3 = _mm_srai_epi32(r3, 1);

        __m128i r4 = _mm_packs_epi32(r3, r2);
        _mm_store_si128((__m128i*)tmp, r4);
        q[0] = tmp[0];
        q[1] = tmp[2];
        q[2] = tmp[4];
        q[3] = tmp[6];
        p += 8;
        q += 4;
    }

    for(s32 i=0; i<rem; ++i){
        s32 j = i<<1;
        s32 t = (p[j+0] + p[j+1]) >> 1;
        q[i] = static_cast<LSshort>(t);
    }
}
inline __m128i LOAD_QUANTISED(const int32_t *idata,
                              const QuantisationMatrix *qmatrix,
                              const int l, const int s)
{
  __m128i D  = _mm_load_si128((__m128i *)idata);
  __m128i QF = _mm_load_si128((__m128i *)&qmatrix->qfactor[l][s]);
  __m128i QO = _mm_load_si128((__m128i *)&qmatrix->qoffset[l][s]);
  // Dequantise magnitudes: (|D| * qfactor + qoffset) >> 2, then
  // restore the original signs.
  __m128i X  = _mm_abs_epi32(D);
  X = _mm_mullo_epi32(X, QF);
  X = _mm_add_epi32(X, QO);
  X = _mm_srai_epi32(X, 2);
  X = _mm_sign_epi32(X, D);
  return X;
}
//
// This was v_mul_complex16_shift but I changed the name for consistency with v_conj_mul
// and the fact that the old v_mul_complex16 was never called
//
FORCE_INLINE
int __ext_v_mul_complex16(struct complex16* out, int lenout,
                          struct complex16* x, int len1,
                          struct complex16* y, int len2, int shift)
{
    const unum8 wlen = 4; // sizeof(vcs) / sizeof(complex16);
    const __m128i xmm6 = _mm_set1_epi32(0x0000FFFF);
    const __m128i xmm5 = _mm_set1_epi32(0xFFFF0000);
    const __m128i xmm4 = _mm_set1_epi32(0x00010000);

    __m128i* Xs = (__m128i*) x;
    __m128i* Ys = (__m128i*) y;
    __m128i* Outs = (__m128i*) out;
    for (int i = 0; i < len1 / wlen; i++){
        __m128i mx = _mm_loadu_si128(&Xs[i]);
        __m128i my = _mm_loadu_si128(&Ys[i]);

        // Negate the imaginary halves of x (two's complement: XOR + add).
        __m128i ms1 = _mm_xor_si128(mx, xmm5);
        ms1 = _mm_add_epi32(ms1, xmm4);

        // Swap the re/im halves of x.
        __m128i ms2 = _mm_shufflehi_epi16(mx, _MM_SHUFFLE(2, 3, 0, 1));
        ms2 = _mm_shufflelo_epi16(ms2, _MM_SHUFFLE(2, 3, 0, 1));

        // madd yields re = x.re*y.re - x.im*y.im and
        // im = x.re*y.im + x.im*y.re per 32-bit lane, both >> shift.
        __m128i mre = _mm_srai_epi32(_mm_madd_epi16(ms1, my), shift);
        __m128i mim = _mm_srai_epi32(_mm_madd_epi16(ms2, my), shift);

        // Re-pack the two 16-bit halves into interleaved re/im.
        mre = _mm_and_si128(mre, xmm6);
        mim = _mm_and_si128(mim, xmm6);
        mim = _mm_slli_epi32(mim, 0x10);

        _mm_storeu_si128(&Outs[i], _mm_or_si128(mre, mim));
    }

    // Scalar tail for the leftover elements.
    for (int i = (len1 / wlen) * wlen; i < len1; i++){
        out[i].re = (x[i].re * y[i].re - x[i].im * y[i].im) >> shift;
        out[i].im = (x[i].re * y[i].im + x[i].im * y[i].re) >> shift;
    }
    return 0;
}
void f0r_update(f0r_instance_t instance, double time,
                const uint32_t *inframe, uint32_t *outframe)
{
	assert(instance);
	colgate_instance_t *inst = (colgate_instance_t *)instance;
	unsigned len = inst->width * inst->height;
	unsigned char *dst = (unsigned char *)outframe;
	const unsigned char *src = (unsigned char *)inframe;
	unsigned i;

#ifdef __SSE2__
	__m128i zero = _mm_setzero_si128();
	__m128i max = _mm_set1_epi16(REVERSE_LUT_SIZE - 1);
	for (i = 0; i < len; ++i) {
		__m128i l1 = inst->premult_r[*src++];
		__m128i l2 = inst->premult_g[*src++];
		__m128i l3 = inst->premult_b[*src++];
		__m128i result = _mm_add_epi32(l3, _mm_add_epi32(l1, l2));

		// Shift into the right range, and then clamp to [min, max].
		// We convert to 16-bit values since we have min/max instructions
		// there (without needing SSE4), and because it allows us
		// to extract the values with one less SSE shift/move.
		result = _mm_srai_epi32(result, INPUT_PIXEL_BITS + MATRIX_ELEMENT_FRAC_BITS - REVERSE_LUT_BITS);
		result = _mm_packs_epi32(result, result);
		result = _mm_max_epi16(result, zero);
		result = _mm_min_epi16(result, max);

		unsigned new_rg = _mm_cvtsi128_si32(result);
		result = _mm_srli_si128(result, 4);
		unsigned new_b = _mm_cvtsi128_si32(result);

		*dst++ = linear_rgb_to_srgb_lut[new_rg & 0xffff];
		*dst++ = linear_rgb_to_srgb_lut[new_rg >> 16];
		*dst++ = linear_rgb_to_srgb_lut[new_b];
		*dst++ = *src++;  // Copy alpha.
	}
#else
	for (i = 0; i < len; ++i) {
		unsigned old_r = *src++;
		unsigned old_g = *src++;
		unsigned old_b = *src++;
		int new_r = inst->premult_r[old_r][0] + inst->premult_g[old_g][0] + inst->premult_b[old_b][0];
		int new_g = inst->premult_r[old_r][1] + inst->premult_g[old_g][1] + inst->premult_b[old_b][1];
		int new_b = inst->premult_r[old_r][2] + inst->premult_g[old_g][2] + inst->premult_b[old_b][2];
		*dst++ = convert_linear_rgb_to_srgb_fp(new_r);
		*dst++ = convert_linear_rgb_to_srgb_fp(new_g);
		*dst++ = convert_linear_rgb_to_srgb_fp(new_b);
		*dst++ = *src++;  // Copy alpha.
	}
#endif
}