/*
 * Binarize a float plane into an 8-bit mask plane.
 *
 * Every source sample that compares >= th yields 0xFF in the destination,
 * every other sample yields 0x00.
 *
 * srcp / src_stride : source rows; the stride is added to a float pointer,
 *                     so it is counted in floats.
 * dstp / dst_stride : destination rows; the stride is counted in bytes.
 * width, height     : plane size in samples.  Rows are processed in groups
 *                     of 16 samples with aligned loads/stores, so both
 *                     buffers must be 16-byte aligned and each row must have
 *                     room for width rounded up to a multiple of 16.
 * bits              : not used by this routine.
 */
static void GF_FUNC_ALIGN VS_CC float_to_dst_8bit(const float *srcp, uint8_t *dstp, int width, int height, int src_stride, int dst_stride, float th, int bits)
{
    const __m128 threshold = _mm_set1_ps(th);
    int row, col;

    for (row = 0; row < height; row++, srcp += src_stride, dstp += dst_stride) {
        for (col = 0; col < width; col += 16) {
            const float *in = srcp + col;

            /* Each compare lane is all-ones (-1) or all-zeros. */
            const __m128i m0 = _mm_castps_si128(_mm_cmpge_ps(_mm_load_ps(in), threshold));
            const __m128i m1 = _mm_castps_si128(_mm_cmpge_ps(_mm_load_ps(in + 4), threshold));
            const __m128i m2 = _mm_castps_si128(_mm_cmpge_ps(_mm_load_ps(in + 8), threshold));
            const __m128i m3 = _mm_castps_si128(_mm_cmpge_ps(_mm_load_ps(in + 12), threshold));

            /* Signed saturating packs keep -1 as -1, so the masks narrow
             * from 32 -> 16 -> 8 bits without changing value. */
            const __m128i lo16 = _mm_packs_epi32(m0, m1);
            const __m128i hi16 = _mm_packs_epi32(m2, m3);

            _mm_store_si128((__m128i *)(dstp + col), _mm_packs_epi16(lo16, hi16));
        }
    }
}
// Converts eight 32-bit fixed-point values (four in `lo`, four in `hi`) to
// 16-bit by adding the rounding constant 1 << 13, arithmetic-shifting right
// by 14 and packing with signed saturation.  `lo` fills the low four output
// lanes and `hi` the high four.  `limit` is not referenced by this routine.
inline FORCE_INLINE __m128i export_i30_u16(__m128i lo, __m128i hi, uint16_t limit)
{
	const __m128i half = _mm_set1_epi32(1 << 13);

	const __m128i lo_rounded = _mm_srai_epi32(_mm_add_epi32(lo, half), 14);
	const __m128i hi_rounded = _mm_srai_epi32(_mm_add_epi32(hi, half), 14);

	return _mm_packs_epi32(lo_rounded, hi_rounded);
}
void VP8YuvToBgra32(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst) { int n; for (n = 0; n < 32; n += 2) { const __m128i tmp0_1 = GetRGBA32b(y[n + 0], u[n + 0], v[n + 0]); const __m128i tmp0_2 = GetRGBA32b(y[n + 1], u[n + 1], v[n + 1]); const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(3, 0, 1, 2)); const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(3, 0, 1, 2)); const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); _mm_storel_epi64((__m128i*)dst, tmp3); dst += 4 * 2; } }
static void CollectColorBlueTransforms_SSE2(const uint32_t* argb, int stride, int tile_width, int tile_height, int green_to_blue, int red_to_blue, int histo[]) { const __m128i mults_r = _mm_set_epi16( CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0, CST_5b(red_to_blue), 0); const __m128i mults_g = _mm_set_epi16( 0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue), 0, CST_5b(green_to_blue)); const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask const __m128i mask_b = _mm_set1_epi32(0x0000ff); // blue mask int y; for (y = 0; y < tile_height; ++y) { const uint32_t* const src = argb + y * stride; int i, x; for (x = 0; x + SPAN <= tile_width; x += SPAN) { uint16_t values[SPAN]; const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); const __m128i A0 = _mm_slli_epi16(in0, 8); // r 0 | b 0 const __m128i A1 = _mm_slli_epi16(in1, 8); const __m128i B0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0 const __m128i B1 = _mm_and_si128(in1, mask_g); const __m128i C0 = _mm_mulhi_epi16(A0, mults_r); // x db | 0 0 const __m128i C1 = _mm_mulhi_epi16(A1, mults_r); const __m128i D0 = _mm_mulhi_epi16(B0, mults_g); // 0 0 | x db const __m128i D1 = _mm_mulhi_epi16(B1, mults_g); const __m128i E0 = _mm_sub_epi8(in0, D0); // x x | x b' const __m128i E1 = _mm_sub_epi8(in1, D1); const __m128i F0 = _mm_srli_epi32(C0, 16); // 0 0 | x db const __m128i F1 = _mm_srli_epi32(C1, 16); const __m128i G0 = _mm_sub_epi8(E0, F0); // 0 0 | x b' const __m128i G1 = _mm_sub_epi8(E1, F1); const __m128i H0 = _mm_and_si128(G0, mask_b); // 0 0 | 0 b const __m128i H1 = _mm_and_si128(G1, mask_b); const __m128i I = _mm_packs_epi32(H0, H1); // 0 b' | 0 b' _mm_storeu_si128((__m128i*)values, I); for (i = 0; i < SPAN; ++i) ++histo[values[i]]; } } { const int left_over = tile_width & (SPAN - 1); if (left_over > 0) { VP8LCollectColorBlueTransforms_C(argb + tile_width - left_over, stride, left_over, 
tile_height, green_to_blue, red_to_blue, histo); } } }
/* ----------------------------------- * weighted_merge_luma_yuy2 * ----------------------------------- */ static void weighted_merge_luma_yuy2_sse2(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height, int weight, int invweight) { __m128i round_mask = _mm_set1_epi32(0x4000); __m128i mask = _mm_set_epi16(weight, invweight, weight, invweight, weight, invweight, weight, invweight); __m128i luma_mask = _mm_set1_epi16(0x00FF); #pragma warning(push) #pragma warning(disable: 4309) __m128i chroma_mask = _mm_set1_epi16(0xFF00); #pragma warning(pop) int wMod16 = (width/16) * 16; for (int y = 0; y < height; y++) { for (int x = 0; x < wMod16; x += 16) { __m128i px1 = _mm_load_si128(reinterpret_cast<const __m128i*>(src+x)); //V1 Y3 U1 Y2 V0 Y1 U0 Y0 __m128i px2 = _mm_load_si128(reinterpret_cast<const __m128i*>(luma+x)); //v1 y3 u1 y2 v0 y1 u0 y0 __m128i src_lo = _mm_unpacklo_epi16(px1, px2); //v0 y1 V0 Y1 u0 y0 U0 Y0 __m128i src_hi = _mm_unpackhi_epi16(px1, px2); src_lo = _mm_and_si128(src_lo, luma_mask); //00 v0 00 V0 00 u0 00 U0 src_hi = _mm_and_si128(src_hi, luma_mask); src_lo = _mm_madd_epi16(src_lo, mask); src_hi = _mm_madd_epi16(src_hi, mask); src_lo = _mm_add_epi32(src_lo, round_mask); src_hi = _mm_add_epi32(src_hi, round_mask); src_lo = _mm_srli_epi32(src_lo, 15); src_hi = _mm_srli_epi32(src_hi, 15); __m128i result_luma = _mm_packs_epi32(src_lo, src_hi); __m128i result_chroma = _mm_and_si128(px1, chroma_mask); __m128i result = _mm_or_si128(result_chroma, result_luma); _mm_store_si128(reinterpret_cast<__m128i*>(src+x), result); } for (int x = wMod16; x < width; x+=2) { src[x] = (luma[x] * weight + src[x] * invweight + 16384) >> 15; } src += pitch; luma += luma_pitch; } }
// Narrows `size` 32-bit samples from `in` to 16-bit samples in `out`,
// saturating values outside the s16 range.  When _M_SSE is defined, groups
// of eight samples go through a saturating SIMD pack; whatever is left
// (or everything, without SIMD) is handled with clamp_s16().
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size) {
#ifdef _M_SSE
	// Size will always be 16-byte aligned as the hwBlockSize is.
	for (; size >= 8; size -= 8, in += 8, out += 8) {
		const __m128i lo = _mm_loadu_si128((__m128i *)in);
		const __m128i hi = _mm_loadu_si128((__m128i *)(in + 4));
		_mm_storeu_si128((__m128i *)out, _mm_packs_epi32(lo, hi));
	}
#endif
	size_t idx = 0;
	while (idx < size) {
		out[idx] = clamp_s16(in[idx]);
		++idx;
	}
}
/* -----------------------------------
 *      weighted_merge_chroma_yuy2
 * -----------------------------------
 * Blends the odd-indexed bytes of each row of `chroma` into `src` in place:
 *   src[x+1] = (chroma[x+1] * weight + src[x+1] * invweight + 16384) >> 15
 * Even-indexed bytes of `src` are left unchanged.
 * `width` is used as a byte index limit; pitches are in bytes.  The SIMD
 * part uses aligned loads/stores, so each row start must be 16-byte aligned.
 */
static void weighted_merge_chroma_yuy2_sse2(BYTE *src, const BYTE *chroma, int pitch, int chroma_pitch,int width, int height, int weight, int invweight )
{
  const __m128i rounder = _mm_set1_epi32(0x4000);
  // 16-bit lanes alternate invweight (even lane, pairs with src) and
  // weight (odd lane, pairs with chroma) to match the interleave below.
  const __m128i weights = _mm_set_epi16(weight, invweight, weight, invweight, weight, invweight, weight, invweight);
  const __m128i even_bytes = _mm_set1_epi16(0x00FF);

  const int simd_width = (width/16) * 16;

  for (int row = 0; row < height; ++row, src += pitch, chroma += chroma_pitch) {
    int x = 0;

    for (; x < simd_width; x += 16) {
      const __m128i dst_px = _mm_load_si128(reinterpret_cast<const __m128i*>(src+x));
      const __m128i chr_px = _mm_load_si128(reinterpret_cast<const __m128i*>(chroma+x));

      // Interleave 16-bit words of both inputs and move the high (odd)
      // byte of each word down so it can be multiplied as a 16-bit value.
      __m128i lo = _mm_srli_epi16(_mm_unpacklo_epi16(dst_px, chr_px), 8);
      __m128i hi = _mm_srli_epi16(_mm_unpackhi_epi16(dst_px, chr_px), 8);

      // C * invweight + c * weight, rounded and scaled back by 2^15
      lo = _mm_srli_epi32(_mm_add_epi32(_mm_madd_epi16(lo, weights), rounder), 15);
      hi = _mm_srli_epi32(_mm_add_epi32(_mm_madd_epi16(hi, weights), rounder), 15);

      // Put the blended value back into the odd byte position.
      const __m128i blended = _mm_slli_epi16(_mm_packs_epi32(lo, hi), 8);
      const __m128i kept = _mm_and_si128(dst_px, even_bytes);
      _mm_store_si128(reinterpret_cast<__m128i*>(src+x), _mm_or_si128(blended, kept));
    }

    // Scalar tail for the bytes past the last full 16-byte group.
    for (; x < width; x += 2) {
      src[x+1] = (chroma[x+1] * weight + src[x+1] * invweight + 16384) >> 15;
    }
  }
}
/*
 * Reconstructs an 8x8 block: for each of 8 rows,
 *   rec[x] = clip(pred[x] + ((residual[x] + (1 << (DQ_BITS_8 - 1))) >> DQ_BITS_8))
 * for x = ioff .. ioff + 7.
 *
 * m7     : rows of 32-bit residuals
 * mb_rec : rows of reconstructed output samples
 * mpr    : rows of prediction samples
 * ioff   : horizontal offset applied to every row
 *
 * The clip is done by the unsigned saturating 16 -> 8 bit pack, i.e. to
 * [0, 255]; max_imgpel_value is not consulted.
 * NOTE(review): this relies on imgpel being an 8-bit type and on
 * max_imgpel_value == 255 -- confirm against the build configuration.
 */
static void recon8x8(int **m7, imgpel **mb_rec, imgpel **mpr, int max_imgpel_value, int ioff)
{
  const __m128i mm_dq2 = _mm_set1_epi16((1<<(DQ_BITS_8-1)));
  const __m128i mm0 = _mm_setzero_si128();
  int j;

  (void) max_imgpel_value;

  for (j = 0; j < 8; j++)
  {
    const int    *m_tr  = (*m7++) + ioff;
    imgpel       *m_rec = (*mb_rec++) + ioff;
    const imgpel *m_prd = (*mpr++) + ioff;
    __m128i resid, pred, sum;

    /* Eight 32-bit residuals narrowed to 16 bits with signed saturation. */
    resid = _mm_packs_epi32(_mm_loadu_si128((const __m128i*) m_tr),
                            _mm_loadu_si128((const __m128i*) (m_tr + 4)));

    /* Only eight prediction bytes are needed, so load exactly 64 bits;
     * a full 128-bit load would read 8 bytes past the end of the row. */
    pred = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*) m_prd), mm0);

    /* Rounded right shift of the residual, then add the prediction. */
    sum = _mm_srai_epi16(_mm_add_epi16(resid, mm_dq2), DQ_BITS_8);
    sum = _mm_add_epi16(sum, pred);

    /* Saturate to [0, 255] and write the eight reconstructed samples. */
    _mm_storel_epi64((__m128i*) m_rec, _mm_packus_epi16(sum, sum));
  }
}
/*
 * Converts `samples` floats to signed 16-bit: each value is multiplied by
 * 0x7fff, converted to a 32-bit integer with _mm_cvtps_epi32 and narrowed
 * with signed saturation.  Full groups of eight samples are converted here;
 * the remaining samples are handed to audio_convert_float_to_s16_C().
 */
void audio_convert_float_to_s16_SSE2(int16_t *out, const float *in, size_t samples)
{
   const __m128 scale = _mm_set1_ps((float)0x7fff);
   size_t done = 0;

   while (done + 8 <= samples)
   {
      const __m128i lo = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(in + 0), scale));
      const __m128i hi = _mm_cvtps_epi32(_mm_mul_ps(_mm_loadu_ps(in + 4), scale));

      _mm_storeu_si128((__m128i *)out, _mm_packs_epi32(lo, hi));

      done += 8;
      in   += 8;
      out  += 8;
   }

   audio_convert_float_to_s16_C(out, in, samples - done);
}
// const __m128i mask_lsb = _mm_set1_epi16 (0x00FF); // const __m128i sign_bit = _mm_set1_epi16 (-0x8000); // const __m128 offset = _mm_set1_ps (-32768); void ProxyRwSse2 <SplFmt_STACK16>::write_flt (const Ptr::Type &ptr, const __m128 &src0, const __m128 &src1, const __m128i &mask_lsb, const __m128i &sign_bit, const __m128 &offset) { __m128 val_03_f = _mm_add_ps (src0, offset); __m128 val_47_f = _mm_add_ps (src1, offset); const __m128i val_03 = _mm_cvtps_epi32 (val_03_f); const __m128i val_47 = _mm_cvtps_epi32 (val_47_f); __m128i val = _mm_packs_epi32 (val_03, val_47); val = _mm_xor_si128 (val, sign_bit); fstb::ToolsSse2::store_8_16ml ( ptr._msb_ptr, ptr._lsb_ptr, val, mask_lsb ); }
/*
 * 16-QAM soft demodulation to 16-bit LLRs.  For every input symbol four
 * shorts are written:
 *   llr[4*i+0] = -re,  llr[4*i+1] = -im,
 *   llr[4*i+2] = |re| - 2*SCALE/sqrt(10),  llr[4*i+3] = |im| - 2*SCALE/sqrt(10)
 * with re/im scaled by SCALE_SHORT_CONV_QAM16.
 *
 * Four symbols are processed per SIMD step using aligned loads and stores,
 * so `symbols` and `llr` must be 16-byte aligned; the last nsymbols % 4
 * symbols use the scalar formula.  The SIMD path converts float to integer
 * with _mm_cvtps_epi32 while the scalar path uses a C cast.
 */
void demod_16qam_lte_s_sse(const cf_t *symbols, short *llr, int nsymbols) {
  float *in = (float*) symbols;
  __m128i *out = (__m128i*) llr;

  const __m128i offset = _mm_set1_epi16(2*SCALE_SHORT_CONV_QAM16/sqrt(10));
  /* Negative scale: the first two LLRs of every symbol are the negated I/Q. */
  const __m128 scale_v = _mm_set1_ps(-SCALE_SHORT_CONV_QAM16);

  /* Byte shuffles that spread {re, im} pairs into the 4-short output layout:
   * the "negated" masks fill shorts 0-1 of each symbol, the "abs" masks
   * fill shorts 2-3.  The _1 masks take symbols 0-1, the _2 masks 2-3. */
  const __m128i shuffle_negated_1 = _mm_set_epi8(0xff,0xff,0xff,0xff,7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0);
  const __m128i shuffle_abs_1     = _mm_set_epi8(7,6,5,4,0xff,0xff,0xff,0xff,3,2,1,0,0xff,0xff,0xff,0xff);
  const __m128i shuffle_negated_2 = _mm_set_epi8(0xff,0xff,0xff,0xff,15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8);
  const __m128i shuffle_abs_2     = _mm_set_epi8(15,14,13,12,0xff,0xff,0xff,0xff,11,10,9,8,0xff,0xff,0xff,0xff);

  const int nblocks = nsymbols/4;

  for (int blk = 0; blk < nblocks; blk++, in += 8, out += 2) {
    const __m128i scaled_lo = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(in), scale_v));
    const __m128i scaled_hi = _mm_cvtps_epi32(_mm_mul_ps(_mm_load_ps(in + 4), scale_v));

    const __m128i negated   = _mm_packs_epi32(scaled_lo, scaled_hi);
    const __m128i magnitude = _mm_sub_epi16(_mm_abs_epi16(negated), offset);

    _mm_store_si128(out,
                    _mm_or_si128(_mm_shuffle_epi8(negated, shuffle_negated_1),
                                 _mm_shuffle_epi8(magnitude, shuffle_abs_1)));
    _mm_store_si128(out + 1,
                    _mm_or_si128(_mm_shuffle_epi8(negated, shuffle_negated_2),
                                 _mm_shuffle_epi8(magnitude, shuffle_abs_2)));
  }

  /* Demodulate the symbols that did not fill a group of four. */
  for (int i = 4*nblocks; i < nsymbols; i++) {
    short yre = (short) (SCALE_SHORT_CONV_QAM16*crealf(symbols[i]));
    short yim = (short) (SCALE_SHORT_CONV_QAM16*cimagf(symbols[i]));

    llr[4*i+0] = -yre;
    llr[4*i+1] = -yim;
    llr[4*i+2] = abs(yre)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
    llr[4*i+3] = abs(yim)-2*SCALE_SHORT_CONV_QAM16/sqrt(10);
  }
}
// Copies the first byte of every 4-byte pixel of 'argb' into 'alpha' and
// reports whether all copied values were 0xff.
//
// argb / argb_stride   : source rows, stride in bytes.
// alpha / alpha_stride : destination rows, stride in bytes.
// Returns 1 if every extracted value equals 0xff, 0 otherwise.
//
// Eight pixels are handled per SIMD step. The SIMD loop stops before the
// last pixel of each row (limit is derived from width - 1) so that the
// 32-byte loads never run past the end of a row; the rest is done per pixel.
static int ExtractAlpha(const uint8_t* argb, int argb_stride,
                        int width, int height,
                        uint8_t* alpha, int alpha_stride) {
  const __m128i first_byte = _mm_set1_epi32(0xffu);
  // Only the low 8 bytes carry results, so the reference pattern (and the
  // initial accumulator) is 0xff there and 0 in the upper half.
  const __m128i ones_lo64 = _mm_set_epi32(0, 0, ~0u, ~0u);
  __m128i simd_and = ones_lo64;
  uint32_t scalar_and = 0xff;
  const int simd_end = (width - 1) & ~7;
  int row;

  for (row = 0; row < height; ++row) {
    const __m128i* in = (const __m128i*)argb;
    int col = 0;

    while (col < simd_end) {
      // 32 source bytes -> eight 32-bit lanes holding one byte each
      const __m128i px_lo = _mm_and_si128(_mm_loadu_si128(in + 0), first_byte);
      const __m128i px_hi = _mm_and_si128(_mm_loadu_si128(in + 1), first_byte);
      // narrow 32 -> 16 -> 8 bits; values are <= 255 so nothing saturates
      const __m128i words = _mm_packs_epi32(px_lo, px_hi);
      const __m128i bytes = _mm_packus_epi16(words, words);
      _mm_storel_epi64((__m128i*)&alpha[col], bytes);
      // eight running 'and' accumulators, one per byte lane
      simd_and = _mm_and_si128(simd_and, bytes);
      in += 2;
      col += 8;
    }

    for (; col < width; ++col) {
      const uint32_t value = argb[4 * col];
      alpha[col] = value;
      scalar_and &= value;
    }

    argb += argb_stride;
    alpha += alpha_stride;
  }

  // Fold the eight SIMD accumulators into the scalar one: a lane that is no
  // longer 0xff clears the corresponding bit of the comparison mask.
  scalar_and &= _mm_movemask_epi8(_mm_cmpeq_epi8(simd_and, ones_lo64));
  return (scalar_and == 0xff);
}
void Convert444toNV12(LPBYTE input, int width, int inPitch, int outPitch, int height, int startY, int endY, LPBYTE *output) { LPBYTE lumPlane = output[0]; LPBYTE uvPlane = output[1]; __m128i lumMask = _mm_set1_epi32(0x0000FF00); __m128i uvMask = _mm_set1_epi16(0x00FF); for(int y=startY; y<endY; y+=2) { int yPos = y*inPitch; int uvYPos = (y>>1)*outPitch; int lumYPos = y*outPitch; for(int x=0; x<width; x+=4) { LPBYTE lpImagePos = input+yPos+(x*4); int uvPos = uvYPos + x; int lumPos0 = lumYPos + x; int lumPos1 = lumPos0 + outPitch; __m128i line1 = _mm_load_si128((__m128i*)lpImagePos); __m128i line2 = _mm_load_si128((__m128i*)(lpImagePos+inPitch)); //pack lum vals { __m128i packVal = _mm_packs_epi32(_mm_srli_si128(_mm_and_si128(line1, lumMask), 1), _mm_srli_si128(_mm_and_si128(line2, lumMask), 1)); packVal = _mm_packus_epi16(packVal, packVal); *(LPUINT)(lumPlane+lumPos0) = packVal.m128i_u32[0]; *(LPUINT)(lumPlane+lumPos1) = packVal.m128i_u32[1]; } //do average, pack UV vals { __m128i addVal = _mm_add_epi64(_mm_and_si128(line1, uvMask), _mm_and_si128(line2, uvMask)); __m128i avgVal = _mm_srai_epi16(_mm_add_epi64(addVal, _mm_shuffle_epi32(addVal, _MM_SHUFFLE(2, 3, 0, 1))), 2); avgVal = _mm_shuffle_epi32(avgVal, _MM_SHUFFLE(3, 1, 2, 0)); *(LPUINT)(uvPlane+uvPos) = _mm_packus_epi16(avgVal, avgVal).m128i_u32[0]; } } } }
/*
 * Four-tap interpolation of 16-bit samples across four input rows:
 *   o[i] = sat16 ((c0[i]*ic[0] + c1[i]*ic[1] + c2[i]*ic[2] + c3[i]*ic[3]
 *                  + (1 << (PRECISION_S16 - 1))) >> PRECISION_S16)
 * where row k starts astride * k bytes after ap.
 *
 * Eight samples are produced per iteration with aligned loads/stores, so
 * all rows and the output must be 16-byte aligned and len is effectively
 * rounded up to a multiple of 8.
 */
void
interpolate_gint16_cubic_sse2 (gpointer op, const gpointer ap,
    gint len, const gpointer icp, gint astride)
{
  gint16 *o = op, *a = ap, *ic = icp;
  const gint16 *row0 = (gint16 *) ((gint8 *) a + 0 * astride);
  const gint16 *row1 = (gint16 *) ((gint8 *) a + 1 * astride);
  const gint16 *row2 = (gint16 *) ((gint8 *) a + 2 * astride);
  const gint16 *row3 = (gint16 *) ((gint8 *) a + 3 * astride);
  /* Coefficient pairs laid out to match the interleaved (row0,row1) and
   * (row2,row3) samples fed to _mm_madd_epi16. */
  const __m128i coef01 =
      _mm_set_epi16 (ic[1], ic[0], ic[1], ic[0], ic[1], ic[0], ic[1], ic[0]);
  const __m128i coef23 =
      _mm_set_epi16 (ic[3], ic[2], ic[3], ic[2], ic[3], ic[2], ic[3], ic[2]);
  const __m128i rounding = _mm_set1_epi32 (1 << (PRECISION_S16 - 1));
  gint i;

  for (i = 0; i < len; i += 8) {
    const __m128i s0 = _mm_load_si128 ((__m128i *) (row0 + i));
    const __m128i s1 = _mm_load_si128 ((__m128i *) (row1 + i));
    const __m128i s2 = _mm_load_si128 ((__m128i *) (row2 + i));
    const __m128i s3 = _mm_load_si128 ((__m128i *) (row3 + i));
    __m128i acc_lo, acc_hi;

    /* taps 0 and 1 */
    acc_lo = _mm_madd_epi16 (_mm_unpacklo_epi16 (s0, s1), coef01);
    acc_hi = _mm_madd_epi16 (_mm_unpackhi_epi16 (s0, s1), coef01);
    /* taps 2 and 3 */
    acc_lo = _mm_add_epi32 (acc_lo,
        _mm_madd_epi16 (_mm_unpacklo_epi16 (s2, s3), coef23));
    acc_hi = _mm_add_epi32 (acc_hi,
        _mm_madd_epi16 (_mm_unpackhi_epi16 (s2, s3), coef23));

    /* round, scale back and narrow with signed saturation */
    acc_lo = _mm_srai_epi32 (_mm_add_epi32 (acc_lo, rounding), PRECISION_S16);
    acc_hi = _mm_srai_epi32 (_mm_add_epi32 (acc_hi, rounding), PRECISION_S16);

    _mm_store_si128 ((__m128i *) (o + i), _mm_packs_epi32 (acc_lo, acc_hi));
  }
}
// Bilinear sample of `img` at (x, y): blends the 2x2 pixel neighbourhood
// whose top-left corner is ((int)x, (int)y) using the four weights returned
// by CalcWeights(x, y), in 8.8 fixed point.
// Each 64-bit load is expected to cover two horizontally adjacent pixels
// (presumably Pixel is a 32-bit RGBA value -- confirm against its typedef).
// The caller must ensure the 2x2 neighbourhood lies inside the image.
inline Pixel GetPixelSSE(const Image* img, float x, float y)
{
	const int stride = img->width;
	const Pixel* corner = img->data + (int)x + (int)y * stride;
	const __m128i zero = _mm_setzero_si128();

	// top row (pixels 1, 2) and bottom row (pixels 3, 4), widened to 16 bit
	const __m128i top = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)&corner[0 * stride]), zero);
	const __m128i bottom = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)&corner[1 * stride]), zero);

	// weights scaled by CONST_256, converted to integers and narrowed to
	// 16 bit: w1 w2 w3 w4 in the low four lanes
	const __m128 scaled = _mm_mul_ps(CalcWeights(x, y), CONST_256);
	const __m128i w16 = _mm_packs_epi32(_mm_cvtps_epi32(scaled), zero);

	// broadcast each weight over the four channels of its pixel
	__m128i w_top = _mm_shufflelo_epi16(w16, _MM_SHUFFLE(1, 1, 0, 0));
	__m128i w_bottom = _mm_shufflelo_epi16(w16, _MM_SHUFFLE(3, 3, 2, 2));
	w_top = _mm_unpacklo_epi16(w_top, w_top);          // w2 w2 w2 w2 w1 w1 w1 w1
	w_bottom = _mm_unpacklo_epi16(w_bottom, w_bottom); // w4 w4 w4 w4 w3 w3 w3 w3

	// weight every channel, then add the four pixels together
	const __m128i rows = _mm_add_epi16(_mm_mullo_epi16(top, w_top),
	                                   _mm_mullo_epi16(bottom, w_bottom));
	const __m128i total = _mm_add_epi16(rows, _mm_shuffle_epi32(rows, _MM_SHUFFLE(3, 2, 3, 2)));

	// divide by 256 and narrow back to 8 bit per channel
	const __m128i result = _mm_packus_epi16(_mm_srli_epi16(total, 8), zero);

	return _mm_cvtsi128_si32(result);
}
void conv_Short2ToShort1(void* dst, const void* s, s32 numSamples) { LSshort* d = reinterpret_cast<LSshort*>(dst); const LSshort* src = reinterpret_cast<const LSshort*>(s); s32 num = numSamples >> 2; //8個のshortをまとめて処理 s32 offset = num << 2; s32 rem = numSamples - offset; const __m128i izero = _mm_setzero_si128(); __declspec(align(16)) LSshort tmp[8]; const LSshort* p = src; LSshort* q = d; for(s32 i=0; i<num; ++i){ //32bit整数r0, r1に変換 __m128i t0 = _mm_loadu_si128((const __m128i*)p); __m128i t1 = _mm_cmpgt_epi16(izero, t0); __m128i r0 = _mm_unpackhi_epi16(t0, t1); __m128i r1 = _mm_unpacklo_epi16(t0, t1); __m128i r2 = _mm_add_epi32(r0, _mm_shuffle_epi32(r0, _MM_SHUFFLE(2, 3, 0, 1))); __m128i r3 = _mm_add_epi32(r1, _mm_shuffle_epi32(r1, _MM_SHUFFLE(2, 3, 0, 1))); r2 = _mm_srai_epi32(r2, 1); r3 = _mm_srai_epi32(r3, 1); __m128i r4 = _mm_packs_epi32(r3, r2); _mm_store_si128((__m128i*)tmp, r4); q[0] = tmp[0]; q[1] = tmp[2]; q[2] = tmp[4]; q[3] = tmp[6]; p += 8; q += 4; } for(s32 i=0; i<rem; ++i){ s32 j = i<<1; s32 t = (p[j+0] + p[j+1]) >> 1; q[i] = static_cast<LSshort>(t); } }
static void CollectColorRedTransforms(const uint32_t* argb, int stride, int tile_width, int tile_height, int green_to_red, int histo[]) { const __m128i mults_g = _mm_set_epi16( 0, CST_5b(green_to_red), 0, CST_5b(green_to_red), 0, CST_5b(green_to_red), 0, CST_5b(green_to_red)); const __m128i mask_g = _mm_set1_epi32(0x00ff00); // green mask const __m128i mask = _mm_set1_epi32(0xff); int y; for (y = 0; y < tile_height; ++y) { const uint32_t* const src = argb + y * stride; int i, x; for (x = 0; x + SPAN <= tile_width; x += SPAN) { uint16_t values[SPAN]; const __m128i in0 = _mm_loadu_si128((__m128i*)&src[x + 0]); const __m128i in1 = _mm_loadu_si128((__m128i*)&src[x + SPAN / 2]); const __m128i A0 = _mm_and_si128(in0, mask_g); // 0 0 | g 0 const __m128i A1 = _mm_and_si128(in1, mask_g); const __m128i B0 = _mm_srli_epi32(in0, 16); // 0 0 | x r const __m128i B1 = _mm_srli_epi32(in1, 16); const __m128i C0 = _mm_mulhi_epi16(A0, mults_g); // 0 0 | x dr const __m128i C1 = _mm_mulhi_epi16(A1, mults_g); const __m128i E0 = _mm_sub_epi8(B0, C0); // x x | x r' const __m128i E1 = _mm_sub_epi8(B1, C1); const __m128i F0 = _mm_and_si128(E0, mask); // 0 0 | 0 r' const __m128i F1 = _mm_and_si128(E1, mask); const __m128i I = _mm_packs_epi32(F0, F1); _mm_storeu_si128((__m128i*)values, I); for (i = 0; i < SPAN; ++i) ++histo[values[i]]; } } { const int left_over = tile_width & (SPAN - 1); if (left_over > 0) { VP8LCollectColorRedTransforms_C(argb + tile_width - left_over, stride, left_over, tile_height, green_to_red, histo); } } }
// Bilinear sample of `img` at (x, y): blends the 2x2 pixel neighbourhood
// whose top-left corner is ((int)x, (int)y) using the four weights returned
// by CalcWeights(x, y), in 8.8 fixed point.  The pixels are first transposed
// to channel-major order so one multiply-add handles two channels.
// Uses _mm_hadd_epi32 (SSSE3) and _mm_packus_epi32 (SSE4.1).
// Each 64-bit load is expected to cover two horizontally adjacent pixels
// (presumably Pixel is a 32-bit RGBA value -- confirm against its typedef).
inline Pixel GetPixelSSE3(const Image<Pixel>* img, float x, float y)
{
	const int stride = img->width;
	const Pixel* corner = img->data + (int)x + (int)y * stride;
	const __m128i zero = _mm_setzero_si128();

	// top row holds pixels 1 and 2, bottom row pixels 3 and 4
	const __m128i top = _mm_loadl_epi64((const __m128i*)&corner[0 * stride]);
	const __m128i bottom = _mm_loadl_epi64((const __m128i*)&corner[1 * stride]);

	// convert RGBA RGBA RGBA RGBA to RRRR GGGG BBBB AAAA (AoS to SoA)
	const __m128i mixed13_24 = _mm_unpacklo_epi8(top, bottom);
	const __m128i mixed24 = _mm_unpackhi_epi64(mixed13_24, zero);
	const __m128i planar = _mm_unpacklo_epi8(mixed13_24, mixed24);

	// widen the channels to 16 bit
	const __m128i chanRG = _mm_unpacklo_epi8(planar, zero);
	const __m128i chanBA = _mm_unpackhi_epi8(planar, zero);

	// weights scaled by CONST_256, as 16-bit integers: w1 w2 w3 w4 w1 w2 w3 w4
	const __m128i w32 = _mm_cvtps_epi32(_mm_mul_ps(CalcWeights(x, y), CONST_256));
	const __m128i w16 = _mm_packs_epi32(w32, w32);

	// sumRG = [w1*R1 + w2*R2 | w3*R3 + w4*R4 | w1*G1 + w2*G2 | w3*G3 + w4*G4]
	const __m128i sumRG = _mm_madd_epi16(chanRG, w16);
	// sumBA = [w1*B1 + w2*B2 | w3*B3 + w4*B4 | w1*A1 + w2*A2 | w3*A3 + w4*A4]
	const __m128i sumBA = _mm_madd_epi16(chanBA, w16);

	// the horizontal add finishes each channel's sum (32 bit), then /256
	__m128i out = _mm_srli_epi32(_mm_hadd_epi32(sumRG, sumBA), 8);

	// narrow 32 -> 16 -> 8 bit with unsigned saturation
	out = _mm_packus_epi32(out, zero);
	out = _mm_packus_epi16(out, zero);

	return _mm_cvtsi128_si32(out);
}
/*
 * Convert a float plane to an unsigned 16-bit plane, clamping every sample
 * to [0, (1 << bits) - 1].
 *
 * srcp / src_stride : source rows; the stride is counted in floats.
 * d / dst_stride    : destination rows; the stride is given in bytes and
 *                     halved here to index uint16_t samples.
 * width, height     : plane size in samples.  Rows are processed in groups
 *                     of 8 samples with aligned loads/stores, so both
 *                     buffers must be 16-byte aligned and each row must have
 *                     room for width rounded up to a multiple of 8.
 * th                : not used by this routine.
 *
 * SSE2 only offers a *signed* saturating 32 -> 16 bit pack, which would
 * turn every value above 32767 into 0x7FFF and let negative values wrap to
 * large unsigned numbers.  The unsigned pack is therefore emulated: clamp
 * to [0, max], subtract 0x8000 so the range fits int16, pack, and flip the
 * top bit of every 16-bit lane to add the 0x8000 back.
 */
static void GF_FUNC_ALIGN VS_CC float_to_dst_gb_16bit(const float *srcp, uint8_t *d, int width, int height, int src_stride, int dst_stride, float th, int bits)
{
    uint16_t *dstp = (uint16_t *)d;
    const __m128i tmax = _mm_set1_epi32((1 << bits) - 1);
    const __m128i bias32 = _mm_set1_epi32(0x8000);
    const __m128i bias16 = _mm_set1_epi16((short)0x8000);

    dst_stride /= 2;

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 8) {
            __m128i lo = _mm_cvtps_epi32(_mm_load_ps(srcp + x));
            __m128i hi = _mm_cvtps_epi32(_mm_load_ps(srcp + x + 4));

            /* Upper clamp. */
            lo = mm_min_epi32(tmax, lo);
            hi = mm_min_epi32(tmax, hi);

            /* Lower clamp: the arithmetic shift yields an all-ones mask for
             * negative lanes, which andnot then clears to zero. */
            lo = _mm_andnot_si128(_mm_srai_epi32(lo, 31), lo);
            hi = _mm_andnot_si128(_mm_srai_epi32(hi, 31), hi);

            /* Bias into the int16 range, pack, and undo the bias. */
            lo = _mm_sub_epi32(lo, bias32);
            hi = _mm_sub_epi32(hi, bias32);
            lo = _mm_xor_si128(_mm_packs_epi32(lo, hi), bias16);

            _mm_store_si128((__m128i *)(dstp + x), lo);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
void conv_Float1ToShort2(void* dst, const void* s, s32 numSamples) { LSshort* d = reinterpret_cast<LSshort*>(dst); const LSfloat* src = reinterpret_cast<const LSfloat*>(s); s32 num = numSamples >> 2; //4個のfloatをまとめて処理 s32 offset = num << 2; s32 rem = numSamples - offset; const __m128 fcoff = _mm_set1_ps(32768.0f); __declspec(align(16)) LSshort tmp[8]; const LSfloat* p = src; LSshort* q = d; for(s32 i=0; i<num; ++i){ __m128 f32_0 = _mm_mul_ps(_mm_loadu_ps(p), fcoff); __m128i s32_0 = _mm_cvtps_epi32(f32_0); __m128i s16_0 = _mm_packs_epi32(s32_0, s32_0); _mm_store_si128((__m128i*)tmp, s16_0); q[0] = tmp[0]; q[1] = tmp[0]; q[2] = tmp[1]; q[3] = tmp[1]; q[4] = tmp[2]; q[5] = tmp[2]; q[6] = tmp[3]; q[7] = tmp[3]; p += 4; q += 8; } for(s32 i=0; i<rem; ++i){ s32 j=i<<1; q[j+0] = q[j+1] = toShort(p[i]); } }
inline void ClampBufferToS16(s16 *out, const s32 *in, size_t size, s8 volShift) { #ifdef _M_SSE // Size will always be 16-byte aligned as the hwBlockSize is. while (size >= 8) { __m128i in1 = _mm_loadu_si128((__m128i *)in); __m128i in2 = _mm_loadu_si128((__m128i *)(in + 4)); __m128i packed = _mm_packs_epi32(in1, in2); if (useShift) { packed = _mm_srai_epi16(packed, volShift); } _mm_storeu_si128((__m128i *)out, packed); out += 8; in += 8; size -= 8; } #elif PPSSPP_ARCH(ARM_NEON) int16x4_t signedVolShift = vdup_n_s16 (-volShift); // Can only dynamic-shift right, but by a signed integer while (size >= 8) { int32x4_t in1 = vld1q_s32(in); int32x4_t in2 = vld1q_s32(in + 4); int16x4_t packed1 = vqmovn_s32(in1); int16x4_t packed2 = vqmovn_s32(in2); if (useShift) { packed1 = vshl_s16(packed1, signedVolShift); packed2 = vshl_s16(packed2, signedVolShift); } vst1_s16(out, packed1); vst1_s16(out + 4, packed2); out += 8; in += 8; size -= 8; } #endif // This does the remainder if SIMD was used, otherwise it does it all. for (size_t i = 0; i < size; i++) { out[i] = clamp_s16(useShift ? (in[i] >> volShift) : in[i]); } }
static void YuvToArgbRowSSE2(const uint8_t* y, const uint8_t* u, const uint8_t* v, uint8_t* dst, int len) { int n; for (n = 0; n + 2 <= len; n += 2) { const __m128i uv_0 = LoadUVPart(u[0], v[0]); const __m128i tmp0_1 = GetRGBA32bWithUV(y[0], uv_0); const __m128i tmp0_2 = GetRGBA32bWithUV(y[1], uv_0); const __m128i tmp1_1 = _mm_shuffle_epi32(tmp0_1, _MM_SHUFFLE(2, 1, 0, 3)); const __m128i tmp1_2 = _mm_shuffle_epi32(tmp0_2, _MM_SHUFFLE(2, 1, 0, 3)); const __m128i tmp2_1 = _mm_packs_epi32(tmp1_1, tmp1_2); const __m128i tmp3 = _mm_packus_epi16(tmp2_1, tmp2_1); _mm_storel_epi64((__m128i*)dst, tmp3); dst += 4 * 2; y += 2; ++u; ++v; } // Finish off if (len & 1) { VP8YuvToArgb(y[0], u[0], v[0], dst); } }
/* Modified from volk_32f_s32f_convert_16i_a_simd2.
 *
 * Converts len floats to 16-bit integers: z[i] = (int16_t)(x[i] * scale).
 *
 * When LV_HAVE_SSE is defined, full groups of eight samples are converted
 * with SSE2 (aligned loads/stores, so x and z must be 16-byte aligned);
 * that path rounds with _mm_cvtps_epi32 and saturates to the int16 range in
 * _mm_packs_epi32.  The remaining samples -- or all of them when
 * LV_HAVE_SSE is not defined -- use a plain C cast, which truncates and
 * does not clip.
 */
void srslte_vec_convert_fi_simd(float *x, int16_t *z, float scale, uint32_t len) {
  uint32_t number = 0;

#ifdef LV_HAVE_SSE
  const uint32_t eighthPoints = len / 8;
  const float *inputVectorPtr = (const float*)x;
  int16_t *outputVectorPtr = z;
  const __m128 vScalar = _mm_set_ps1(scale);

  for (; number < eighthPoints; number++) {
    const __m128 scaled1 = _mm_mul_ps(_mm_load_ps(inputVectorPtr), vScalar);
    const __m128 scaled2 = _mm_mul_ps(_mm_load_ps(inputVectorPtr + 4), vScalar);

    const __m128i packed = _mm_packs_epi32(_mm_cvtps_epi32(scaled1),
                                           _mm_cvtps_epi32(scaled2));

    _mm_store_si128((__m128i*)outputVectorPtr, packed);

    inputVectorPtr += 8;
    outputVectorPtr += 8;
  }

  /* Continue after the last sample handled by the vector loop. */
  number = eighthPoints * 8;
#endif

  /* Scalar tail; also the complete fallback when SSE is unavailable, so
   * the output is never left unwritten. */
  for (; number < len; number++) {
    z[number] = (int16_t) (x[number] * scale);
  }
}
void conv_Float2ToShort1(void* dst, const void* s, s32 numSamples) { LSshort* d = reinterpret_cast<LSshort*>(dst); const LSfloat* src = reinterpret_cast<const LSfloat*>(s); s32 num = numSamples >> 2; //4個のfloatをまとめて処理 s32 offset = num << 2; s32 rem = numSamples - offset; const __m128 fcoff = _mm_set1_ps(32768.0f*0.5f); //half __declspec(align(16)) LSshort tmp[8]; const LSfloat* p = src; LSshort* q = d; for(s32 i=0; i<num; ++i){ __m128 f32_0 = _mm_loadu_ps(p); __m128 f32_1 = _mm_add_ps(f32_0, _mm_shuffle_ps(f32_0, f32_0, _MM_SHUFFLE(2, 3, 0, 1))); __m128 f32_2 = _mm_mul_ps(f32_1, fcoff); __m128i s32_0 = _mm_cvtps_epi32(f32_2); __m128i s16_0 = _mm_packs_epi32(s32_0, s32_0); _mm_store_si128((__m128i*)tmp, s16_0); q[0] = tmp[0]; q[1] = tmp[2]; p += 4; q += 2; } for(s32 i=0; i<rem; ++i){ s32 j = i<<1; f32 v = 0.5f*(src[j+0] + src[j+1]); q[i] = toShort(v); } }
/**
 * SSE Implementation of \c cnsFormula (subroutine of cnsResponse).
 * \c scale, \c gaussI2 and \c regVar are 32bit floats (gaussI2 split into A and B
 * for the low and high four lanes).
 * \c sobelX, \c sobelY, \c gaussI are 16bit values, eight per register.
 * \c result is a packed vector of 8bit numbers with the x and y component
 * alternating and \c offset (unsigned char) added.
 *
 * Per lane: factor = scale / sqrt(gaussI2 - gaussI^2 + regVar), computed with the
 * approximate reciprocal square root, and the response is the high 16 bits of
 * (sobel << 5) * factor, saturated to signed 8bit before the offset is added.
 */
ALWAYSINLINE
static void cnsFormula(__m128i& result, __m128i sobelX, __m128i sobelY, __m128i& gaussI,
                       const __m128& gaussI2A, const __m128& gaussI2B,
                       const __m128& scale, const __m128& regVar, __m128i offset)
{
  const __m128i zero = _mm_setzero_si128();

  // gaussI zero-extended to 32bit and converted to float (low / high four lanes)
  const __m128 meanA = _mm_cvtepi32_ps(_mm_unpacklo_epi16(gaussI, zero));
  const __m128 meanB = _mm_cvtepi32_ps(_mm_unpackhi_epi16(gaussI, zero));

  // gaussI2 - gaussI^2 + regVar
  const __m128 varA = _mm_add_ps(_mm_sub_ps(gaussI2A, _mm_mul_ps(meanA, meanA)), regVar);
  const __m128 varB = _mm_add_ps(_mm_sub_ps(gaussI2B, _mm_mul_ps(meanB, meanB)), regVar);

  // scale / sqrt(gaussI2 - gaussI^2 + regVar)
  const __m128 normA = _mm_mul_ps(_mm_rsqrt_ps(varA), scale);
  const __m128 normB = _mm_mul_ps(_mm_rsqrt_ps(varB), scale);

  // eight 16bit factors (signed saturation)
  const __m128i factor = _mm_packs_epi32(_mm_cvtps_epi32(normA), _mm_cvtps_epi32(normB));

  // high 16 bits of (sobel << 5) * factor, i.e. sobel * factor * 2^-11
  const __m128i respX = _mm_mulhi_epi16(_mm_slli_epi16(sobelX, 5), factor);
  const __m128i respY = _mm_mulhi_epi16(_mm_slli_epi16(sobelY, 5), factor);

  // Narrow to 8bit and interleave X and Y. Packing a register with itself
  // duplicates the values into the upper half, which unpacklo then ignores.
  const __m128i x8 = _mm_packs_epi16(respX, respX);
  const __m128i y8 = _mm_packs_epi16(respY, respY);

  // add offset, switching to epu8
  result = _mm_add_epi8(_mm_unpacklo_epi8(x8, y8), offset);
}
//vz optimized template specialization template<> void cvtScale_<short, short, float>( const short* src, size_t sstep, short* dst, size_t dstep, Size size, float scale, float shift ) { sstep /= sizeof(src[0]); dstep /= sizeof(dst[0]); for( ; size.height--; src += sstep, dst += dstep ) { int x = 0; #if CV_SSE2 if(USE_SSE2) { __m128 scale128 = _mm_set1_ps (scale); __m128 shift128 = _mm_set1_ps (shift); for(; x <= size.width - 8; x += 8 ) { __m128i r0 = _mm_loadl_epi64((const __m128i*)(src + x)); __m128i r1 = _mm_loadl_epi64((const __m128i*)(src + x + 4)); __m128 rf0 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r0, r0), 16)); __m128 rf1 =_mm_cvtepi32_ps(_mm_srai_epi32(_mm_unpacklo_epi16(r1, r1), 16)); rf0 = _mm_add_ps(_mm_mul_ps(rf0, scale128), shift128); rf1 = _mm_add_ps(_mm_mul_ps(rf1, scale128), shift128); r0 = _mm_cvtps_epi32(rf0); r1 = _mm_cvtps_epi32(rf1); r0 = _mm_packs_epi32(r0, r1); _mm_storeu_si128((__m128i*)(dst + x), r0); } } #endif for(; x < size.width; x++ ) dst[x] = saturate_cast<short>(src[x]*scale + shift); } }
/*
 * vPMD raw receive routine, only accept(nb_pkts >= RTE_IXGBE_DESCS_PER_LOOP)
 *
 * Scans the RX descriptor ring starting at rxq->rx_tail in groups of
 * RTE_IXGBE_DESCS_PER_LOOP descriptors, copies the matching mbuf pointers
 * from the software ring into rx_pkts, fills each mbuf's
 * rx_descriptor_fields1 from its descriptor and returns the number of
 * descriptors whose DD (descriptor done) bit was set.  When split_packet is
 * non-NULL, one byte per descriptor is also written there, derived from the
 * inverted EOP bit, and the mbufs' next pointers are cleared.
 *
 * Notice:
 * - nb_pkts < RTE_IXGBE_DESCS_PER_LOOP, just return no packet
 * - nb_pkts > RTE_IXGBE_MAX_RX_BURST, only scan RTE_IXGBE_MAX_RX_BURST
 *   numbers of DD bit
 * - floor align nb_pkts to a RTE_IXGBE_DESCS_PER_LOOP power-of-two
 * - don't support ol_flags for rss and csum err
 */
static inline uint16_t
_recv_raw_pkts_vec(struct ixgbe_rx_queue *rxq, struct rte_mbuf **rx_pkts,
		uint16_t nb_pkts, uint8_t *split_packet)
{
	volatile union ixgbe_adv_rx_desc *rxdp;
	struct ixgbe_rx_entry *sw_ring;
	uint16_t nb_pkts_recd;
	int pos;
	uint64_t var;
	__m128i shuf_msk;
	/* Added (as 16-bit lanes) to the shuffled descriptor so that crc_len
	 * is subtracted from both length fields.
	 */
	__m128i crc_adjust = _mm_set_epi16(
				0, 0, 0,    /* ignore non-length fields */
				-rxq->crc_len, /* sub crc on data_len */
				0,          /* ignore high-16bits of pkt_len */
				-rxq->crc_len, /* sub crc on pkt_len */
				0, 0            /* ignore pkt_type field */
			);
	__m128i dd_check, eop_check;

	/* nb_pkts shall be less equal than RTE_IXGBE_MAX_RX_BURST */
	nb_pkts = RTE_MIN(nb_pkts, RTE_IXGBE_MAX_RX_BURST);

	/* nb_pkts has to be floor-aligned to RTE_IXGBE_DESCS_PER_LOOP */
	nb_pkts = RTE_ALIGN_FLOOR(nb_pkts, RTE_IXGBE_DESCS_PER_LOOP);

	/* Just the act of getting into the function from the application is
	 * going to cost about 7 cycles
	 */
	rxdp = rxq->rx_ring + rxq->rx_tail;

	_mm_prefetch((const void *)rxdp, _MM_HINT_T0);

	/* See if we need to rearm the RX queue - gives the prefetch a bit
	 * of time to act
	 */
	if (rxq->rxrearm_nb > RTE_IXGBE_RXQ_REARM_THRESH)
		ixgbe_rxq_rearm(rxq);

	/* Before we start moving massive data around, check to see if
	 * there is actually a packet available
	 */
	if (!(rxdp->wb.upper.status_error &
				rte_cpu_to_le_32(IXGBE_RXDADV_STAT_DD)))
		return 0;

	/* 4 packets DD mask: bit 0 of every 32-bit lane */
	dd_check = _mm_set_epi64x(0x0000000100000001LL, 0x0000000100000001LL);

	/* 4 packets EOP mask: bit 1 of every 32-bit lane */
	eop_check = _mm_set_epi64x(0x0000000200000002LL, 0x0000000200000002LL);

	/* mask to shuffle from desc. to mbuf; 0xFF entries produce zero bytes */
	shuf_msk = _mm_set_epi8(
		7, 6, 5, 4,  /* octet 4~7, 32bits rss */
		15, 14,      /* octet 14~15, low 16 bits vlan_macip */
		13, 12,      /* octet 12~13, 16 bits data_len */
		0xFF, 0xFF,  /* skip high 16 bits pkt_len, zero out */
		13, 12,      /* octet 12~13, low 16 bits pkt_len */
		0xFF, 0xFF,  /* skip 32 bit pkt_type */
		0xFF, 0xFF
		);

	/* Cache is empty -> need to scan the buffer rings, but first move
	 * the next 'n' mbufs into the cache
	 */
	sw_ring = &rxq->sw_ring[rxq->rx_tail];

	/* A. load 4 packet in one loop
	 * [A*. mask out 4 unused dirty field in desc]
	 * B. copy 4 mbuf point from swring to rx_pkts
	 * C. calc the number of DD bits among the 4 packets
	 * [C*. extract the end-of-packet bit, if requested]
	 * D. fill info. from desc to mbuf
	 */
	for (pos = 0, nb_pkts_recd = 0; pos < nb_pkts;
			pos += RTE_IXGBE_DESCS_PER_LOOP,
			rxdp += RTE_IXGBE_DESCS_PER_LOOP) {
		__m128i descs[RTE_IXGBE_DESCS_PER_LOOP];
		__m128i pkt_mb1, pkt_mb2, pkt_mb3, pkt_mb4;
		__m128i zero, staterr, sterr_tmp1, sterr_tmp2;
		__m128i mbp1, mbp2; /* two mbuf pointer in one XMM reg. */

		/* B.1 load the first pair of mbuf pointers */
		mbp1 = _mm_loadu_si128((__m128i *)&sw_ring[pos]);

		/* Read desc statuses backwards to avoid race condition */
		/* A.1 load 4 pkts desc */
		descs[3] = _mm_loadu_si128((__m128i *)(rxdp + 3));

		/* B.2 copy 2 mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos], mbp1);

		/* B.1 load the second pair of mbuf pointers */
		mbp2 = _mm_loadu_si128((__m128i *)&sw_ring[pos+2]);

		descs[2] = _mm_loadu_si128((__m128i *)(rxdp + 2));
		/* A.1 (continued) remaining descriptors, still highest first */
		descs[1] = _mm_loadu_si128((__m128i *)(rxdp + 1));
		descs[0] = _mm_loadu_si128((__m128i *)(rxdp));

		/* B.2 copy 2 mbuf point into rx_pkts */
		_mm_storeu_si128((__m128i *)&rx_pkts[pos+2], mbp2);

		if (split_packet) {
			rte_mbuf_prefetch_part2(rx_pkts[pos]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 1]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 2]);
			rte_mbuf_prefetch_part2(rx_pkts[pos + 3]);
		}

		/* avoid compiler reorder optimization */
		rte_compiler_barrier();

		/* D.1 pkt 3,4 convert format from desc to pktmbuf */
		pkt_mb4 = _mm_shuffle_epi8(descs[3], shuf_msk);
		pkt_mb3 = _mm_shuffle_epi8(descs[2], shuf_msk);

		/* D.1 pkt 1,2 convert format from desc to pktmbuf */
		pkt_mb2 = _mm_shuffle_epi8(descs[1], shuf_msk);
		pkt_mb1 = _mm_shuffle_epi8(descs[0], shuf_msk);

		/* C.1 4=>2 filter staterr info only */
		sterr_tmp2 = _mm_unpackhi_epi32(descs[3], descs[2]);
		/* C.1 4=>2 filter staterr info only */
		sterr_tmp1 = _mm_unpackhi_epi32(descs[1], descs[0]);

		/* set ol_flags with vlan packet type */
		desc_to_olflags_v(descs, &rx_pkts[pos]);

		/* D.2 pkt 3,4 set in_port/nb_seg and remove crc */
		pkt_mb4 = _mm_add_epi16(pkt_mb4, crc_adjust);
		pkt_mb3 = _mm_add_epi16(pkt_mb3, crc_adjust);

		/* C.2 get 4 pkts staterr value */
		zero = _mm_xor_si128(dd_check, dd_check);
		staterr = _mm_unpacklo_epi32(sterr_tmp1, sterr_tmp2);

		/* D.3 copy final 3,4 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+3]->rx_descriptor_fields1,
				pkt_mb4);
		_mm_storeu_si128((void *)&rx_pkts[pos+2]->rx_descriptor_fields1,
				pkt_mb3);

		/* D.2 pkt 1,2 set in_port/nb_seg and remove crc */
		pkt_mb2 = _mm_add_epi16(pkt_mb2, crc_adjust);
		pkt_mb1 = _mm_add_epi16(pkt_mb1, crc_adjust);

		/* C* extract and record EOP bit */
		if (split_packet) {
			__m128i eop_shuf_mask = _mm_set_epi8(
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0xFF, 0xFF, 0xFF, 0xFF,
					0x04, 0x0C, 0x00, 0x08
					);

			/* and with mask to extract bits, flipping 1-0 */
			__m128i eop_bits = _mm_andnot_si128(staterr, eop_check);
			/* the staterr values are not in order, as the
			 * count of dd bits doesn't care. However, for end of
			 * packet tracking, we do care, so shuffle. This also
			 * compresses the 32-bit values to 8-bit
			 */
			eop_bits = _mm_shuffle_epi8(eop_bits, eop_shuf_mask);
			/* store the resulting 32-bit value */
			*(int *)split_packet = _mm_cvtsi128_si32(eop_bits);
			split_packet += RTE_IXGBE_DESCS_PER_LOOP;

			/* zero-out next pointers */
			rx_pkts[pos]->next = NULL;
			rx_pkts[pos + 1]->next = NULL;
			rx_pkts[pos + 2]->next = NULL;
			rx_pkts[pos + 3]->next = NULL;
		}

		/* C.3 calc available number of desc: keep only the DD bit of
		 * each 32-bit lane, then narrow the four lanes into the low
		 * 64 bits so a single 64-bit popcount can count them
		 */
		staterr = _mm_and_si128(staterr, dd_check);
		staterr = _mm_packs_epi32(staterr, zero);

		/* D.3 copy final 1,2 data to rx_pkts */
		_mm_storeu_si128((void *)&rx_pkts[pos+1]->rx_descriptor_fields1,
				pkt_mb2);
		_mm_storeu_si128((void *)&rx_pkts[pos]->rx_descriptor_fields1,
				pkt_mb1);

		/* C.4 calc available number of desc */
		var = __builtin_popcountll(_mm_cvtsi128_si64(staterr));
		nb_pkts_recd += var;
		/* stop at the first group that is not completely done */
		if (likely(var != RTE_IXGBE_DESCS_PER_LOOP))
			break;
	}

	/* Update our internal tail pointer; the mask wraps the index and
	 * relies on nb_rx_desc being a power of two
	 */
	rxq->rx_tail = (uint16_t)(rxq->rx_tail + nb_pkts_recd);
	rxq->rx_tail = (uint16_t)(rxq->rx_tail & (rxq->nb_rx_desc - 1));
	rxq->rxrearm_nb = (uint16_t)(rxq->rxrearm_nb + nb_pkts_recd);

	return nb_pkts_recd;
}
// Simple quantization static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], int n, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(2047); const __m128i zero = _mm_set1_epi16(0); __m128i sign0, sign8; __m128i coeff0, coeff8; __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]); const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]); // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) sign0 = _mm_srai_epi16(in0, 15); sign8 = _mm_srai_epi16(in8, 15); // coeff = abs(in) = (in ^ sign) - sign coeff0 = _mm_xor_si128(in0, sign0); coeff8 = _mm_xor_si128(in8, sign8); coeff0 = _mm_sub_epi16(coeff0, sign0); coeff8 = _mm_sub_epi16(coeff8, sign8); // coeff = abs(in) + sharpen coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); // if (coeff > 2047) coeff = 2047 coeff0 = _mm_min_epi16(coeff0, max_coeff_2047); coeff8 = _mm_min_epi16(coeff8, max_coeff_2047); // out = (coeff * iQ + B) >> QFIX; { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); __m128i 
coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // expand bias from 16b to 32b __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero); __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero); __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero); __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero); // out = (coeff * iQ + B) out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = (coeff * iQ + B) >> QFIX; out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); } // get sign back (if (sign[j]) out_n = -out_n) out0 = _mm_xor_si128(out0, sign0); out8 = _mm_xor_si128(out8, sign8); out0 = _mm_sub_epi16(out0, sign0); out8 = _mm_sub_epi16(out8, sign8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); // if (coeff <= mtx->zthresh_) {in=0; out=0;} { __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0); __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8); in0 = _mm_and_si128(in0, cmp0); in8 = _mm_and_si128(in8, cmp8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); out0 = _mm_and_si128(out0, cmp0); out8 = _mm_and_si128(out8, cmp8); } // zigzag the output before storing it. // // The zigzag pattern can almost be reproduced with a small sequence of // shuffles. After it, we only need to swap the 7th (ending up in third // position instead of twelfth) and 8th values. 
{ __m128i outZ0, outZ8; outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2)); outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1)); outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0)); outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0)); _mm_storeu_si128((__m128i*)&out[0], outZ0); _mm_storeu_si128((__m128i*)&out[8], outZ8); packed_out = _mm_packs_epi16(outZ0, outZ8); } { const int16_t outZ_12 = out[12]; const int16_t outZ_3 = out[3]; out[3] = outZ_12; out[12] = outZ_3; } // detect if all 'out' values are zeroes or not { int32_t tmp[4]; _mm_storeu_si128((__m128i*)tmp, packed_out); if (n) { tmp[0] &= ~0xff; } return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); } }
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref, int16_t* out) { const __m128i zero = _mm_setzero_si128(); const __m128i seven = _mm_set1_epi16(7); const __m128i k7500 = _mm_set1_epi32(7500); const __m128i k14500 = _mm_set1_epi32(14500); const __m128i k51000 = _mm_set1_epi32(51000); const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16)); const __m128i k5352_2217 = _mm_set_epi16(5352, 2217, 5352, 2217, 5352, 2217, 5352, 2217); const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352, 2217, -5352, 2217, -5352); __m128i v01, v32; // Difference between src and ref and initial transpose. { // Load src and convert to 16b. const __m128i src0 = _mm_loadl_epi64((__m128i*)&src[0 * BPS]); const __m128i src1 = _mm_loadl_epi64((__m128i*)&src[1 * BPS]); const __m128i src2 = _mm_loadl_epi64((__m128i*)&src[2 * BPS]); const __m128i src3 = _mm_loadl_epi64((__m128i*)&src[3 * BPS]); const __m128i src_0 = _mm_unpacklo_epi8(src0, zero); const __m128i src_1 = _mm_unpacklo_epi8(src1, zero); const __m128i src_2 = _mm_unpacklo_epi8(src2, zero); const __m128i src_3 = _mm_unpacklo_epi8(src3, zero); // Load ref and convert to 16b. const __m128i ref0 = _mm_loadl_epi64((__m128i*)&ref[0 * BPS]); const __m128i ref1 = _mm_loadl_epi64((__m128i*)&ref[1 * BPS]); const __m128i ref2 = _mm_loadl_epi64((__m128i*)&ref[2 * BPS]); const __m128i ref3 = _mm_loadl_epi64((__m128i*)&ref[3 * BPS]); const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero); const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero); const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero); const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero); // Compute difference. const __m128i diff0 = _mm_sub_epi16(src_0, ref_0); const __m128i diff1 = _mm_sub_epi16(src_1, ref_1); const __m128i diff2 = _mm_sub_epi16(src_2, ref_2); const __m128i diff3 = _mm_sub_epi16(src_3, ref_3); // Transpose. 
// 00 01 02 03 0 0 0 0 // 10 11 12 13 0 0 0 0 // 20 21 22 23 0 0 0 0 // 30 31 32 33 0 0 0 0 const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1); const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // a02 a12 a22 a32 a03 a13 a23 a33 // a00 a10 a20 a30 a01 a11 a21 a31 // a03 a13 a23 a33 a02 a12 a22 a32 } // First pass and subsequent transpose. { // Same operations are done on the (0,3) and (1,2) pairs. // b0 = (a0 + a3) << 3 // b1 = (a1 + a2) << 3 // b3 = (a0 - a3) << 3 // b2 = (a1 - a2) << 3 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i b01 = _mm_slli_epi16(a01, 3); const __m128i b32 = _mm_slli_epi16(a32, 3); const __m128i b11 = _mm_unpackhi_epi64(b01, b01); const __m128i b22 = _mm_unpackhi_epi64(b32, b32); // e0 = b0 + b1 // e2 = b0 - b1 const __m128i e0 = _mm_add_epi16(b01, b11); const __m128i e2 = _mm_sub_epi16(b01, b11); const __m128i e02 = _mm_unpacklo_epi64(e0, e2); // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12 // e3 = (b3 * 2217 - b2 * 5352 + 7500) >> 12 const __m128i b23 = _mm_unpacklo_epi16(b22, b32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k14500); const __m128i d3 = _mm_add_epi32(c3, k7500); const __m128i e1 = _mm_srai_epi32(d1, 12); const __m128i e3 = _mm_srai_epi32(d3, 12); const __m128i e13 = _mm_packs_epi32(e1, e3); // Transpose. 
// 00 01 02 03 20 21 22 23 // 10 11 12 13 30 31 32 33 const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13); const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13); // 00 10 01 11 02 12 03 13 // 20 30 21 31 22 32 23 33 const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1); v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1); v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2)); // 02 12 22 32 03 13 23 33 // 00 10 20 30 01 11 21 31 // 03 13 23 33 02 12 22 32 } // Second pass { // Same operations are done on the (0,3) and (1,2) pairs. // a0 = v0 + v3 // a1 = v1 + v2 // a3 = v0 - v3 // a2 = v1 - v2 const __m128i a01 = _mm_add_epi16(v01, v32); const __m128i a32 = _mm_sub_epi16(v01, v32); const __m128i a11 = _mm_unpackhi_epi64(a01, a01); const __m128i a22 = _mm_unpackhi_epi64(a32, a32); // d0 = (a0 + a1 + 7) >> 4; // d2 = (a0 - a1 + 7) >> 4; const __m128i b0 = _mm_add_epi16(a01, a11); const __m128i b2 = _mm_sub_epi16(a01, a11); const __m128i c0 = _mm_add_epi16(b0, seven); const __m128i c2 = _mm_add_epi16(b2, seven); const __m128i d0 = _mm_srai_epi16(c0, 4); const __m128i d2 = _mm_srai_epi16(c2, 4); // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16) // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16) const __m128i b23 = _mm_unpacklo_epi16(a22, a32); const __m128i c1 = _mm_madd_epi16(b23, k5352_2217); const __m128i c3 = _mm_madd_epi16(b23, k2217_5352); const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one); const __m128i d3 = _mm_add_epi32(c3, k51000); const __m128i e1 = _mm_srai_epi32(d1, 16); const __m128i e3 = _mm_srai_epi32(d3, 16); const __m128i f1 = _mm_packs_epi32(e1, e1); const __m128i f3 = _mm_packs_epi32(e3, e3); // f1 = f1 + (a3 != 0); // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the // desired (0, 1), we add one earlier through k12000_plus_one. 
const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero)); _mm_storel_epi64((__m128i*)&out[ 0], d0); _mm_storel_epi64((__m128i*)&out[ 4], g1); _mm_storel_epi64((__m128i*)&out[ 8], d2); _mm_storel_epi64((__m128i*)&out[12], f3); } }
void BrushToolEdit::drawInner(const QPoint &pt, float strength) { float fixedStrength = params.strength; strength *= fixedStrength; auto color = params.color; std::array<int, 3> colorParts = Terrain::expandColor(color); __m128 colorMM = _mm_setr_ps(colorParts[0], colorParts[1], colorParts[2], 0); SseRoundingModeScope roundingModeScope(_MM_ROUND_NEAREST); (void) roundingModeScope; switch (tool->type()) { case BrushType::Blur: drawBlur(pt, std::min(strength / 5.f, 4.f)); break; case BrushType::Smoothen: drawSmoothen(pt, std::min(strength / 5.f, 4.f)); break; case BrushType::Raise: case BrushType::Lower: if (tool->type() == BrushType::Lower) { fixedStrength = -fixedStrength; strength = -strength; } switch (params.pressureMode) { case BrushPressureMode::AirBrush: strength *= 3.f; drawRaiseLower(pt, [=](float ¤t, float before, float tip) { (void) before; current -= tip * strength; }); break; case BrushPressureMode::Constant: if (tool->type() == BrushType::Lower) { drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(std::max(current, before - tip * fixedStrength)); }); } else { drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(std::min(current, before - tip * fixedStrength)); }); } break; case BrushPressureMode::Adjustable: drawRaiseLower(pt, [=](float ¤t, float before, float tip) { current = Terrain::quantizeOne(before - tip * strength); }); break; } break; case BrushType::Paint: switch (params.pressureMode) { case BrushPressureMode::AirBrush: strength = 1.f - std::exp2(-strength); drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { (void) before; // convert current color to FP32 auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(¤t))); currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128()); currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128()); auto currentMF = _mm_cvtepi32_ps(currentMM); auto factor = _mm_set1_ps(tip * strength); // blend auto 
diff = _mm_sub_ps(colorMM, currentMF); diff = _mm_mul_ps(diff, factor); currentMF = _mm_add_ps(currentMF, diff); // convert to RGB32 currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128()); currentMM = _mm_cvttps_epi32(currentMF); currentMM = _mm_packs_epi32(currentMM, currentMM); currentMM = _mm_packus_epi16(currentMM, currentMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(currentMM)); }); break; case BrushPressureMode::Constant: fixedStrength *= 0.01f; drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { // convert current color to FP32 auto currentMM = _mm_castps_si128(_mm_load_ss(reinterpret_cast<float *>(¤t))); currentMM = _mm_unpacklo_epi8(currentMM, _mm_setzero_si128()); currentMM = _mm_unpacklo_epi16(currentMM, _mm_setzero_si128()); auto currentMF = _mm_cvtepi32_ps(currentMM); // convert before color to FP32 auto beforeMM = _mm_setr_epi32(before, 0, 0, 0); beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128()); beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128()); auto beforeMF = _mm_cvtepi32_ps(beforeMM); // beforeMM = _mm_add_ps(beforeMM, globalDitherSampler.getM128()); // use "before" image to which way of color change is possible, and // compute possible range of result color auto diff = _mm_sub_ps(colorMM, beforeMF); auto factor = _mm_set1_ps(tip * fixedStrength); auto adddiff = _mm_mul_ps(diff, factor); beforeMF = _mm_add_ps(beforeMF, adddiff); auto diffDir = _mm_cmpgt_ps(diff, _mm_setzero_ps()); // compute output image auto out1 = _mm_max_ps(currentMF, beforeMF); auto out2 = _mm_min_ps(currentMF, beforeMF); currentMF = _mm_or_ps(_mm_and_ps(diffDir, out1), _mm_andnot_ps(diffDir, out2)); // convert to RGB32 currentMF = _mm_add_ps(currentMF, globalDitherSampler.getM128()); currentMM = _mm_cvttps_epi32(currentMF); currentMM = _mm_packs_epi32(currentMM, currentMM); currentMM = _mm_packus_epi16(currentMM, currentMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(currentMM)); }); break; case 
BrushPressureMode::Adjustable: strength *= 0.01f; drawColor(pt, [=](quint32 ¤t, quint32 before, float tip) { // convert before color to FP32 auto beforeMM = _mm_setr_epi32(before, 0, 0, 0); beforeMM = _mm_unpacklo_epi8(beforeMM, _mm_setzero_si128()); beforeMM = _mm_unpacklo_epi16(beforeMM, _mm_setzero_si128()); auto beforeMF = _mm_cvtepi32_ps(beforeMM); // blend auto diff = _mm_sub_ps(colorMM, beforeMF); auto factor = _mm_set1_ps(tip * strength); diff = _mm_mul_ps(diff, factor); beforeMF = _mm_add_ps(beforeMF, diff); // convert to RGB32 beforeMF = _mm_add_ps(beforeMF, globalDitherSampler.getM128()); beforeMM = _mm_cvttps_epi32(beforeMF); beforeMM = _mm_packs_epi32(beforeMM, beforeMM); beforeMM = _mm_packus_epi16(beforeMM, beforeMM); _mm_store_ss(reinterpret_cast<float *>(¤t), _mm_castsi128_ps(beforeMM)); }); break; } break; } }