// Average of four packed 8-bit pixels, computed per channel.
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
                                     uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_128i(a0, a1);
  const __m128i avg2 = Average2_128i(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  const uint32_t output = _mm_cvtsi128_si32(A0);
  return output;
}

// Per-channel average of a1 with the average of a0 and a2.
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_128i(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  const uint32_t output = _mm_cvtsi128_si32(A2);
  return output;
}

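// For reference, a minimal scalar sketch of what these averaging predictors
// compute per 8-bit channel, assuming Average2_128i (not shown here) widens
// both pixels to 16 bits and averages them with truncation. The name
// ScalarAverage4 is illustrative only.
static uint32_t ScalarAverage4(uint32_t a0, uint32_t a1,
                               uint32_t a2, uint32_t a3) {
  uint32_t out = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    const uint32_t c0 = (a0 >> shift) & 0xff, c1 = (a1 >> shift) & 0xff;
    const uint32_t c2 = (a2 >> shift) & 0xff, c3 = (a3 >> shift) & 0xff;
    const uint32_t avg = (((c0 + c1) >> 1) + ((c2 + c3) >> 1)) >> 1;
    out |= avg << shift;
  }
  return out;
}
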
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    // Load two rows of the 8-wide block and the corresponding mask rows.
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
    const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
    const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
    const __m128i m =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    // Blend a and b with the 64-weight mask, then accumulate the SAD vs src.
    const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
    const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
    pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
    const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
    __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
    pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
}

void png_read_filter_row_paeth3_sse(png_row_infop row_info, png_bytep row,
                                    png_const_bytep prev_row) {
  png_size_t i;
  png_bytep rp = row;
  png_const_bytep prp = prev_row;
  __m128i npix = _mm_cvtsi32_si128(*(uint32_t*)rp);
  __m128i ppix = _mm_setzero_si128();    // Same as 'a' in C version.
  __m128i prppix = _mm_setzero_si128();  // Same as 'c' in C version.
  const __m128i zero = _mm_setzero_si128();

  for (i = 0; i < row_info->rowbytes; i += 3, rp += 3, prp += 3) {
    __m128i prpix = _mm_cvtsi32_si128(*(uint32_t*)prp);  // Same as 'b' in C ver.
    __m128i pix, pa, pb, pc, temp;

    prpix = _mm_unpacklo_epi8(prpix, zero);
    temp = _mm_sub_epi16(prpix, prppix);  // p = b - c
    pc = _mm_sub_epi16(ppix, prppix);     // pc = a - c

#ifndef __SSSE3__
    pa = _mm_max_epi16(temp, _mm_sub_epi16(prppix, prpix));
    pb = _mm_max_epi16(pc, _mm_sub_epi16(prppix, ppix));
    temp = _mm_add_epi16(temp, pc);
    pc = _mm_max_epi16(temp, _mm_sub_epi16(zero, temp));
#else
    pa = _mm_abs_epi16(temp);  // pa = abs(p)
    pb = _mm_abs_epi16(pc);    // pb = abs(pc)
    temp = _mm_add_epi16(temp, pc);
    pc = _mm_abs_epi16(temp);  // pc = abs(p + pc)
#endif

    temp = _mm_cmplt_epi16(pb, pa);  // if (pb < pa) pa = pb, a = b
    pa = _mm_andnot_si128(temp, pa);
    pa = _mm_or_si128(pa, _mm_and_si128(temp, pb));
    ppix = _mm_andnot_si128(temp, ppix);
    ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prpix));

    pix = npix;
    npix = _mm_cvtsi32_si128(*(uint32_t*)(rp + 3));

    temp = _mm_cmplt_epi16(pc, pa);  // if (pc < pa) a = c
    ppix = _mm_andnot_si128(temp, ppix);
    ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prppix));

    pix = _mm_unpacklo_epi8(pix, zero);
    prppix = prpix;
    ppix = _mm_add_epi16(ppix, pix);
    ppix = _mm_slli_epi16(ppix, 8);
    ppix = _mm_srli_epi16(ppix, 8);
    pix = _mm_packus_epi16(ppix, zero);
    *(uint32_t*)rp = _mm_cvtsi128_si32(pix);
  }
}

static void TransformAC3(const int16_t* in, uint8_t* dst) {
  static const int kC1 = 20091 + (1 << 16);
  static const int kC2 = 35468;
  const __m128i A = _mm_set1_epi16(in[0] + 4);
  const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
  const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
  const int c1 = MUL(in[1], kC2);
  const int d1 = MUL(in[1], kC1);
  const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
  const __m128i B = _mm_adds_epi16(A, CD);
  const __m128i m0 = _mm_adds_epi16(B, d4);
  const __m128i m1 = _mm_adds_epi16(B, c4);
  const __m128i m2 = _mm_subs_epi16(B, c4);
  const __m128i m3 = _mm_subs_epi16(B, d4);
  const __m128i zero = _mm_setzero_si128();
  // Load the source pixels.
  __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
  __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
  __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
  __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
  // Convert to 16b.
  dst0 = _mm_unpacklo_epi8(dst0, zero);
  dst1 = _mm_unpacklo_epi8(dst1, zero);
  dst2 = _mm_unpacklo_epi8(dst2, zero);
  dst3 = _mm_unpacklo_epi8(dst3, zero);
  // Add the inverse transform.
  dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
  dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
  dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
  dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
  // Unsigned saturate to 8b.
  dst0 = _mm_packus_epi16(dst0, dst0);
  dst1 = _mm_packus_epi16(dst1, dst1);
  dst2 = _mm_packus_epi16(dst2, dst2);
  dst3 = _mm_packus_epi16(dst3, dst3);
  // Store the results.
  *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
  *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
  *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
  *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
}

opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
                                int N) {
  opus_int i, dataSize16;
  opus_int32 sum;
  __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
  __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;

  sum = 0;
  dataSize16 = N & ~15;

  acc1 = _mm_setzero_si128();
  acc2 = _mm_setzero_si128();

  for (i = 0; i < dataSize16; i += 16) {
    inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
    inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));
    inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
    inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

    inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
    inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

    acc1 = _mm_add_epi32(acc1, inVec1_76543210);
    acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
  }

  acc1 = _mm_add_epi32(acc1, acc2);

  if (N - i >= 8) {
    inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
    inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

    inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
    acc1 = _mm_add_epi32(acc1, inVec1_76543210);
    i += 8;
  }

  // Horizontal sum of the four 32-bit lanes of acc1.
  acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64(acc1, acc1));
  acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16(acc1, 0x0E));
  sum += _mm_cvtsi128_si32(acc1);

  for (; i < N; i++) {
    sum = silk_SMLABB(sum, x[i], y[i]);
  }

  return sum;
}

// Per-channel c0 + c1 - c2, saturated to [0, 255] by the unsigned pack.
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  const uint32_t output = _mm_cvtsi128_si32(b);
  return output;
}

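// Sanity-check sketch: ClampedAddSubtractFull amounts to the following scalar
// per-channel computation, with _mm_packus_epi16 providing the clamp to
// [0, 255]. The helper name is illustrative only.
static uint32_t ScalarClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                             uint32_t c2) {
  uint32_t out = 0;
  int shift;
  for (shift = 0; shift < 32; shift += 8) {
    int v = (int)((c0 >> shift) & 0xff) + (int)((c1 >> shift) & 0xff) -
            (int)((c2 >> shift) & 0xff);
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    out |= (uint32_t)v << shift;
  }
  return out;
}
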
void scanCharDataContentwithSTTNI(SAX2Processor* saxProcessor) {
  unsigned int length = yylim - yycur;
  unsigned char* data = (unsigned char*)yycur;
  if (*data == '<' || *data == '&' || *data == ']')
    return;
  unsigned int dataLen = 0;
  // initialize the one byte encoding rule and nonCharData rule
  const __m128i asciiCharData = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0x7F, 0x5E,
                                             0x5C, 0x3D, 0x3B, 0x27, 0x25,
                                             0x20, 0, 0);
  const __m128i nonCharData = _mm_set_epi8(0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
                                           0x5D, 0x3C, 0x26, 0x0D, 0x0A);
  do {
    // special new line processing for 0x0A, 0x0D
    if (*data == 0x0A) {
      saxProcessor->newLine((char*)data);
      data++;
      length--;
    } else if (*data == 0x0D) {
      saxProcessor->newLine((char*)data);
      if (*(data + 1) == 0x0A) {
        data += 2;
        length -= 2;
        yycur++;
      } else {
        *data = 0x0A;
        data++;
        length--;
      }
    }
    while (length > 0) {
      if (length >= 16)
        dataLen = 16;
      else
        dataLen = length;
      const __m128i mData = _mm_loadu_si128((__m128i*)data);
      // locate the Character Data part with the nonCharData characters
      int index = _mm_cmpestri(nonCharData, 5, mData, dataLen,
                               _SIDD_CMP_EQUAL_ANY);
      if (index == 0)
        break;
      if (index > dataLen)
        index = dataLen;
      bool shouldBreak = index < dataLen ? true : false;
      // check the one byte encoding rule (ASCII)
      unsigned int mask = _mm_cvtsi128_si32(
          _mm_cmpestrm(asciiCharData, 10, mData, index,
                       _SIDD_CMP_RANGES | _SIDD_MASKED_NEGATIVE_POLARITY));
      // if not all hit ASCII, continue to check other Unicode rules
      if (mask == 0 || recogUnicodeRange(mData, index, ~mask)) {
        data += index;
        length -= index;
        if (shouldBreak)
          break;
      } else {
        break;
      }
    }
    unsigned int passLen = (char*)data - yycur;
    if (passLen == 0)
      break;
    // report Character Data to user
    saxProcessor->reportCharDataContent(yycur, passLen);
    yycur += passLen;
    YYSWITCHBUFFER;
  } while (length >= STTNISTRLENLIMIT && (*data == 0x0A || *data == 0x0D));
}

static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
  // 00 10 01 11 02 12 03 13
  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
  // 20 30 21 31 22 32 23 33
  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
  B = _mm_srli_si128(A, 4);
  C = _mm_srli_si128(A, 8);
  D = _mm_srli_si128(A, 12);

  *(int *)(dst) = _mm_cvtsi128_si32(A);
  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
}

static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
  const __m128i ones = _mm_set1_epi16(1);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  // Average the new result with the pixels already in dst, rounding up.
  v = _mm_add_epi16(v, u[0]);
  v = _mm_add_epi16(v, ones);
  v = _mm_srai_epi16(v, 1);
  *(uint32_t *)dst = _mm_cvtsi128_si32(v);
}

static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf, const uint8* u_buf,
                                    const uint8* v_buf, uint8* rgb_buf,
                                    int width, int source_dx) {
  __m128i xmm0, xmmY1, xmmY2;
  __m128 xmmY;
  uint8 u, v, y;
  int x = 0;

  while (width >= 2) {
    u = u_buf[x >> 17];
    v = v_buf[x >> 17];
    y = y_buf[x >> 16];
    x += source_dx;

    xmm0 = _mm_adds_epi16(
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));

    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(
        reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

    y = y_buf[x >> 16];
    x += source_dx;

    xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(
        reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

    xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
                          0x44);
    xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

    _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
    rgb_buf += 8;
    width -= 2;
  }

  if (width) {
    u = u_buf[x >> 17];
    v = v_buf[x >> 17];
    y = y_buf[x >> 16];

    xmm0 = _mm_adds_epi16(
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
        _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));

    xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(
        reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
    xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
    xmmY1 = _mm_srai_epi16(xmmY1, 6);
    xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
    *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
  }
}

// Compute the sum of all pixel differences of this MB.
static INLINE int sum_diff_16x1(__m128i acc_diff) {
  const __m128i k_1 = _mm_set1_epi16(1);
  const __m128i acc_diff_lo =
      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_hi =
      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
  const __m128i hgfe_dcba =
      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
  const __m128i hgfedcba =
      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
  return _mm_cvtsi128_si32(hgfedcba);
}

// Sum of squares of the 256 int16 samples of a 16x16 macroblock.
unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}

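// Straightforward scalar equivalent of the reduction above, shown only for
// comparison: the SSE2 loop consumes 32 x 8 = 256 int16 samples. The function
// name is illustrative.
static unsigned int get_mb_ss_scalar(const int16_t *src) {
  unsigned int sum = 0;
  int i;
  for (i = 0; i < 256; ++i) {
    sum += (unsigned int)(src[i] * src[i]);
  }
  return sum;
}
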
// pixelsNum 0: write all 4 pixels
//           1/2/3: residual pixels 1/2/3
static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
                       int dst_stride) {
  if (2 == width) {
    if (0 == pixelsNum) {
      *(int *)dst = _mm_cvtsi128_si32(u[0]);
      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
      *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
      *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
    } else if (1 == pixelsNum) {
      *(int *)dst = _mm_cvtsi128_si32(u[0]);
    } else if (2 == pixelsNum) {
      *(int *)dst = _mm_cvtsi128_si32(u[0]);
      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
    } else if (3 == pixelsNum) {
      *(int *)dst = _mm_cvtsi128_si32(u[0]);
      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
      *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
    }
  } else {
    if (0 == pixelsNum) {
      _mm_storel_epi64((__m128i *)dst, u[0]);
      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
      _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
    } else if (1 == pixelsNum) {
      _mm_storel_epi64((__m128i *)dst, u[0]);
    } else if (2 == pixelsNum) {
      _mm_storel_epi64((__m128i *)dst, u[0]);
      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
    } else if (3 == pixelsNum) {
      _mm_storel_epi64((__m128i *)dst, u[0]);
      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
    }
  }
}

static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
                                                     unsigned int *const sse) {
  // extract the low lane and add it to the high lane
  const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);

  // unpack sse and sum registers and add
  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
  const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);

  // perform the final summation and extract the results
  const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
  *((int *)sse) = _mm_cvtsi128_si32(res);
  return _mm_extract_epi32(res, 1);
}

uint32_t halfsiphash(const unsigned char key[16], const unsigned char *m,
                     size_t len) {
  xmmi k, v02, v20, v13, v11, v33, mi;
  uint32_t last7;
  uint32_t lo, hi;
  size_t i, blocks;

  k = _mm_loadu_si128((xmmi *)(key + 0));
  v02 = siphash_init[0].v;
  v13 = siphash_init[1].v;
  v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
  v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));

  last7 = (len & 0xff) << 24;

  for (i = 0, blocks = (len & ~3); i < blocks; i += 4) {
    mi = _mm_loadl_epi64((xmmi *)(m + i));
    v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
    sipcompress()
    sipcompress()
    v02 = _mm_xor_si128(v02, mi);
  }

  switch (len - blocks) {
    case 3: last7 |= (uint32_t)m[i + 2] << 16; /* fall through */
    case 2: last7 |= (uint32_t)m[i + 1] << 8;  /* fall through */
    case 1: last7 |= (uint32_t)m[i + 0];       /* fall through */
    case 0:
    default:;
  };

  mi = _mm_unpacklo_epi32(_mm_cvtsi32_si128(last7), _mm_cvtsi32_si128(0));
  v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
  sipcompress()
  sipcompress()
  v02 = _mm_xor_si128(v02, mi);
  v02 = _mm_xor_si128(v02, siphash_final.v);
  sipcompress()
  sipcompress()
  sipcompress()
  sipcompress()

  v02 = _mm_xor_si128(v02, v13);
  v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1, 0, 3, 2)));
  lo = _mm_cvtsi128_si32(v02);
  return lo;
}

int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));
        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }

    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));
        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0u);
    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)),
                   v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
                 std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}

inline int rand() {
  __m128i split;
  __m128i multi;
  __m128i adder;
  __m128i mmask;
  __m128i smask;
  __m128i store;

  DATA(multi) = {0x000343FD, 0x000043FD, 0x000343FD, 0x00010DCD};
  DATA(adder) = {0x00269EC3, 0x009E9EC3, 0x00D19EC3, 0x00000001};
  DATA(mmask) = {0xFFFFFFFF, 0x00000000, 0xFFFFFFFF, 0x00000000};
  DATA(smask) = {0x00007FFF, 0x00007FFF, 0x00007FFF, 0x00007FFF};
  #undef DATA

  adder = _mm_load_si128((__m128i*)data_adder);
  multi = _mm_load_si128((__m128i*)data_multi);
  mmask = _mm_load_si128((__m128i*)data_mmask);
  smask = _mm_load_si128((__m128i*)data_smask);

  split = _mm_shuffle_epi32(__ccaprice_stdlib_rseed,
                            __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE);
  __ccaprice_stdlib_rseed = _mm_mul_epu32(__ccaprice_stdlib_rseed, multi);
  multi = _mm_shuffle_epi32(multi, __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE);
  split = _mm_mul_epu32(split, multi);
  __ccaprice_stdlib_rseed = _mm_and_si128(__ccaprice_stdlib_rseed, mmask);
  split = _mm_and_si128(split, mmask);
  split = _mm_shuffle_epi32(split, __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE);
  __ccaprice_stdlib_rseed = _mm_or_si128(__ccaprice_stdlib_rseed, split);
  __ccaprice_stdlib_rseed = _mm_add_epi32(__ccaprice_stdlib_rseed, adder);

  store = _mm_srai_epi32(__ccaprice_stdlib_rseed, 0x10);
  store = _mm_and_si128(store, smask);

  return (unsigned int)_mm_cvtsi128_si32(store);

  #undef __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE
  #undef __CCAPRICE_STDLIB_RANDOM_SSE_STAIRS2
  #undef __CCAPRICE_STDLIB_RANDOM_SSE_STAIRS1
}

INLINE static unsigned sum_block_avx2(__m128i *ver_row) {
  __m128i sad = _mm_setzero_si128();

  haddwd_accumulate_avx2(&sad, ver_row + 0);
  haddwd_accumulate_avx2(&sad, ver_row + 1);
  haddwd_accumulate_avx2(&sad, ver_row + 2);
  haddwd_accumulate_avx2(&sad, ver_row + 3);
  haddwd_accumulate_avx2(&sad, ver_row + 4);
  haddwd_accumulate_avx2(&sad, ver_row + 5);
  haddwd_accumulate_avx2(&sad, ver_row + 6);
  haddwd_accumulate_avx2(&sad, ver_row + 7);

  // Fold the four 32-bit lanes into lane 0.
  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1)));
  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0)));

  return _mm_cvtsi128_si32(sad);
}

// Per-channel: a = (c0 + c1) / 2, then clamp(a + (a - c2) / 2) to [0, 255].
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i avg = _mm_add_epi16(C1, C0);
  const __m128i A0 = _mm_srli_epi16(avg, 1);
  const __m128i A1 = _mm_sub_epi16(A0, B0);
  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  const __m128i A3 = _mm_srai_epi16(A2, 1);
  const __m128i A4 = _mm_add_epi16(A0, A3);
  const __m128i A5 = _mm_packus_epi16(A4, A4);
  const uint32_t output = _mm_cvtsi128_si32(A5);
  return output;
}

//lower - usually target
//upper - usually source; its alpha decides how much of the lower color is visible
//always does the full blending operation, does not optimize based on A=FF being true 90% of the time and A=00 90% of the remainder
static inline uint32_t blend_8888_on_8888(uint32_t argb_lower, uint32_t argb_upper) {
#ifdef __SSE2__
  //no need to extend this above 128bit, it's complex enough without having to consider multiple pixels at once
  uint32_t spx = argb_upper;
  uint32_t tpx = argb_lower;

  //contains u16: spx.a, spx.b, spx.g, spx.r, tpx.{a,b,g,r}
  __m128i vals = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, spx, tpx), _mm_setzero_si128());

  //contains u16: {sa}*4, {255-sa}*4
  __m128i alphas = _mm_xor_si128(_mm_set1_epi16(spx>>24),
                                 _mm_set_epi16(0,0,0,0, 255,255,255,255));

  //contains u16: pixel contributions times 255
  __m128i newcols255 = _mm_mullo_epi16(vals, alphas);

  //ugly magic constants: (u16)*8081>>16>>7 = (u16)/255
  __m128i newcols = _mm_srli_epi16(_mm_mulhi_epu16(newcols255, _mm_set1_epi16(0x8081)), 7);

  //contains u8: {don't care}*8, sac (source alpha contribution), sbc, sgc, src, tac, tbc, tgc, trc
  __m128i newpack = _mm_packus_epi16(newcols, _mm_undefined_si128());

  //contains u8: {don't care}*12, sac+tac = result alpha, sbc+tbc, sgc+tgc, src+trc
  //the components are known to not overflow
  __m128i newpacksum = _mm_add_epi8(newpack, _mm_srli_si128(newpack, 32/8));

  return _mm_cvtsi128_si32(newpacksum);
#else
  uint8_t sr = argb_upper>>0;
  uint8_t sg = argb_upper>>8;
  uint8_t sb = argb_upper>>16;
  uint8_t sa = argb_upper>>24;

  uint8_t tr = argb_lower>>0;
  uint8_t tg = argb_lower>>8;
  uint8_t tb = argb_lower>>16;
  uint8_t ta = argb_lower>>24;

  tr = (sr*sa/255) + (tr*(255-sa)/255);
  tg = (sg*sa/255) + (tg*(255-sa)/255);
  tb = (sb*sa/255) + (tb*(255-sa)/255);
  ta = (sa*sa/255) + (ta*(255-sa)/255);

  return ta<<24 | tb<<16 | tg<<8 | tr<<0;
#endif
}

int countZeroBytes_SSE(char* values, int length) {
  int zeroCount = 0;
  __m128i zero16 = _mm_set1_epi8(0);
  __m128i and16 = _mm_set1_epi8(1);

  for (int i = 0; i < length; i += 16) {
    __m128i values16 = _mm_loadu_si128((__m128i*)&values[i]);
    __m128i cmp = _mm_cmpeq_epi8(values16, zero16);
    if (_mm_movemask_epi8(cmp)) {
      cmp = _mm_and_si128(and16, cmp);  //change -1 values to 1
      //horizontal sum of 16 bytes
      __m128i sum1 = _mm_sad_epu8(cmp, zero16);
      __m128i sum2 = _mm_shuffle_epi32(sum1, 2);
      __m128i sum3 = _mm_add_epi16(sum1, sum2);
      zeroCount += _mm_cvtsi128_si32(sum3);
    }
  }
  return zeroCount;
}

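// Plain scalar reference for checking the SSE path above (which, as written,
// assumes length is a multiple of 16). The function name is illustrative.
int countZeroBytes_scalar(const char* values, int length) {
  int zeroCount = 0;
  for (int i = 0; i < length; ++i) {
    if (values[i] == 0) zeroCount++;
  }
  return zeroCount;
}
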
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2. Resulting in a more
 * precise version of a box filter 4:2:0 pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      const __m128i top = _mm_loadl_epi64((__m128i *)input);
      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      sum = _mm_hadd_epi16(sum, sum);
      *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
    } else {
      const __m128i top = _mm_loadu_si128((__m128i *)input);
      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      if (width == 8) {
        sum = _mm_hadd_epi16(sum, sum);
        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
      } else {
        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        const __m128i bot_1 =
            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
        if (width == 32) {
          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
          const __m128i bot_2 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
          const __m128i bot_3 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
                           _mm_add_epi16(next_sum, next_sum));
        }
      }
    }
    input += luma_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}

//FINL
int16 __ext_v_sum_int16(int16* x, int len) {
  __m128i msum = _mm_setzero_si128();
#ifdef SORA_PLATFORM
  __int16 ret;
#else
  int16_t ret;
#endif
  const int wlen = 8;

  for (int i = 0; i < len / wlen; i++) {
    __m128i mx = _mm_loadu_si128((__m128i *)(x + wlen * i));
    msum = _mm_add_epi16(msum, mx);
  }

  // Fold the four 32-bit lanes together; the two 16-bit halves are added below.
  __m128i mout = msum;
  msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
  mout = _mm_add_epi16(mout, msum);
  msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
  mout = _mm_add_epi16(mout, msum);
  msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
  mout = _mm_add_epi16(mout, msum);

  unsigned int temp = _mm_cvtsi128_si32(mout);
  ret = temp;
  ret += (temp >> 16);

  for (int i = (len / wlen) * wlen; i < len; i++) {
    ret += x[i];
  }
  return ret;
}

static void aom_filter_block1d4_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;

  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // converting the 16 bit (short) to 8 bit (byte) and have the same data
  // in both lanes of 128 bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));

  for (i = output_height; i > 0; i -= 1) {
    // load the 2 strides of source
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    // shift by 6 bit each 16 bit
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink to 8 bit each 16 bits, the first lane contain the first
    // convolve result and the second lane contain the second convolve result
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}

void bitund132(unsigned *p, unsigned n, unsigned x) {
#ifdef __SSE2__
  __m128i sv = _mm_set1_epi32(x), cv = _mm_set_epi32(4, 3, 2, 1);
  unsigned *ip;

  for (ip = p; ip != p + (n & ~(4 - 1)); ) {
    __m128i v = _mm_loadu_si128((__m128i *)ip);
    SCANI128_32(v, sv, cv);
    _mm_storeu_si128((__m128i *)ip, sv);
    ip += 4;
  }
  x = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv, 12));
  while (ip < p + n) {
    *ip = (x += (*ip) + 1);
    ip++;
  }
#else
  BITUNDELTA(p, n, x, 1);
#endif
}

unsigned bitdelta32(unsigned *in, unsigned n, unsigned *out, unsigned start,
                    unsigned inc) {
#ifdef __SSE2__
  unsigned *ip, b, *op = out;
  __m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start),
          cv = _mm_set1_epi32(inc), dv;

  for (ip = in; ip != in + (n & ~(4 - 1)); ip += 4) {
    __m128i iv = _mm_loadu_si128((__m128i *)ip);
    bv = _mm_or_si128(bv, dv = _mm_sub_epi32(DELTA128_32(iv, sv), cv));
    sv = iv;
    _mm_storeu_si128((__m128i *)op, dv);
    op += 4;
  }
  start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv, 12));
  HOR128_32(bv, b);
  while (ip < in + n) {
    unsigned x = *ip - start - inc;
    start = *ip++;
    b |= x;
    *op++ = x;
  }
#else
  typeof(in[0]) b = 0, *op = out;
  BITDELTA(in, n, inc, start, b |= _x; *op++ = _x);
#endif
  return bsr32(b);
}

static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    // Load two rows at a time, this seems to be a bit faster
    // than four rows at a time in this case.
    const __m128i src = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
    const __m128i m =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data = _mm_unpacklo_epi8(a, b);
    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // At this point, the SAD is stored in lane 0 of 'res'
  int32_t sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}

void ie_FillLine(iePwL pDst, DWORD nXW, iewL clr) {
#ifndef __X64__
  if (g_bSSE2 && (nXW >= 16)) {
#else
  if (nXW >= 16) {
#endif
    // Do fill using SSE2!
    while (nXW) {
      // Fill until destination is aligned
      if (_mm_isAligned(pDst)) break;
      *pDst++ = clr;
      nXW--;
    }

    __m128i r0 = _mm_set1_epi16(clr);

    for (DWORD nXW_8 = nXW >> 3; nXW_8--;) {
      _mm_store_si128((__m128i *)pDst, r0);
      pDst += 8;
    }

    if (nXW & 4) {
      _mm_storel_epi64((__m128i *)pDst, r0);
      pDst += 4;
    }
    if (nXW & 2) {
      *PDWORD(pDst) = _mm_cvtsi128_si32(r0);
      pDst += 2;
    }
    if (nXW & 1) {
      *PWORD(pDst) = clr;
    }
    return;
  }

  while (nXW--) *pDst++ = clr;
}

inline Pixel GetPixelSSE(const Image* img, float x, float y) {
  const int stride = img->width;
  const Pixel* p0 = img->data + (int)x + (int)y * stride;  // pointer to first pixel

  // Load the data (2 pixels in one load)
  __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]);
  __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]);

  __m128 weight = CalcWeights(x, y);

  // extend to 16bit
  p12 = _mm_unpacklo_epi8(p12, _mm_setzero_si128());
  p34 = _mm_unpacklo_epi8(p34, _mm_setzero_si128());

  // convert floating point weights to 16bit integer
  weight = _mm_mul_ps(weight, CONST_256);
  __m128i weighti = _mm_cvtps_epi32(weight);                // w4 w3 w2 w1
  weighti = _mm_packs_epi32(weighti, _mm_setzero_si128());  // 32->16bit

  // prepare the weights
  __m128i w12 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(1, 1, 0, 0));
  __m128i w34 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(3, 3, 2, 2));
  w12 = _mm_unpacklo_epi16(w12, w12);  // w2 w2 w2 w2 w1 w1 w1 w1
  w34 = _mm_unpacklo_epi16(w34, w34);  // w4 w4 w4 w4 w3 w3 w3 w3

  // multiply each pixel with its weight (2 pixel per SSE mul)
  __m128i L12 = _mm_mullo_epi16(p12, w12);
  __m128i L34 = _mm_mullo_epi16(p34, w34);

  // sum the results
  __m128i L1234 = _mm_add_epi16(L12, L34);
  __m128i Lhi = _mm_shuffle_epi32(L1234, _MM_SHUFFLE(3, 2, 3, 2));
  __m128i L = _mm_add_epi16(L1234, Lhi);

  // convert back to 8bit
  __m128i L8 = _mm_srli_epi16(L, 8);  // divide by 256
  L8 = _mm_packus_epi16(L8, _mm_setzero_si128());

  // return
  return _mm_cvtsi128_si32(L8);
}