static uint32_t utoa64_sse2(uint64_t value, char* buffer) { char* start = buffer; if (value < 100000000) { uint32_t v = static_cast<uint32_t>(value); if (v < 10000) { const uint32_t d1 = (v / 100) << 1; const uint32_t d2 = (v % 100) << 1; if (v >= 1000) *buffer++ = u_ctn2s[d1]; if (v >= 100) *buffer++ = u_ctn2s[d1+1]; if (v >= 10) *buffer++ = u_ctn2s[d2]; *buffer++ = u_ctn2s[d2+1]; return (buffer - start); } // Experiment shows that in this case SSE2 is slower # if 0 const __m128i a = Convert8DigitsSSE2(v); // Convert to bytes, add '0' const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); // Count number of digit const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0])); unsigned long digit; # ifdef _MSC_VER _BitScanForward(&digit, ~mask | 0x8000); # else digit = __builtin_ctz(~mask | 0x8000); # endif // Shift digits to the beginning __m128i result = ShiftDigits_SSE2(va, digit); _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); return (buffer + 8 - digit - start); # else // value = bbbbcccc const uint32_t b = v / 10000; const uint32_t c = v % 10000; const uint32_t d1 = (b / 100) << 1; const uint32_t d2 = (b % 100) << 1; const uint32_t d3 = (c / 100); const uint32_t d4 = (c % 100); if (value >= 10000000) *buffer++ = u_ctn2s[d1]; if (value >= 1000000) *buffer++ = u_ctn2s[d1+1]; if (value >= 100000) *buffer++ = u_ctn2s[d2]; *buffer++ = u_ctn2s[d2+1]; U_NUM2STR16(buffer, d3); U_NUM2STR16(buffer+2, d4); return (buffer + 4 - start); # endif } if (value < 10000000000000000) { const uint32_t v0 = static_cast<uint32_t>(value / 100000000); const uint32_t v1 = static_cast<uint32_t>(value % 100000000); const __m128i a0 = Convert8DigitsSSE2(v0); const __m128i a1 = Convert8DigitsSSE2(v1); // Convert to bytes, add '0' const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); // Count number of digit const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0])); # ifdef _MSC_VER unsigned long digit; _BitScanForward(&digit, ~mask | 0x8000); # else unsigned digit = __builtin_ctz(~mask | 0x8000); # endif // Shift digits to the beginning __m128i result = ShiftDigits_SSE2(va, digit); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); return (buffer + 16 - digit - start); } const uint32_t a = static_cast<uint32_t>(value / 10000000000000000); // 1 to 1844 value %= 10000000000000000; if (a < 10) *buffer++ = '0' + (char)a; else if (a < 100) { U_NUM2STR16(buffer, a); buffer += 2; } else if (a < 1000) { *buffer++ = '0' + static_cast<char>(a / 100); const uint32_t i = (a % 100); U_NUM2STR16(buffer, i); buffer += 2; } else { const uint32_t i = (a / 100); const uint32_t j = (a % 100); U_NUM2STR16(buffer, i); U_NUM2STR16(buffer+2, j); buffer += 4; } const uint32_t v0 = static_cast<uint32_t>(value / 100000000); const uint32_t v1 = static_cast<uint32_t>(value % 100000000); const __m128i a0 = Convert8DigitsSSE2(v0); const __m128i a1 = Convert8DigitsSSE2(v1); // Convert to bytes, add '0' const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), va); return (buffer + 16 - start); }
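// utoa64_sse2 depends on helpers not shown in this excerpt: Convert8DigitsSSE2,
// ShiftDigits_SSE2 and the kAsciiZero constant. A minimal sketch of
// ShiftDigits_SSE2, under the assumption that it behaves as in Milo Yip's
// itoa benchmark (which this code appears to derive from): shift the register
// right by 'digit' bytes so the significant digits start at byte 0.
// _mm_srli_si128 needs an immediate count, hence the switch.
#include <emmintrin.h>
#include <assert.h>

static inline __m128i ShiftDigits_SSE2(__m128i a, unsigned digit) {
    assert(digit <= 8);
    switch (digit) {
        case 0: return a;
        case 1: return _mm_srli_si128(a, 1);
        case 2: return _mm_srli_si128(a, 2);
        case 3: return _mm_srli_si128(a, 3);
        case 4: return _mm_srli_si128(a, 4);
        case 5: return _mm_srli_si128(a, 5);
        case 6: return _mm_srli_si128(a, 6);
        case 7: return _mm_srli_si128(a, 7);
        case 8: return _mm_srli_si128(a, 8);
    }
    return a;
}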
// Does one or two inverse transforms.
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst, int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  //
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85627  =>  k1 = 20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant becomes the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k) >> 16) + x
  const __m128i k1 = _mm_set1_epi16(20091);
  const __m128i k2 = _mm_set1_epi16(-30068);
  __m128i T0, T1, T2, T3;

  // Load and concatenate the transform coefficients (we'll do two inverse
  // transforms in parallel). In the case of only one inverse transform, the
  // second half of the vectors will just contain random values we'll never
  // use nor store.
  __m128i in0, in1, in2, in3;
  {
    in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
    in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
    in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
    in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
      const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
      const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
      const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
      const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
      in3 = _mm_unpacklo_epi64(in3, inB3);
      // a00 a10 a20 a30   b00 b10 b20 b30
      // a01 a11 a21 a31   b01 b11 b21 b31
      // a02 a12 a22 a32   b02 b12 b22 b32
      // a03 a13 a23 a33   b03 b13 b23 b33
    }
  }

  // Vertical pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i a = _mm_add_epi16(in0, in2);
    const __m128i b = _mm_sub_epi16(in0, in2);
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
    const __m128i c3 = _mm_sub_epi16(in1, in3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
    const __m128i d3 = _mm_add_epi16(in1, in3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30   a01 a11 a21 a31
    // b00 b10 b20 b30   b01 b11 b21 b31
    // a02 a12 a22 a32   a03 a13 a23 a33
    // b02 b12 b22 b32   b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i four = _mm_set1_epi16(4);
    const __m128i dc = _mm_add_epi16(T0, four);
    const __m128i a = _mm_add_epi16(dc, T2);
    const __m128i b = _mm_sub_epi16(dc, T2);
    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
    const __m128i c3 = _mm_sub_epi16(T1, T3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
    const __m128i d3 = _mm_add_epi16(T1, T3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);
    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30   a01 a11 a21 a31
    // b00 b10 b20 b30   b01 b11 b21 b31
    // a02 a12 a22 a32   a03 a13 a23 a33
    // b02 b12 b22 b32   b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Add inverse transform to 'ref' and store.
  {
    const __m128i zero = _mm_setzero_si128();
    // Load the reference(s).
    __m128i ref0, ref1, ref2, ref3;
    if (do_two) {
      // Load eight bytes/pixels per line.
      ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
      ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
      ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
      ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
    } else {
      // Load four bytes/pixels per line.
      ref0 = _mm_cvtsi32_si128(*(const int*)&ref[0 * BPS]);
      ref1 = _mm_cvtsi32_si128(*(const int*)&ref[1 * BPS]);
      ref2 = _mm_cvtsi32_si128(*(const int*)&ref[2 * BPS]);
      ref3 = _mm_cvtsi32_si128(*(const int*)&ref[3 * BPS]);
    }
    // Convert to 16b.
    ref0 = _mm_unpacklo_epi8(ref0, zero);
    ref1 = _mm_unpacklo_epi8(ref1, zero);
    ref2 = _mm_unpacklo_epi8(ref2, zero);
    ref3 = _mm_unpacklo_epi8(ref3, zero);
    // Add the inverse transform(s).
    ref0 = _mm_add_epi16(ref0, T0);
    ref1 = _mm_add_epi16(ref1, T1);
    ref2 = _mm_add_epi16(ref2, T2);
    ref3 = _mm_add_epi16(ref3, T3);
    // Unsigned saturate to 8b.
    ref0 = _mm_packus_epi16(ref0, ref0);
    ref1 = _mm_packus_epi16(ref1, ref1);
    ref2 = _mm_packus_epi16(ref2, ref2);
    ref3 = _mm_packus_epi16(ref3, ref3);
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
    } else {
      // Store four bytes/pixels per line.
      *((int32_t*)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0);
      *((int32_t*)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1);
      *((int32_t*)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);
      *((int32_t*)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);
    }
  }
}
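// Standalone check (hypothetical test code, not part of the codec) of the
// fixed-point identity the comment block above relies on: for
// K = k + (1 << 16), (x * K) >> 16 == ((x * k) >> 16) + x, assuming
// arithmetic right shift, which is exactly what _mm_mulhi_epi16 provides
// for the (x * k) >> 16 term.
#include <assert.h>
#include <stdint.h>

static void check_mul_trick(void) {
    const int32_t K1 = 85627;
    const int32_t k1 = 20091;  // K1 - (1 << 16)
    for (int32_t x = -2048; x <= 2047; x++) {  // DCT coefficients are small
        assert(((x * K1) >> 16) == (((x * k1) >> 16) + x));
    }
}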
void vp9_filter_block1d16_v8_intrin_ssse3(unsigned char *src_ptr, unsigned int src_pitch, unsigned char *output_ptr, unsigned int out_pitch, unsigned int output_height, int16_t *filter) { __m128i addFilterReg64, filtersReg, srcRegFilt1, srcRegFilt3; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt5, srcRegFilt6, srcRegFilt7, srcRegFilt8; __m128i srcReg1, srcReg2, srcReg3, srcReg4, srcReg5, srcReg6, srcReg7; __m128i srcReg8; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((__m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits in the filter firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); // duplicate only the second 16 bits in the filter secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits in the filter thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // duplicate only the forth 16 bits in the filter forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); // load the first 7 rows of 16 bytes srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr)); srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch)); srcReg3 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 2)); srcReg4 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 3)); srcReg5 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 4)); srcReg6 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 5)); srcReg7 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 6)); for (i = 0; i < output_height; i++) { // load the last 16 bytes srcReg8 = _mm_loadu_si128((__m128i *)(src_ptr + src_pitch * 7)); // merge the result together srcRegFilt5 = _mm_unpacklo_epi8(srcReg1, srcReg2); srcRegFilt6 = _mm_unpacklo_epi8(srcReg7, srcReg8); srcRegFilt1 = _mm_unpackhi_epi8(srcReg1, srcReg2); srcRegFilt3 = _mm_unpackhi_epi8(srcReg7, srcReg8); // multiply 2 adjacent elements with the filter and add the result srcRegFilt5 = _mm_maddubs_epi16(srcRegFilt5, firstFilters); srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, forthFilters); srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, forthFilters); // add and saturate the results together srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, srcRegFilt6); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3); // merge the result together srcRegFilt3 = _mm_unpacklo_epi8(srcReg3, srcReg4); srcRegFilt6 = _mm_unpackhi_epi8(srcReg3, srcReg4); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); srcRegFilt6 = _mm_maddubs_epi16(srcRegFilt6, secondFilters); // merge the result together srcRegFilt7 = _mm_unpacklo_epi8(srcReg5, srcReg6); srcRegFilt8 = _mm_unpackhi_epi8(srcReg5, srcReg6); // multiply 2 adjacent elements with the filter and add the result srcRegFilt7 = _mm_maddubs_epi16(srcRegFilt7, thirdFilters); srcRegFilt8 = _mm_maddubs_epi16(srcRegFilt8, thirdFilters); // add and saturate the results together srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, _mm_min_epi16(srcRegFilt3, srcRegFilt7)); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_min_epi16(srcRegFilt6, srcRegFilt8)); // add and saturate the results together srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, _mm_max_epi16(srcRegFilt3, srcRegFilt7)); 
srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, _mm_max_epi16(srcRegFilt6, srcRegFilt8)); srcRegFilt5 = _mm_adds_epi16(srcRegFilt5, addFilterReg64); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); // shift by 7 bit each 16 bit srcRegFilt5 = _mm_srai_epi16(srcRegFilt5, 7); srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1 = _mm_packus_epi16(srcRegFilt5, srcRegFilt1); src_ptr+=src_pitch; // shift down a row srcReg1 = srcReg2; srcReg2 = srcReg3; srcReg3 = srcReg4; srcReg4 = srcReg5; srcReg5 = srcReg6; srcReg6 = srcReg7; srcReg7 = srcReg8; // save 16 bytes convolve result _mm_store_si128((__m128i*)output_ptr, srcRegFilt1); output_ptr+=out_pitch; } }
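// For reference, a scalar model (my sketch, not part of libvpx) of one output
// pixel of the 8-tap vertical convolve above: multiply 8 vertically adjacent
// pixels by the taps in 'filter', add the rounding constant 64 (matching
// addFilterReg64), shift by 7 and clamp to [0, 255]. The min/max pairing in
// the SIMD loop only reorders the saturating additions so intermediate sums
// stay in 16-bit range; the arithmetic result is the same.
#include <stdint.h>

static uint8_t convolve8_v_ref(const uint8_t *src, unsigned src_pitch,
                               const int16_t *filter) {
    int sum = 64;
    for (int k = 0; k < 8; k++)
        sum += filter[k] * src[k * src_pitch];
    sum >>= 7;
    if (sum < 0) sum = 0;
    if (sum > 255) sum = 255;
    return (uint8_t)sum;
}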
void GetMinMaxColors_Intrinsics( const byte *colorBlock, byte *minColor, byte *maxColor )
{
    __m128i t0, t1, t3, t4, t6, t7;

    // get bounding box
    // ----------------

    // load the first row
    t0 = _mm_load_si128 ( (__m128i*) colorBlock );
    t1 = _mm_load_si128 ( (__m128i*) colorBlock );

    __m128i t16 = _mm_load_si128 ( (__m128i*) (colorBlock+16) );
    // Minimum of Packed Unsigned Byte Integers
    t0 = _mm_min_epu8 ( t0, t16);
    // Maximum of Packed Unsigned Byte Integers
    t1 = _mm_max_epu8 ( t1, t16);

    __m128i t32 = _mm_load_si128 ( (__m128i*) (colorBlock+32) );
    t0 = _mm_min_epu8 ( t0, t32);
    t1 = _mm_max_epu8 ( t1, t32);

    __m128i t48 = _mm_load_si128 ( (__m128i*) (colorBlock+48) );
    t0 = _mm_min_epu8 ( t0, t48);
    t1 = _mm_max_epu8 ( t1, t48);

    // Shuffle Packed Doublewords
    t3 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t4 = _mm_shuffle_epi32( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t0 = _mm_min_epu8 ( t0, t3);
    t1 = _mm_max_epu8 ( t1, t4);

    // Shuffle Packed Low Words
    t6 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t7 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 2, 3, 2, 3 ) );
    t0 = _mm_min_epu8 ( t0, t6);
    t1 = _mm_max_epu8 ( t1, t7);

    // inset the bounding box
    // ----------------------

    // Unpack Low Data
    //__m128i t66 = _mm_set1_epi8( 0 );
    __m128i t66 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_0 );
    t0 = _mm_unpacklo_epi8(t0, t66);
    t1 = _mm_unpacklo_epi8(t1, t66);

    // copy (movdqa)
    //__m128i t2 = _mm_load_si128 ( &t1 );
    __m128i t2 = t1;

    // Subtract Packed Integers
    t2 = _mm_sub_epi16(t2, t0);

    // Shift Packed Data Right Logical
    t2 = _mm_srli_epi16(t2, INSET_SHIFT);

    // Add Packed Integers
    t0 = _mm_add_epi16(t0, t2);
    t1 = _mm_sub_epi16(t1, t2);

    // Pack with Unsigned Saturation
    t0 = _mm_packus_epi16(t0, t0);
    t1 = _mm_packus_epi16(t1, t1);

    // store bounding box extents
    // --------------------------
    _mm_store_si128 ( (__m128i*) minColor, t0 );
    _mm_store_si128 ( (__m128i*) maxColor, t1 );
}
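// Assumed support definitions (sketch; these live elsewhere in the id Tech
// codebase and should be treated as assumptions). R_SHUFFLE_D packs four
// 2-bit lane selectors into the immediate used by _mm_shuffle_epi32 and
// _mm_shufflelo_epi16; SIMD_SSE2_byte_0 is a 16-byte block of zeros; an
// INSET_SHIFT of 4 insets each side of the bounding box by 1/16th of its
// extent.
#include <emmintrin.h>

#define R_SHUFFLE_D( x, y, z, w ) (( (x) & 3 ) | (( (y) & 3 ) << 2) | (( (z) & 3 ) << 4) | (( (w) & 3 ) << 6))
#define INSET_SHIFT 4
alignas(16) static const unsigned char SIMD_SSE2_byte_0[16] = { 0 };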
mlib_status __mlib_ImageBlendRGBA2ARGB( mlib_image *dst, const mlib_image *src) { mlib_type type; mlib_u8 *sl, *dl; mlib_s32 slb, dlb, nchan, width, height; mlib_s32 i, j, ii, off; P_TYPE *sp, *dp; P_TYPE ss, aa, ds, dd, d_h, d_l; P_TYPE mzero, const255, mask64, d_half; MLIB_IMAGE_CHECK(dst); MLIB_IMAGE_CHECK(src); MLIB_IMAGE_FULL_EQUAL(dst, src); MLIB_IMAGE_GET_ALL_PARAMS(dst, type, nchan, width, height, dlb, dl); slb = mlib_ImageGetStride(src); sl = mlib_ImageGetData(src); if (type != MLIB_BYTE || nchan != 4) { return (MLIB_FAILURE); } mzero = _mm_setzero_si128(); const255 = _mm_set1_epi32(0x00ff00ff); mask64 = _mm_set1_epi32(0xffffff00); d_half = _mm_set1_epi32(0x00800080); for (j = 0; j < height; j++) { P_TYPE alp, a0, a1, ralp, s0, s1, d0, d1, drnd; mlib_m128 s0u, s1u; sp = (void *)sl; dp = (void *)dl; if (!(((mlib_s32)sp | (mlib_s32)dp) & 15)) { for (i = 0; i < (width / 4); i++) { ss = _mm_load_si128(sp); dd = _mm_load_si128(dp); s0 = _mm_unpacklo_epi8(ss, mzero); a0 = _mm_shufflelo_epi16(s0, 0xff); a0 = _mm_shufflehi_epi16(a0, 0xff); s0 = _mm_shufflelo_epi16(s0, 0x93); s0 = _mm_shufflehi_epi16(s0, 0x93); BLEND(d_h, a0, s0, _mm_unpacklo_epi8(dd, mzero)); s1 = _mm_unpackhi_epi8(ss, mzero); a1 = _mm_shufflelo_epi16(s1, 0xff); a1 = _mm_shufflehi_epi16(a1, 0xff); s1 = _mm_shufflelo_epi16(s1, 0x93); s1 = _mm_shufflehi_epi16(s1, 0x93); BLEND(d_l, a1, s1, _mm_unpackhi_epi8(dd, mzero)); d_h = _mm_packus_epi16(d_h, d_l); d_h = _mm_or_si128(_mm_and_si128(mask64, d_h), _mm_andnot_si128(mask64, dd)); _mm_store_si128(dp, d_h); sp++; dp++; } } else { for (i = 0; i < (width / 4); i++) { #if 0 ss = _mm_loadu_si128(sp); s0 = _mm_unpacklo_epi8(ss, mzero); s1 = _mm_unpackhi_epi8(ss, mzero); #else s0u.m128d = _mm_load_sd((mlib_d64 *)sp); s1u.m128d = _mm_load_sd((mlib_d64 *)sp + 1); s0 = _mm_unpacklo_epi8(s0u.m128i, mzero); s1 = _mm_unpacklo_epi8(s1u.m128i, mzero); #endif dd = _mm_loadu_si128(dp); a0 = _mm_shufflelo_epi16(s0, 0xff); a0 = _mm_shufflehi_epi16(a0, 0xff); s0 = _mm_shufflelo_epi16(s0, 0x93); s0 = _mm_shufflehi_epi16(s0, 0x93); BLEND(d_h, a0, s0, _mm_unpacklo_epi8(dd, mzero)); a1 = _mm_shufflelo_epi16(s1, 0xff); a1 = _mm_shufflehi_epi16(a1, 0xff); s1 = _mm_shufflelo_epi16(s1, 0x93); s1 = _mm_shufflehi_epi16(s1, 0x93); BLEND(d_l, a1, s1, _mm_unpackhi_epi8(dd, mzero)); d_h = _mm_packus_epi16(d_h, d_l); d_h = _mm_or_si128(_mm_and_si128(mask64, d_h), _mm_andnot_si128(mask64, dd)); #if 1 _mm_storeu_si128(dp, d_h); #else s0u.m128i = d_h; s1u.m128i = _mm_shuffle_epi32(d_h, 0x3e); _mm_store_sd((mlib_d64 *)dp, s0u.m128d); _mm_store_sd((mlib_d64 *)dp + 1, s1u.m128d); #endif sp++; dp++; } } if (width & 3) { s0u.m128d = _mm_load_sd((mlib_d64 *)sp); s1u.m128d = _mm_load_sd((mlib_d64 *)sp + 1); s0 = _mm_unpacklo_epi8(s0u.m128i, mzero); s1 = _mm_unpacklo_epi8(s1u.m128i, mzero); dd = _mm_loadu_si128(dp); a0 = _mm_shufflelo_epi16(s0, 0xff); a0 = _mm_shufflehi_epi16(a0, 0xff); s0 = _mm_shufflelo_epi16(s0, 0x93); s0 = _mm_shufflehi_epi16(s0, 0x93); BLEND(d_h, a0, s0, _mm_unpacklo_epi8(dd, mzero)); a1 = _mm_shufflelo_epi16(s1, 0xff); a1 = _mm_shufflehi_epi16(a1, 0xff); s1 = _mm_shufflelo_epi16(s1, 0x93); s1 = _mm_shufflehi_epi16(s1, 0x93); BLEND(d_l, a1, s1, _mm_unpackhi_epi8(dd, mzero)); d_h = _mm_packus_epi16(d_h, d_l); d_h = _mm_or_si128(_mm_and_si128(mask64, d_h), _mm_andnot_si128(mask64, dd)); for (ii = 0; ii < (width & 3); ii++) { ((mlib_s32 *)dp)[ii] = ((mlib_s32 *)&d_h)[ii]; } } sl += slb; dl += dlb; } return (MLIB_SUCCESS); }
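/*
 * The BLEND macro used above is not shown. Given the constants involved
 * (const255 = 0x00ff per 16-bit lane, d_half = 0x0080 per lane), it
 * presumably implements "src over dst" with the classic exact rounded
 * division by 255. A scalar model of that arithmetic (my assumption, not
 * mediaLib's actual macro): d' = round((s * a + d * (255 - a)) / 255),
 * where the +128 term corresponds to d_half and t + (t >> 8) performs
 * the division by 255.
 */
#include <stdint.h>

static uint8_t blend_ref(uint8_t s, uint8_t d, uint8_t a) {
    uint32_t t = (uint32_t)s * a + (uint32_t)d * (255 - a) + 128;
    return (uint8_t)((t + (t >> 8)) >> 8);
}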
inline void u64toa_sse2(uint64_t value, char* buffer) { if (value < 100000000) { uint32_t v = static_cast<uint32_t>(value); if (v < 10000) { const uint32_t d1 = (v / 100) << 1; const uint32_t d2 = (v % 100) << 1; if (v >= 1000) *buffer++ = gDigitsLut[d1]; if (v >= 100) *buffer++ = gDigitsLut[d1 + 1]; if (v >= 10) *buffer++ = gDigitsLut[d2]; *buffer++ = gDigitsLut[d2 + 1]; *buffer++ = '\0'; } else { // Experiment shows that this case SSE2 is slower #if 0 const __m128i a = Convert8DigitsSSE2(v); // Convert to bytes, add '0' const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); // Count number of digit const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0])); unsigned long digit; #ifdef _MSC_VER _BitScanForward(&digit, ~mask | 0x8000); #else digit = __builtin_ctz(~mask | 0x8000); #endif // Shift digits to the beginning __m128i result = ShiftDigits_SSE2(va, digit); _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); buffer[8 - digit] = '\0'; #else // value = bbbbcccc const uint32_t b = v / 10000; const uint32_t c = v % 10000; const uint32_t d1 = (b / 100) << 1; const uint32_t d2 = (b % 100) << 1; const uint32_t d3 = (c / 100) << 1; const uint32_t d4 = (c % 100) << 1; if (value >= 10000000) *buffer++ = gDigitsLut[d1]; if (value >= 1000000) *buffer++ = gDigitsLut[d1 + 1]; if (value >= 100000) *buffer++ = gDigitsLut[d2]; *buffer++ = gDigitsLut[d2 + 1]; *buffer++ = gDigitsLut[d3]; *buffer++ = gDigitsLut[d3 + 1]; *buffer++ = gDigitsLut[d4]; *buffer++ = gDigitsLut[d4 + 1]; *buffer++ = '\0'; #endif } } else if (value < 10000000000000000) { const uint32_t v0 = static_cast<uint32_t>(value / 100000000); const uint32_t v1 = static_cast<uint32_t>(value % 100000000); const __m128i a0 = Convert8DigitsSSE2(v0); const __m128i a1 = Convert8DigitsSSE2(v1); // Convert to bytes, add '0' const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); // Count number of digit const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0])); unsigned long digit; #ifdef _MSC_VER _BitScanForward(&digit, ~mask | 0x8000); #else digit = __builtin_ctz(~mask | 0x8000); #endif // Shift digits to the beginning __m128i result = ShiftDigits_SSE2(va, digit); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), result); buffer[16 - digit] = '\0'; } else { const uint32_t a = static_cast<uint32_t>(value / 10000000000000000); // 1 to 1844 value %= 10000000000000000; if (a < 10) *buffer++ = '0' + static_cast<char>(a); else if (a < 100) { const uint32_t i = a << 1; *buffer++ = gDigitsLut[i]; *buffer++ = gDigitsLut[i + 1]; } else if (a < 1000) { *buffer++ = '0' + static_cast<char>(a / 100); const uint32_t i = (a % 100) << 1; *buffer++ = gDigitsLut[i]; *buffer++ = gDigitsLut[i + 1]; } else { const uint32_t i = (a / 100) << 1; const uint32_t j = (a % 100) << 1; *buffer++ = gDigitsLut[i]; *buffer++ = gDigitsLut[i + 1]; *buffer++ = gDigitsLut[j]; *buffer++ = gDigitsLut[j + 1]; } const uint32_t v0 = static_cast<uint32_t>(value / 100000000); const uint32_t v1 = static_cast<uint32_t>(value % 100000000); const __m128i a0 = Convert8DigitsSSE2(v0); const __m128i a1 = Convert8DigitsSSE2(v1); // Convert to bytes, add '0' const __m128i va = _mm_add_epi8(_mm_packus_epi16(a0, a1), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); _mm_storeu_si128(reinterpret_cast<__m128i*>(buffer), va); buffer[16] = '\0'; } }
int operator()(int** src, uchar* dst, int, int width) const { if( !checkHardwareSupport(CV_CPU_SSE2) ) return 0; int x = 0; const int *row0 = src[0], *row1 = src[1], *row2 = src[2], *row3 = src[3], *row4 = src[4]; __m128i delta = _mm_set1_epi16(128); for( ; x <= width - 16; x += 16 ) { __m128i r0, r1, r2, r3, r4, t0, t1; r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), _mm_load_si128((const __m128i*)(row0 + x + 4))); r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), _mm_load_si128((const __m128i*)(row1 + x + 4))); r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), _mm_load_si128((const __m128i*)(row2 + x + 4))); r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), _mm_load_si128((const __m128i*)(row3 + x + 4))); r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), _mm_load_si128((const __m128i*)(row4 + x + 4))); r0 = _mm_add_epi16(r0, r4); r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); t0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x + 8)), _mm_load_si128((const __m128i*)(row0 + x + 12))); r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x + 8)), _mm_load_si128((const __m128i*)(row1 + x + 12))); r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x + 8)), _mm_load_si128((const __m128i*)(row2 + x + 12))); r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x + 8)), _mm_load_si128((const __m128i*)(row3 + x + 12))); r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x + 8)), _mm_load_si128((const __m128i*)(row4 + x + 12))); r0 = _mm_add_epi16(r0, r4); r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); t1 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); t0 = _mm_srli_epi16(_mm_add_epi16(t0, delta), 8); t1 = _mm_srli_epi16(_mm_add_epi16(t1, delta), 8); _mm_storeu_si128((__m128i*)(dst + x), _mm_packus_epi16(t0, t1)); } for( ; x <= width - 4; x += 4 ) { __m128i r0, r1, r2, r3, r4, z = _mm_setzero_si128(); r0 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row0 + x)), z); r1 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row1 + x)), z); r2 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row2 + x)), z); r3 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row3 + x)), z); r4 = _mm_packs_epi32(_mm_load_si128((const __m128i*)(row4 + x)), z); r0 = _mm_add_epi16(r0, r4); r1 = _mm_add_epi16(_mm_add_epi16(r1, r3), r2); r0 = _mm_add_epi16(r0, _mm_add_epi16(r2, r2)); r0 = _mm_add_epi16(r0, _mm_slli_epi16(r1, 2)); r0 = _mm_srli_epi16(_mm_add_epi16(r0, delta), 8); *(int*)(dst + x) = _mm_cvtsi128_si32(_mm_packus_epi16(r0, r0)); } return x; }
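// Scalar model (sketch) of one output pixel of the loop above: the five input
// rows already hold horizontally filtered sums, and the vertical pass applies
// the binomial kernel [1 4 6 4 1], adds the rounding term 128 (delta) and
// shifts by 8 before saturating to 8 bits.
#include <cstdint>

static uint8_t pyrdown_v_ref(const int *r0, const int *r1, const int *r2,
                             const int *r3, const int *r4, int x) {
    int t = r0[x] + r4[x] + 4 * (r1[x] + r3[x]) + 6 * r2[x] + 128;
    t >>= 8;
    return (uint8_t)(t < 0 ? 0 : (t > 255 ? 255 : t));
}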
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *dstp, const uint8_t *srcp) { uint8_t *p0 = buff + 16; uint8_t *p1 = p0 + bstride; uint8_t *p2 = p1 + bstride; uint8_t *p3 = p2 + bstride; uint8_t *p4 = p3 + bstride; uint8_t *orig = p0, *end = p4; line_copy8(p0, srcp + 2 * stride , width, 2); line_copy8(p1, srcp + stride, width, 2); line_copy8(p2, srcp, width, 2); srcp += stride; line_copy8(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128 rdiv = _mm_set1_ps((float)ch->rdiv); __m128 bias = _mm_set1_ps((float)ch->bias); __m128i matrix[25]; for (int i = 0; i < 25; i++) { matrix[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[i]), zero); } for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy8(p4, srcp, width, 2); uint8_t *array[] = { p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2, p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2, p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2, p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2, p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2 }; for (int x = 0; x < width; x += 16) { __m128i sum[4] = { zero, zero, zero, zero }; for (int i = 0; i < 25; i++) { __m128i xmm0, xmm1, xmm2; xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x)); xmm2 = _mm_unpackhi_epi8(xmm0, zero); xmm0 = _mm_unpacklo_epi8(xmm0, zero); xmm1 = _mm_unpackhi_epi16(xmm0, zero); xmm0 = _mm_unpacklo_epi16(xmm0, zero); sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i])); sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i])); xmm1 = _mm_unpackhi_epi16(xmm2, zero); xmm0 = _mm_unpacklo_epi16(xmm2, zero); sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i])); sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i])); } for (int i = 0; i < 4; i++) { __m128 sumfp = _mm_cvtepi32_ps(sum[i]); sumfp = _mm_mul_ps(sumfp, rdiv); sumfp = _mm_add_ps(sumfp, bias); if (!ch->saturate) { sumfp = mm_abs_ps(sumfp); } sum[i] = _mm_cvttps_epi32(sumfp); } sum[0] = _mm_packs_epi32(sum[0], sum[1]); sum[1] = _mm_packs_epi32(sum[2], sum[3]); sum[0] = _mm_packus_epi16(sum[0], sum[1]); _mm_store_si128((__m128i *)(dstp + x), sum[0]); } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
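// mm_abs_ps is referenced above but not defined in this excerpt; the usual
// SSE idiom (assumed here) clears the IEEE-754 sign bit: -0.0f has only the
// sign bit set, so andnot keeps every other bit of x.
#include <xmmintrin.h>

static inline __m128 mm_abs_ps(__m128 x) {
    return _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
}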
static void aom_filter_block1d4_v4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i addFilterReg32; __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45, srcReg6, srcReg56; __m128i srcReg23_34_lo, srcReg45_56_lo; __m128i srcReg2345_3456_lo, srcReg2345_3456_hi; __m128i resReglo, resReghi; __m128i firstFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2)); srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3)); srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3); srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4)); // have consecutive loads on the same 256 register srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4); srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34); for (i = output_height; i > 1; i -= 2) { srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5)); srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5); srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6)); srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6); // merge every two consecutive registers srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56); srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo); srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo); // multiply 2 adjacent elements with the filter and add the result resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters); resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters); resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128()); resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128()); // shift by 6 bit each 16 bit resReglo = _mm_adds_epi16(resReglo, addFilterReg32); resReghi = _mm_adds_epi16(resReghi, addFilterReg32); resReglo = _mm_srai_epi16(resReglo, 6); resReghi = _mm_srai_epi16(resReghi, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result resReglo = _mm_packus_epi16(resReglo, resReglo); resReghi = _mm_packus_epi16(resReghi, resReghi); src_ptr += src_stride; *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReglo); *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi); output_ptr += dst_stride; // save part of the registers for next strides srcReg23_34_lo = srcReg45_56_lo; srcReg4 = srcReg6; } }
void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val, int8_t missing, int8_t missing_substitute) { #ifdef COREARRAY_SIMD_SSE2 // header 1, 16-byte aligned size_t h = (16 - ((size_t)out & 0x0F)) & 0x0F; for (; (n > 0) && (h > 0); n--, h--, p+=2) { *out ++ = ((p[0] == missing) || (p[1] == missing)) ? missing_substitute : (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0); } // body, SSE2 const __m128i val16 = _mm_set1_epi8(val); const __m128i miss16 = _mm_set1_epi8(missing); const __m128i sub16 = _mm_set1_epi8(missing_substitute); const __m128i mask = _mm_set1_epi16(0x00FF); # ifdef COREARRAY_SIMD_AVX2 // header 2, 32-byte aligned if ((n >= 16) && ((size_t)out & 0x10)) { __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask)); __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8)); __m128i c = _mm_setzero_si128(); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16)); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16)); w1 = _mm_cmpeq_epi8(v1, miss16); w2 = _mm_cmpeq_epi8(v2, miss16); __m128i w = _mm_or_si128(w1, w2); c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c)); _mm_store_si128((__m128i *)out, c); n -= 16; out += 16; } const __m256i val32 = _mm256_set1_epi8(val); const __m256i miss32 = _mm256_set1_epi8(missing); const __m256i sub32 = _mm256_set1_epi8(missing_substitute); const __m256i mask2 = _mm256_set1_epi16(0x00FF); for (; n >= 32; n-=32) { __m256i w1 = MM_LOADU_256((__m256i const*)p); p += 32; __m256i w2 = MM_LOADU_256((__m256i const*)p); p += 32; __m256i v1 = _mm256_packus_epi16(_mm256_and_si256(w1, mask2), _mm256_and_si256(w2, mask2)); __m256i v2 = _mm256_packus_epi16(_mm256_srli_epi16(w1, 8), _mm256_srli_epi16(w2, 8)); __m256i c = _mm256_setzero_si256(); c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v1, val32)); c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v2, val32)); w1 = _mm256_cmpeq_epi8(v1, miss32); w2 = _mm256_cmpeq_epi8(v2, miss32); __m256i w = _mm256_or_si256(w1, w2); c = _mm256_or_si256(_mm256_and_si256(w, sub32), _mm256_andnot_si256(w, c)); c = _mm256_permute4x64_epi64(c, 0xD8); _mm256_store_si256((__m256i *)out, c); out += 32; } # endif // SSE2 only for (; n >= 16; n-=16) { __m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16; __m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask)); __m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8)); __m128i c = _mm_setzero_si128(); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16)); c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16)); w1 = _mm_cmpeq_epi8(v1, miss16); w2 = _mm_cmpeq_epi8(v2, miss16); __m128i w = _mm_or_si128(w1, w2); c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c)); _mm_store_si128((__m128i *)out, c); out += 16; } #endif // tail for (; n > 0; n--, p+=2) { *out ++ = ((p[0] == missing) || (p[1] == missing)) ? missing_substitute : (p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0); } }
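// The _mm256_permute4x64_epi64(c, 0xD8) above deserves a note: AVX2 pack
// instructions interleave per 128-bit lane, and the 0xD8 immediate (quadword
// selectors 0,2,1,3) restores linear byte order. A small demonstration
// (hypothetical test, requires AVX2):
#include <immintrin.h>
#include <stdint.h>
#include <stdio.h>

static void demo_permute4x64_after_pack(void) {
    __m256i lo = _mm256_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7,
                                   8, 9, 10, 11, 12, 13, 14, 15);
    __m256i hi = _mm256_setr_epi16(16, 17, 18, 19, 20, 21, 22, 23,
                                   24, 25, 26, 27, 28, 29, 30, 31);
    // Per-lane pack yields bytes 0..7,16..23 | 8..15,24..31; the permute
    // reorders the four 64-bit quadwords as 0,2,1,3 to give 0..31.
    __m256i p = _mm256_permute4x64_epi64(_mm256_packus_epi16(lo, hi), 0xD8);
    uint8_t out[32];
    _mm256_storeu_si256((__m256i *)out, p);
    for (int i = 0; i < 32; i++) printf("%d ", out[i]);  // prints 0 1 2 ... 31
    printf("\n");
}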
 * \param ref_main Reference pixels
 * \param delta_pos Fractional pixel precise position of sample displacement
 * \param x Sample offset in direction x in ref_main array
 */
static INLINE __m128i filter_4x1_avx2(const kvz_pixel *ref_main, int16_t delta_pos, int x){
  int8_t delta_int = delta_pos >> 5;
  int8_t delta_fract = delta_pos & (32 - 1);
  __m128i sample0 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int]));
  __m128i sample1 = _mm_cvtsi32_si128(*(uint32_t*)&(ref_main[x + delta_int + 1]));
  __m128i pairs = _mm_unpacklo_epi8(sample0, sample1);
  __m128i weight = _mm_set1_epi16((delta_fract << 8) | (32 - delta_fract));
  sample0 = _mm_maddubs_epi16(pairs, weight);
  sample0 = _mm_add_epi16(sample0, _mm_set1_epi16(16));
  sample0 = _mm_srli_epi16(sample0, 5);
  sample0 = _mm_packus_epi16(sample0, sample0);
  return sample0;
}

/**
 * \brief Linear interpolation for 4x4 block. Writes filtered 4x4 block to dst.
 * \param dst Destination buffer
 * \param ref_main Reference pixels
 * \param sample_disp Sample displacement per row
 * \param vertical_mode Mode direction, true if vertical
 */
static void filter_4x4_avx2(kvz_pixel *dst, const kvz_pixel *ref_main, int sample_disp, bool vertical_mode){
  __m128i row0 = filter_4x1_avx2(ref_main, 1 * sample_disp, 0);
  __m128i row1 = filter_4x1_avx2(ref_main, 2 * sample_disp, 0);
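// Scalar model (sketch) of filter_4x1_avx2 for a single sample: two-tap
// linear interpolation between adjacent reference pixels with a 5-bit
// fractional weight and round-to-nearest, which is exactly what the
// maddubs/add/srli sequence above computes per lane.
#include <stdint.h>

static uint8_t intra_interp_ref(const uint8_t *ref_main, int16_t delta_pos, int x) {
    int delta_int = delta_pos >> 5;
    int delta_fract = delta_pos & 31;
    int v = (32 - delta_fract) * ref_main[x + delta_int]
          + delta_fract * ref_main[x + delta_int + 1] + 16;
    return (uint8_t)(v >> 5);
}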
void tuned_ConvertULY4ToRGB(uint8_t *pDstBegin, uint8_t *pDstEnd, const uint8_t *pYBegin, const uint8_t *pUBegin, const uint8_t *pVBegin, size_t cbWidth, ssize_t scbStride) { const int shift = 13; __m128i xy2rgb = _mm_set2_epi16_shift((-16 * C::Y2RGB + 0.5) / 0xff, C::Y2RGB, shift); __m128i vu2r = _mm_set2_epi16_shift(C::V2R, 0, shift); __m128i vu2g = _mm_set2_epi16_shift(C::V2G, C::U2G, shift); __m128i vu2b = _mm_set2_epi16_shift(0, C::U2B, shift); auto y = pYBegin; auto u = pUBegin; auto v = pVBegin; for (auto p = pDstBegin; p != pDstEnd; p += scbStride) { auto pp = p; for (; pp <= p + cbWidth - 16; pp += T::BYPP * 4) { __m128i yy = _mm_cvtsi32_si128(*(const int *)y); __m128i uu = _mm_cvtsi32_si128(*(const int *)u); __m128i vv = _mm_cvtsi32_si128(*(const int *)v); __m128i xy = _mm_unpacklo_epi8(_mm_unpacklo_epi8(yy, _mm_setone_si128()), _mm_setzero_si128()); // 00 ff 00 Y3 00 ff 00 Y2 00 ff 00 Y1 00 ff 00 Y0 __m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0 vu = _mm_sub_epi16(vu, _mm_set1_epi16(128)); __m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb); auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i { __m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb)); rgb = _mm_srai_epi32(rgb, shift); rgb = _mm_packs_epi32(rgb, rgb); rgb = _mm_packus_epi16(rgb, rgb); return rgb; }; __m128i rr = xyuv2rgb(vu2r); __m128i gg = xyuv2rgb(vu2g); __m128i bb = xyuv2rgb(vu2b); if (std::is_same<T, CBGRAColorOrder>::value) { __m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128())); _mm_storeu_si128((__m128i *)pp, bgrx); } #ifdef __SSSE3__ else if (std::is_same<T, CBGRColorOrder>::value) { __m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, rr)); __m128i bgr = _mm_shuffle_epi8(bgrx, _mm_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0)); _mm_storeu_si128((__m128i *)pp, bgr); } #endif else if (std::is_same<T, CARGBColorOrder>::value) { __m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), _mm_unpacklo_epi8(gg, bb)); _mm_storeu_si128((__m128i *)pp, xrgb); } #ifdef __SSSE3__ else if (std::is_same<T, CRGBColorOrder>::value) { __m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_setone_si128(), rr), _mm_unpacklo_epi8(gg, bb)); __m128i rgb = _mm_shuffle_epi8(xrgb, _mm_set_epi8(-1, -1, -1, -1, 15, 14, 13, 11, 10, 9, 7, 6, 5, 3, 2, 1)); _mm_storeu_si128((__m128i *)pp, rgb); } #endif y += 4; u += 4; v += 4; } for (; pp < p + cbWidth; pp += T::BYPP) { __m128i xy = _mm_cvtsi32_si128(*y | 0x00ff0000); __m128i uu = _mm_cvtsi32_si128(*u); __m128i vv = _mm_cvtsi32_si128(*v); __m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0 vu = _mm_sub_epi16(vu, _mm_set1_epi16(128)); __m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb); auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i { __m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb)); rgb = _mm_srai_epi32(rgb, shift); rgb = _mm_packs_epi32(rgb, rgb); rgb = _mm_packus_epi16(rgb, rgb); return rgb; }; __m128i rr = xyuv2rgb(vu2r); __m128i gg = xyuv2rgb(vu2g); __m128i bb = xyuv2rgb(vu2b); if (std::is_same<T, CBGRAColorOrder>::value) { __m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128())); *(uint32_t *)pp = _mm_cvtsi128_si32(bgrx); } else if (std::is_same<T, CARGBColorOrder>::value) { __m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), 
_mm_unpacklo_epi8(gg, bb)); *(uint32_t *)pp = _mm_cvtsi128_si32(xrgb); } else if (std::is_same<T, CBGRColorOrder>::value || std::is_same<T, CRGBColorOrder>::value) { *(pp + T::B) = (uint8_t)_mm_cvtsi128_si32(bb); *(pp + T::G) = (uint8_t)_mm_cvtsi128_si32(gg); *(pp + T::R) = (uint8_t)_mm_cvtsi128_si32(rr); } y += 1; u += 1; v += 1; } } }
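// Helper sketches: _mm_set2_epi16_shift and _mm_setone_si128 come from the
// surrounding utvideo codebase and are not shown. Judging from how they are
// used with _mm_madd_epi16, the first presumably broadcasts two coefficients
// converted to fixed point with 'shift' fractional bits (first argument in
// the high 16 bits of each 32-bit lane, second in the low 16 bits), and the
// second returns an all-ones register. Sketches under that assumption:
#include <emmintrin.h>
#include <cmath>
#include <cstdint>

static inline __m128i _mm_set2_epi16_shift_sketch(double hi, double lo, int shift) {
    uint16_t h = (uint16_t)(int16_t)std::lround(hi * (1 << shift));
    uint16_t l = (uint16_t)(int16_t)std::lround(lo * (1 << shift));
    return _mm_set1_epi32((int)(((uint32_t)h << 16) | l));
}

static inline __m128i _mm_setone_si128_sketch(void) {
    return _mm_set1_epi8(-1);  // 0xff in every byte
}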
void tuned_ConvertRGBToULY4(uint8_t *pYBegin, uint8_t *pUBegin, uint8_t *pVBegin, const uint8_t *pSrcBegin, const uint8_t *pSrcEnd, size_t cbWidth, ssize_t scbStride) { const int shift = 14; __m128i rb2y, xg2y, rb2u, xg2u, rb2v, xg2v; if (std::is_same<T, CBGRAColorOrder>::value || std::is_same<T, CBGRColorOrder>::value) { rb2y = _mm_set2_epi16_shift(C::R2Y, C::B2Y, shift); xg2y = _mm_set2_epi16_shift(16.5 / 0xff, C::G2Y, shift); rb2u = _mm_set2_epi16_shift(C::R2U, C::B2U, shift); xg2u = _mm_set2_epi16_shift(128.5 / 0xff, C::G2U, shift); rb2v = _mm_set2_epi16_shift(C::R2V, C::B2V, shift); xg2v = _mm_set2_epi16_shift(128.5 / 0xff, C::G2V, shift); } else { rb2y = _mm_set2_epi16_shift(C::B2Y, C::R2Y, shift); xg2y = _mm_set2_epi16_shift(C::G2Y, 16.5 / 0xff, shift); rb2u = _mm_set2_epi16_shift(C::B2U, C::R2U, shift); xg2u = _mm_set2_epi16_shift(C::G2U, 128.5 / 0xff, shift); rb2v = _mm_set2_epi16_shift(C::B2V, C::R2V, shift); xg2v = _mm_set2_epi16_shift(C::G2V, 128.5 / 0xff, shift); } auto y = pYBegin; auto u = pUBegin; auto v = pVBegin; for (auto p = pSrcBegin; p != pSrcEnd; p += scbStride) { auto pp = p; for (; pp <= p + cbWidth - 16; pp += T::BYPP*4) { __m128i m = _mm_loadu_si128((const __m128i *)pp); __m128i rb, xg; if (std::is_same<T, CBGRAColorOrder>::value) { // m = XX R3 G3 B3 XX R2 G2 B2 XX R1 G1 B1 XX R0 G0 B0 rb = _mm_and_si128(m, _mm_set1_epi16(0x00ff)); // 00 R3 00 B3 00 R2 00 B2 00 R1 00 B1 00 R0 00 B0 xg = _mm_or_si128(_mm_srli_epi16(m, 8), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 } #ifdef __SSSE3__ else if (std::is_same<T, CBGRColorOrder>::value) { // m = XX XX XX XX R3 G3 B3 R2 G2 B2 R1 G1 B1 R0 G0 B0 rb = _mm_shuffle_epi8(m, _mm_set_epi8(-1, 11, -1, 9, -1, 8, -1, 6, -1, 5, -1, 3, -1, 2, -1, 0)); // 00 R3 00 B3 00 R2 00 B2 00 R1 00 B1 00 R0 00 B0 xg = _mm_or_si128(_mm_shuffle_epi8(m, _mm_set_epi8(-1, -1, -1, 10, -1, -1, -1, 7, -1, -1, -1, 4, -1, -1, -1, 1)), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 } #endif else if (std::is_same<T, CARGBColorOrder>::value) { // m = B3 G3 R3 XX B2 G2 R2 XX B1 G1 R1 XX B0 G0 R0 XX rb = _mm_srli_epi16(m, 8); // 00 B3 00 R3 00 B2 00 R2 00 B1 00 R1 00 B0 00 R0 xg = _mm_or_si128(_mm_and_si128(m, _mm_set1_epi32(0x00ff0000)), _mm_set1_epi32(0x000000ff)); // 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 00 ff } #ifdef __SSSE3__ else if (std::is_same<T, CRGBColorOrder>::value) { // m = XX XX XX XX B3 G3 R3 B2 G2 R2 B1 G1 R1 B0 G0 R0 rb = _mm_shuffle_epi8(m, _mm_set_epi8(-1, 11, -1, 9, -1, 8, -1, 6, -1, 5, -1, 3, -1, 2, -1, 0)); // 00 B3 00 R3 00 B2 00 R2 00 B1 00 R1 00 B0 00 R0 xg = _mm_or_si128(_mm_shuffle_epi8(m, _mm_set_epi8(-1, 10, -1, -1, -1, 7, -1, -1, -1, 4, -1, -1, -1, 1, -1, -1)), _mm_set1_epi32(0x000000ff)); // 00 G3 00 ff 00 G2 00 ff 00 G1 00 ff 00 G0 00 ff } #endif auto xrgb2yuv = [rb, xg, shift](__m128i rb2yuv, __m128i xg2yuv) -> uint32_t { __m128i yuv = _mm_add_epi32(_mm_madd_epi16(rb, rb2yuv), _mm_madd_epi16(xg, xg2yuv)); yuv = _mm_srli_epi32(yuv, shift); #ifdef __SSSE3__ if (F >= CODEFEATURE_SSSE3) { yuv = _mm_shuffle_epi8(yuv, _mm_set_epi8(-1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 12, 8, 4, 0)); } else #endif { yuv = _mm_packs_epi32(yuv, yuv); yuv = _mm_packus_epi16(yuv, yuv); } return _mm_cvtsi128_si32(yuv); }; *(uint32_t *)y = xrgb2yuv(rb2y, xg2y); *(uint32_t *)u = xrgb2yuv(rb2u, xg2u); *(uint32_t *)v = xrgb2yuv(rb2v, xg2v); y += 4; u += 4; v += 4; } for (; pp < p + cbWidth; pp += T::BYPP) { __m128i m; __m128i rb, xg; if (std::is_same<T, 
CBGRAColorOrder>::value || std::is_same<T, CBGRColorOrder>::value) { if (std::is_same<T, CBGRAColorOrder>::value) { m = _mm_cvtsi32_si128(*(const uint32_t *)pp); // m = XX XX XX XX XX XX XX XX XX XX XX XX XX R0 G0 B0 } else { m = _mm_cvtsi32_si128(*(const uint32_t *)(pp - 1)); // m = XX XX XX XX XX XX XX XX XX XX XX XX R0 G0 B0 XX m = _mm_srli_epi32(m, 8); } rb = _mm_and_si128(m, _mm_set1_epi16(0x00ff)); // 00 XX 00 XX 00 XX 00 XX 00 XX 00 XX 00 R0 00 B0 xg = _mm_or_si128(_mm_srli_epi16(m, 8), _mm_set1_epi32(0x00ff0000)); // 00 ff 00 XX 00 ff 00 XX 00 ff 00 XX 00 ff 00 G0 } else if (std::is_same<T, CARGBColorOrder>::value || std::is_same<T, CRGBColorOrder>::value) { if (std::is_same<T, CARGBColorOrder>::value) { m = _mm_cvtsi32_si128(*(const uint32_t *)pp); // m = XX XX XX XX XX XX XX XX XX XX XX XX B0 G0 R0 XX } else { m = _mm_cvtsi32_si128(*(const uint32_t *)(pp - 1)); // m = XX XX XX XX XX XX XX XX XX XX XX XX B0 G0 R0 XX } rb = _mm_srli_epi16(m, 8); // 00 XX 00 XX 00 XX 00 XX 00 XX 00 XX 00 B0 00 R0 xg = _mm_or_si128(_mm_and_si128(m, _mm_set1_epi32(0x00ff0000)), _mm_set1_epi32(0x000000ff)); // 00 XX 00 ff 00 XX 00 ff 00 XX 00 ff 00 G0 00 ff } auto xrgb2yuv = [rb, xg, shift](__m128i rb2yuv, __m128i xg2yuv) -> uint8_t { __m128i yuv = _mm_add_epi32(_mm_madd_epi16(rb, rb2yuv), _mm_madd_epi16(xg, xg2yuv)); yuv = _mm_srli_epi32(yuv, shift); return (uint8_t)_mm_cvtsi128_si32(yuv); }; *y = xrgb2yuv(rb2y, xg2y); *u = xrgb2yuv(rb2u, xg2u); *v = xrgb2yuv(rb2v, xg2v); y++; u++; v++; } } }
static uint32_t utoa32_sse2(uint32_t value, char* buffer) { char* start = buffer; if (value < 10000) { const uint32_t d1 = (value / 100) << 1; const uint32_t d2 = (value % 100) << 1; if (value >= 1000) *buffer++ = u_ctn2s[d1]; if (value >= 100) *buffer++ = u_ctn2s[d1+1]; if (value >= 10) *buffer++ = u_ctn2s[d2]; *buffer++ = u_ctn2s[d2+1]; return (buffer - start); } if (value < 100000000) { // Experiment shows that in this case SSE2 is slower # if 0 const __m128i a = Convert8DigitsSSE2(value); // Convert to bytes, add '0' const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); // Count number of digit const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0])); unsigned long digit; # ifdef _MSC_VER _BitScanForward(&digit, ~mask | 0x8000); # else digit = __builtin_ctz(~mask | 0x8000); # endif // Shift digits to the beginning __m128i result = ShiftDigits_SSE2(va, digit); //__m128i result = _mm_srl_epi64(va, _mm_cvtsi32_si128(digit * 8)); _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); return (buffer + 8 - digit - start); # else // value = bbbbcccc const uint32_t b = value / 10000; const uint32_t c = value % 10000; const uint32_t d1 = (b / 100) << 1; const uint32_t d2 = (b % 100) << 1; const uint32_t d3 = (c / 100); const uint32_t d4 = (c % 100); if (value >= 10000000) *buffer++ = u_ctn2s[d1]; if (value >= 1000000) *buffer++ = u_ctn2s[d1+1]; if (value >= 100000) *buffer++ = u_ctn2s[d2]; *buffer++ = u_ctn2s[d2+1]; U_NUM2STR16(buffer, d3); U_NUM2STR16(buffer+2, d4); return (buffer + 4 - start); # endif } // value = aabbbbbbbb in decimal const uint32_t a = value / 100000000; // 1 to 42 value %= 100000000; if (a < 10) *buffer++ = '0' + (char)a; else { U_NUM2STR16(buffer, a); buffer += 2; } const __m128i b = Convert8DigitsSSE2(value); const __m128i ba = _mm_add_epi8(_mm_packus_epi16(_mm_setzero_si128(), b), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); const __m128i result = _mm_srli_si128(ba, 8); _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); return (buffer + 8 - start); }
mlib_status __mlib_ImageBlend_OMSC_SAS( mlib_image *dst, const mlib_image *src1, const mlib_image *src2, mlib_s32 cmask) { mlib_s32 src_alpha, dst_alpha; mlib_s32 min; BLEND_VALIDATE; if (channels == 3) return (__mlib_ImageBlend_OMSC_ZERO(dst, src1, src2, cmask)); mlib_s32 d_s0, d_s1, d_s2, d_s3; int k; __m128i *px, *py, *pz; __m128i dx, dy; /* upper - 1 lower - 0 */ __m128i dx_1, dx_0, dy_1, dy_0, dz_1, dz_0; __m128i dall_zero; __m128i df_f = _mm_set1_epi32(0x00ff00ff); __m128i done_one = _mm_set1_epi32(0x00010001); dall_zero = _mm_setzero_si128(); if (cmask == 8) { if (0 == (((((mlib_addr) psrc1 | (mlib_addr)psrc2 | (mlib_addr)pdst)) & 0xf)) && (0 == (((src1_stride | src2_stride | dst_stride) & 0xf) || (1 == dst_height)))) { for (j = 0; j < dst_height; j++) { px = (__m128i *)psrc1; py = (__m128i *)psrc2; pz = (__m128i *)pdst; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= dst_width - 4; i += 4) { dx = _mm_load_si128(px); dy = _mm_load_si128(py); UNPACK_UNSIGN_BYTE; PROCESS_DATA_8(dx_1, dy_1, dz_1); PROCESS_DATA_8(dx_0, dy_0, dz_0); dz_0 = _mm_packus_epi16(dz_0, dz_1); _mm_store_si128(pz, dz_0); px++; py++; pz++; } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ DO_REST_8; psrc1 += src1_stride; psrc2 += src2_stride; pdst += dst_stride; } } else { for (j = 0; j < dst_height; j++) { px = (__m128i *)psrc1; py = (__m128i *)psrc2; pz = (__m128i *)pdst; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= dst_width - 4; i += 4) { dx = _mm_loadu_si128(px); dy = _mm_loadu_si128(py); UNPACK_UNSIGN_BYTE; PROCESS_DATA_8(dx_1, dy_1, dz_1); PROCESS_DATA_8(dx_0, dy_0, dz_0); dz_0 = _mm_packus_epi16(dz_0, dz_1); _mm_storeu_si128(pz, dz_0); px++; py++; pz++; } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ DO_REST_8; psrc1 += src1_stride; psrc2 += src2_stride; pdst += dst_stride; } } } else { if (0 == (((((mlib_addr) psrc1 | (mlib_addr)psrc2 | (mlib_addr)pdst)) & 0xf)) && (0 == (((src1_stride | src2_stride | dst_stride) & 0xf) || (1 == dst_height)))) { for (j = 0; j < dst_height; j++) { px = (__m128i *)psrc1; py = (__m128i *)psrc2; pz = (__m128i *)pdst; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= dst_width - 4; i += 4) { dx = _mm_load_si128(px); dy = _mm_load_si128(py); UNPACK_UNSIGN_BYTE; PROCESS_DATA_1(dx_1, dy_1, dz_1); PROCESS_DATA_1(dx_0, dy_0, dz_0); dz_0 = _mm_packus_epi16(dz_0, dz_1); _mm_store_si128(pz, dz_0); px++; py++; pz++; } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ DO_REST_1; psrc1 += src1_stride; psrc2 += src2_stride; pdst += dst_stride; } } else { for (j = 0; j < dst_height; j++) { px = (__m128i *)psrc1; py = (__m128i *)psrc2; pz = (__m128i *)pdst; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i <= dst_width - 4; i += 4) { dx = _mm_loadu_si128(px); dy = _mm_loadu_si128(py); UNPACK_UNSIGN_BYTE; PROCESS_DATA_1(dx_1, dy_1, dz_1); PROCESS_DATA_1(dx_0, dy_0, dz_0); dz_0 = _mm_packus_epi16(dz_0, dz_1); _mm_storeu_si128(pz, dz_0); px++; py++; pz++; } #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ DO_REST_1; psrc1 += src1_stride; psrc2 += src2_stride; pdst += dst_stride; } } } return (MLIB_SUCCESS); }
void aom_filter_block1d8_h8_intrin_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i firstFilters, secondFilters, thirdFilters, forthFilters, srcReg; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4; __m128i addFilterReg64, filtersReg, minReg; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits (first and second byte) // across 128 bit register firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 128 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 128 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 128 bit register forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); filt1Reg = _mm_load_si128((__m128i const *)filt1_global); filt2Reg = _mm_load_si128((__m128i const *)filt2_global); filt3Reg = _mm_load_si128((__m128i const *)filt3_global); filt4Reg = _mm_load_si128((__m128i const *)filt4_global); for (i = 0; i < output_height; i++) { srcReg = _mm_loadu_si128((const __m128i *)(src_ptr - 3)); // filter the source buffer srcRegFilt1 = _mm_shuffle_epi8(srcReg, filt1Reg); srcRegFilt2 = _mm_shuffle_epi8(srcReg, filt2Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters); // filter the source buffer srcRegFilt3 = _mm_shuffle_epi8(srcReg, filt3Reg); srcRegFilt4 = _mm_shuffle_epi8(srcReg, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, thirdFilters); srcRegFilt4 = _mm_maddubs_epi16(srcRegFilt4, forthFilters); // add and saturate all the results together minReg = _mm_min_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4); srcRegFilt2 = _mm_max_epi16(srcRegFilt2, srcRegFilt3); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt2); srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64); // shift by 7 bit each 16 bits srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7); // shrink to 8 bit each 16 bits srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1); src_ptr += src_pixels_per_line; // save only 8 bytes _mm_storel_epi64((__m128i *)&output_ptr[0], srcRegFilt1); output_ptr += output_pitch; } }
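// Sketch of the shuffle-mask tables referenced above. filt1_global through
// filt4_global are defined elsewhere; these values match the usual
// libvpx/libaom definitions and should be treated as an assumption. Each mask
// gathers the overlapping byte pairs (p[k], p[k+1]) that maddubs then
// multiplies by one pair of taps across the eight output positions.
#include <cstdint>

alignas(16) static const uint8_t filt1_global[16] = { 0, 1, 1, 2, 2, 3, 3, 4,
                                                      4, 5, 5, 6, 6, 7, 7, 8 };
alignas(16) static const uint8_t filt2_global[16] = { 2, 3, 3, 4, 4, 5, 5, 6,
                                                      6, 7, 7, 8, 8, 9, 9, 10 };
alignas(16) static const uint8_t filt3_global[16] = { 4, 5, 5, 6, 6, 7, 7, 8,
                                                      8, 9, 9, 10, 10, 11, 11, 12 };
alignas(16) static const uint8_t filt4_global[16] = { 6, 7, 7, 8, 8, 9, 9, 10,
                                                      10, 11, 11, 12, 12, 13, 13, 14 };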
IV SSE2( RegFile & r ) const {
    __m128i res = _mm_add_epi16( r.src1[0].i, mV128.i );
    r.dst[0].i = _mm_packus_epi16( res, res );
}
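// This fragment relies on _mm_packus_epi16 for the clamp to [0, 255] after
// the 16-bit add. A standalone demonstration of that saturation behavior
// (hypothetical test values):
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

static void demo_packus_saturation(void) {
    __m128i v = _mm_setr_epi16(-5, 0, 100, 255, 256, 300, 1000, 32767);
    __m128i p = _mm_packus_epi16(v, v);  // each signed 16-bit lane -> unsigned byte
    uint8_t out[16];
    _mm_storeu_si128((__m128i *)out, p);
    for (int i = 0; i < 8; i++) printf("%d ", out[i]);  // 0 0 100 255 255 255 255 255
    printf("\n");
}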
static void aom_filter_block1d16_h4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr, ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i addFilterReg32, filt2Reg, filt3Reg; __m128i secondFilters, thirdFilters; __m128i srcRegFilt32b1_1, srcRegFilt32b2_1, srcRegFilt32b2, srcRegFilt32b3; __m128i srcReg32b1, srcReg32b2; unsigned int i; src_ptr -= 3; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); filtersReg = _mm_srai_epi16(filtersReg, 1); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the second 16 bits (third and forth byte) // across 256 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 256 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); filt2Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32)); filt3Reg = _mm_load_si128((__m128i const *)(filt_h4 + 32 * 2)); for (i = output_height; i > 0; i -= 1) { srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr); // filter the source buffer srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b1, filt2Reg); srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); // reading stride of the next 16 bytes // (part of it was being read by earlier read) srcReg32b2 = _mm_loadu_si128((const __m128i *)(src_ptr + 8)); // filter the source buffer srcRegFilt32b3 = _mm_shuffle_epi8(srcReg32b2, filt2Reg); srcRegFilt32b2 = _mm_shuffle_epi8(srcReg32b2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt32b3 = _mm_maddubs_epi16(srcRegFilt32b3, secondFilters); srcRegFilt32b2 = _mm_maddubs_epi16(srcRegFilt32b2, thirdFilters); // add and saturate the results together srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b3, srcRegFilt32b2); // shift by 6 bit each 16 bit srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32); srcRegFilt32b2_1 = _mm_adds_epi16(srcRegFilt32b2_1, addFilterReg32); srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6); srcRegFilt32b2_1 = _mm_srai_epi16(srcRegFilt32b2_1, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve result srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, srcRegFilt32b2_1); src_ptr += src_pixels_per_line; _mm_store_si128((__m128i *)output_ptr, srcRegFilt32b1_1); output_ptr += output_pitch; } }
inline void u32toa_sse2(uint32_t value, char* buffer) { if (value < 10000) { const uint32_t d1 = (value / 100) << 1; const uint32_t d2 = (value % 100) << 1; if (value >= 1000) *buffer++ = gDigitsLut[d1]; if (value >= 100) *buffer++ = gDigitsLut[d1 + 1]; if (value >= 10) *buffer++ = gDigitsLut[d2]; *buffer++ = gDigitsLut[d2 + 1]; *buffer++ = '\0'; } else if (value < 100000000) { // Experiment shows that this case SSE2 is slower #if 0 const __m128i a = Convert8DigitsSSE2(value); // Convert to bytes, add '0' const __m128i va = _mm_add_epi8(_mm_packus_epi16(a, _mm_setzero_si128()), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); // Count number of digit const unsigned mask = _mm_movemask_epi8(_mm_cmpeq_epi8(va, reinterpret_cast<const __m128i*>(kAsciiZero)[0])); unsigned long digit; #ifdef _MSC_VER _BitScanForward(&digit, ~mask | 0x8000); #else digit = __builtin_ctz(~mask | 0x8000); #endif // Shift digits to the beginning __m128i result = ShiftDigits_SSE2(va, digit); //__m128i result = _mm_srl_epi64(va, _mm_cvtsi32_si128(digit * 8)); _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); buffer[8 - digit] = '\0'; #else // value = bbbbcccc const uint32_t b = value / 10000; const uint32_t c = value % 10000; const uint32_t d1 = (b / 100) << 1; const uint32_t d2 = (b % 100) << 1; const uint32_t d3 = (c / 100) << 1; const uint32_t d4 = (c % 100) << 1; if (value >= 10000000) *buffer++ = gDigitsLut[d1]; if (value >= 1000000) *buffer++ = gDigitsLut[d1 + 1]; if (value >= 100000) *buffer++ = gDigitsLut[d2]; *buffer++ = gDigitsLut[d2 + 1]; *buffer++ = gDigitsLut[d3]; *buffer++ = gDigitsLut[d3 + 1]; *buffer++ = gDigitsLut[d4]; *buffer++ = gDigitsLut[d4 + 1]; *buffer++ = '\0'; #endif } else { // value = aabbbbbbbb in decimal const uint32_t a = value / 100000000; // 1 to 42 value %= 100000000; if (a >= 10) { const unsigned i = a << 1; *buffer++ = gDigitsLut[i]; *buffer++ = gDigitsLut[i + 1]; } else *buffer++ = '0' + static_cast<char>(a); const __m128i b = Convert8DigitsSSE2(value); const __m128i ba = _mm_add_epi8(_mm_packus_epi16(_mm_setzero_si128(), b), reinterpret_cast<const __m128i*>(kAsciiZero)[0]); const __m128i result = _mm_srli_si128(ba, 8); _mm_storel_epi64(reinterpret_cast<__m128i*>(buffer), result); buffer[8] = '\0'; } }
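// A usage sketch for u32toa_sse2 (Convert8DigitsSSE2, ShiftDigits_SSE2,
// kAsciiZero, and gDigitsLut are defined elsewhere in the original source;
// only the buffer sizing below is ours):
//
//   char buf[11];                    // up to 10 digits for a uint32_t, plus '\0'
//   u32toa_sse2(4294967295u, buf);   // buf == "4294967295"
//   u32toa_sse2(7u, buf);            // buf == "7"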
static void aom_filter_block1d16_v4_ssse3( const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr, ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) { __m128i filtersReg; __m128i srcReg2, srcReg3, srcReg4, srcReg5, srcReg6; __m128i srcReg23_lo, srcReg23_hi, srcReg34_lo, srcReg34_hi; __m128i srcReg45_lo, srcReg45_hi, srcReg56_lo, srcReg56_hi; __m128i resReg23_lo, resReg34_lo, resReg45_lo, resReg56_lo; __m128i resReg23_hi, resReg34_hi, resReg45_hi, resReg56_hi; __m128i resReg23_45_lo, resReg34_56_lo, resReg23_45_hi, resReg34_56_hi; __m128i resReg23_45, resReg34_56; __m128i addFilterReg32, secondFilters, thirdFilters; unsigned int i; ptrdiff_t src_stride, dst_stride; addFilterReg32 = _mm_set1_epi16(32); filtersReg = _mm_loadu_si128((const __m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the // same data in both lanes of 128 bit register. filtersReg = _mm_srai_epi16(filtersReg, 1); filtersReg = _mm_packs_epi16(filtersReg, filtersReg); // duplicate only the second 16 bits (third and forth byte) // across 128 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 128 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // multiple the size of the source and destination stride by two src_stride = src_pitch << 1; dst_stride = out_pitch << 1; srcReg2 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 2)); srcReg3 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 3)); srcReg23_lo = _mm_unpacklo_epi8(srcReg2, srcReg3); srcReg23_hi = _mm_unpackhi_epi8(srcReg2, srcReg3); srcReg4 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 4)); // have consecutive loads on the same 256 register srcReg34_lo = _mm_unpacklo_epi8(srcReg3, srcReg4); srcReg34_hi = _mm_unpackhi_epi8(srcReg3, srcReg4); for (i = output_height; i > 1; i -= 2) { srcReg5 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 5)); srcReg45_lo = _mm_unpacklo_epi8(srcReg4, srcReg5); srcReg45_hi = _mm_unpackhi_epi8(srcReg4, srcReg5); srcReg6 = _mm_loadu_si128((const __m128i *)(src_ptr + src_pitch * 6)); srcReg56_lo = _mm_unpacklo_epi8(srcReg5, srcReg6); srcReg56_hi = _mm_unpackhi_epi8(srcReg5, srcReg6); // multiply 2 adjacent elements with the filter and add the result resReg23_lo = _mm_maddubs_epi16(srcReg23_lo, secondFilters); resReg34_lo = _mm_maddubs_epi16(srcReg34_lo, secondFilters); resReg45_lo = _mm_maddubs_epi16(srcReg45_lo, thirdFilters); resReg56_lo = _mm_maddubs_epi16(srcReg56_lo, thirdFilters); // add and saturate the results together resReg23_45_lo = _mm_adds_epi16(resReg23_lo, resReg45_lo); resReg34_56_lo = _mm_adds_epi16(resReg34_lo, resReg56_lo); // multiply 2 adjacent elements with the filter and add the result resReg23_hi = _mm_maddubs_epi16(srcReg23_hi, secondFilters); resReg34_hi = _mm_maddubs_epi16(srcReg34_hi, secondFilters); resReg45_hi = _mm_maddubs_epi16(srcReg45_hi, thirdFilters); resReg56_hi = _mm_maddubs_epi16(srcReg56_hi, thirdFilters); // add and saturate the results together resReg23_45_hi = _mm_adds_epi16(resReg23_hi, resReg45_hi); resReg34_56_hi = _mm_adds_epi16(resReg34_hi, resReg56_hi); // shift by 6 bit each 16 bit resReg23_45_lo = _mm_adds_epi16(resReg23_45_lo, addFilterReg32); resReg34_56_lo = _mm_adds_epi16(resReg34_56_lo, addFilterReg32); resReg23_45_hi = _mm_adds_epi16(resReg23_45_hi, addFilterReg32); resReg34_56_hi = _mm_adds_epi16(resReg34_56_hi, addFilterReg32); resReg23_45_lo = 
_mm_srai_epi16(resReg23_45_lo, 6); resReg34_56_lo = _mm_srai_epi16(resReg34_56_lo, 6); resReg23_45_hi = _mm_srai_epi16(resReg23_45_hi, 6); resReg34_56_hi = _mm_srai_epi16(resReg34_56_hi, 6); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result resReg23_45 = _mm_packus_epi16(resReg23_45_lo, resReg23_45_hi); resReg34_56 = _mm_packus_epi16(resReg34_56_lo, resReg34_56_hi); src_ptr += src_stride; _mm_store_si128((__m128i *)output_ptr, (resReg23_45)); _mm_store_si128((__m128i *)(output_ptr + out_pitch), (resReg34_56)); output_ptr += dst_stride; // save part of the registers for next strides srcReg23_lo = srcReg45_lo; srcReg34_lo = srcReg56_lo; srcReg23_hi = srcReg45_hi; srcReg34_hi = srcReg56_hi; srcReg4 = srcReg6; } }
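/* As with the horizontal case, a scalar model may help. The 4-tap variants use
 * only the middle taps filter[2..5], pre-shifted right by one, over the rows at
 * src_ptr + 2..5 strides, so rounding adds 32 and the final shift is 6. The
 * function name is illustrative, not libaom's. */
#include <stdint.h>
#include <stddef.h>

static void filter_block1d16_v4_c(const uint8_t *src, ptrdiff_t pitch,
                                  uint8_t *dst, ptrdiff_t out_pitch,
                                  uint32_t height, const int16_t *filter) {
  for (uint32_t y = 0; y < height; ++y) {
    for (int x = 0; x < 16; ++x) {
      int sum = 32;
      for (int k = 0; k < 4; ++k)
        sum += (filter[2 + k] >> 1) * src[(y + 2 + k) * pitch + x];
      sum >>= 6;
      dst[y * out_pitch + x] = (uint8_t)(sum < 0 ? 0 : sum > 255 ? 255 : sum);
    }
  }
}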
void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData ) { ALIGN16( byte color0[16] ); ALIGN16( byte color1[16] ); ALIGN16( byte color2[16] ); ALIGN16( byte color3[16] ); ALIGN16( byte result[16] ); // mov esi, maxColor // mov edi, minColor __m128i t0, t1, t2, t3, t4, t5, t6, t7; t7 = _mm_setzero_si128(); //t7 = _mm_xor_si128(t7, t7); _mm_store_si128 ( (__m128i*) &result, t7 ); //t0 = _mm_load_si128 ( (__m128i*) maxColor ); t0 = _mm_cvtsi32_si128( *(int*)maxColor); // Bitwise AND __m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask ); t0 = _mm_and_si128(t0, tt); t0 = _mm_unpacklo_epi8(t0, t7); t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 )); t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 )); t4 = _mm_srli_epi16(t4, 5); t5 = _mm_srli_epi16(t5, 6); // Bitwise Logical OR t0 = _mm_or_si128(t0, t4); t0 = _mm_or_si128(t0, t5); // t0 contains color0 in 565 //t1 = _mm_load_si128 ( (__m128i*) minColor ); t1 = _mm_cvtsi32_si128( *(int*)minColor); t1 = _mm_and_si128(t1, tt); t1 = _mm_unpacklo_epi8(t1, t7); t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 )); t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 )); t4 = _mm_srli_epi16(t4, 5); t5 = _mm_srli_epi16(t5, 6); t1 = _mm_or_si128(t1, t4); t1 = _mm_or_si128(t1, t5); // t1 contains color1 in 565 t2 = t0; t2 = _mm_packus_epi16(t2, t7); t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color0, t2 ); t6 = t0; t6 = _mm_add_epi16(t6, t0); t6 = _mm_add_epi16(t6, t1); // Multiply Packed Signed Integers and Store High Result __m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 ); t6 = _mm_mulhi_epi16(t6, tw3); t6 = _mm_packus_epi16(t6, t7); t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color2, t6 ); t3 = t1; t3 = _mm_packus_epi16(t3, t7); t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color1, t3 ); t1 = _mm_add_epi16(t1, t1); t0 = _mm_add_epi16(t0, t1); t0 = _mm_mulhi_epi16(t0, tw3); t0 = _mm_packus_epi16(t0, t7); t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 )); _mm_store_si128 ( (__m128i*) &color3, t0 ); __m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0); __m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1); __m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2); // mov eax, 32 // mov esi, colorBlock int x = 32; //const byte *c = colorBlock; while (x >= 0) { t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0)); t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 )); t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8)); t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 )); t0 = t3; t6 = t5; // Compute Sum of Absolute Difference __m128i c0 = _mm_load_si128 ( (__m128i*) color0 ); t0 = _mm_sad_epu8(t0, c0); t6 = _mm_sad_epu8(t6, c0); // Pack with Signed Saturation t0 = _mm_packs_epi32 (t0, t6); t1 = t3; t6 = t5; __m128i c1 = _mm_load_si128 ( (__m128i*) color1 ); t1 = _mm_sad_epu8(t1, c1); t6 = _mm_sad_epu8(t6, c1); t1 = _mm_packs_epi32 (t1, t6); t2 = t3; t6 = t5; __m128i c2 = _mm_load_si128 ( (__m128i*) color2 ); t2 = _mm_sad_epu8(t2, c2); t6 = _mm_sad_epu8(t6, c2); t2 = _mm_packs_epi32 (t2, t6); __m128i c3 = _mm_load_si128 ( (__m128i*) color3 ); t3 = _mm_sad_epu8(t3, c3); t5 = _mm_sad_epu8(t5, c3); t3 = _mm_packs_epi32 (t3, t5); t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16)); t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 )); t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24)); t5 = _mm_shuffle_epi32( 
t5, R_SHUFFLE_D( 0, 2, 1, 3 )); t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c0); t7 = _mm_sad_epu8(t7, c0); t6 = _mm_packs_epi32 (t6, t7); t0 = _mm_packs_epi32 (t0, t6); // d0 t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c1); t7 = _mm_sad_epu8(t7, c1); t6 = _mm_packs_epi32 (t6, t7); t1 = _mm_packs_epi32 (t1, t6); // d1 t6 = t4; t7 = t5; t6 = _mm_sad_epu8(t6, c2); t7 = _mm_sad_epu8(t7, c2); t6 = _mm_packs_epi32 (t6, t7); t2 = _mm_packs_epi32 (t2, t6); // d2 t4 = _mm_sad_epu8(t4, c3); t5 = _mm_sad_epu8(t5, c3); t4 = _mm_packs_epi32 (t4, t5); t3 = _mm_packs_epi32 (t3, t4); // d3 t7 = _mm_load_si128 ( (__m128i*) result ); t7 = _mm_slli_epi32( t7, 16); t4 = t0; t5 = t1; // Compare Packed Signed Integers for Greater Than t0 = _mm_cmpgt_epi16(t0, t3); // b0 t1 = _mm_cmpgt_epi16(t1, t2); // b1 t4 = _mm_cmpgt_epi16(t4, t2); // b2 t5 = _mm_cmpgt_epi16(t5, t3); // b3 t2 = _mm_cmpgt_epi16(t2, t3); // b4 t4 = _mm_and_si128(t4, t1); // x0 t5 = _mm_and_si128(t5, t0); // x1 t2 = _mm_and_si128(t2, t0); // x2 t4 = _mm_or_si128(t4, t5); t2 = _mm_and_si128(t2, w1); t4 = _mm_and_si128(t4, w2); t2 = _mm_or_si128(t2, t4); t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 )); // Unpack Low Data t2 = _mm_unpacklo_epi16 ( t2, w0); t5 = _mm_unpacklo_epi16 ( t5, w0); //t5 = _mm_slli_si128 ( t5, 8); t5 = _mm_slli_epi32( t5, 8); t7 = _mm_or_si128(t7, t5); t7 = _mm_or_si128(t7, t2); _mm_store_si128 ( (__m128i*) &result, t7 ); x -=32; } t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 )); t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 )); t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 )); t4 = _mm_slli_epi32 ( t4, 2); t5 = _mm_slli_epi32 ( t5, 4); t6 = _mm_slli_epi32 ( t6, 6); t7 = _mm_or_si128(t7, t4); t7 = _mm_or_si128(t7, t5); t7 = _mm_or_si128(t7, t6); //_mm_store_si128 ( (__m128i*) outData, t7 ); int r = _mm_cvtsi128_si32 (t7); memcpy(outData, &r, 4); // Anything better ? outData += 4; }
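/* What the SAD/compare cascade above decides, per texel: the 2-bit index of the
 * closest of the four DXT palette colors. This scalar sketch uses a plain argmin
 * and does not reproduce the exact tie-breaking of the cmpgt network; the alpha
 * byte contributes the same offset to every candidate, so it cannot change the
 * winner. Names are ours. */
#include <stdlib.h>
#include <stdint.h>

static int nearest_palette_index(const uint8_t px[4], const uint8_t pal[4][4]) {
  int best = 0, best_d = 1 << 30;
  for (int c = 0; c < 4; ++c) {
    int d = 0;
    for (int k = 0; k < 4; ++k) d += abs(px[k] - pal[c][k]);
    if (d < best_d) { best_d = d; best = c; }
  }
  return best;  /* the code above packs 16 such indices into one 32-bit word */
}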
SIMD_INLINE __m128i YuvToHue8(__m128i y, __m128i u, __m128i v, const __m128 & KF_255_DIV_6)
{
    return _mm_packus_epi16(
        YuvToHue16(_mm_unpacklo_epi8(y, K_ZERO), _mm_unpacklo_epi8(u, K_ZERO),
                   _mm_unpacklo_epi8(v, K_ZERO), KF_255_DIV_6),
        YuvToHue16(_mm_unpackhi_epi8(y, K_ZERO), _mm_unpackhi_epi8(u, K_ZERO),
                   _mm_unpackhi_epi8(v, K_ZERO), KF_255_DIV_6));
}
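/* The lo/hi split above is the standard widen-compute-narrow idiom: unpack
 * bytes against zero to get 16-bit lanes, process both halves, pack back with
 * unsigned saturation. A generic sketch of the pattern (the callback signature
 * is ours): */
#include <emmintrin.h>

static inline __m128i apply_u16_op(__m128i x, __m128i (*op)(__m128i)) {
  const __m128i zero = _mm_setzero_si128();
  return _mm_packus_epi16(op(_mm_unpacklo_epi8(x, zero)),
                          op(_mm_unpackhi_epi8(x, zero)));
}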
void mlib_s_ImageBlendLine(
    mlib_work_image * param, mlib_u8 *dp, __m128i * buffz, __m128i * buffd)
{
  mlib_blend blend = param->blend;
  mlib_s32 chan_d = param->chan_d;
  mlib_s32 chan_s = param->channels;
  mlib_d64 alp = (param->alpha) * (1.0 / 255);
  mlib_s32 width = GetElemSubStruct(current, width);
  mlib_u8 *tdp = dp;
  mlib_s32 width2, y_step, next_step = 2;
  mlib_s32 alp_ind = param->alp_ind, mask255;
  __m128i aa, dalp, done;
  __m128i mzero, mask_7fff, mask_8000, amask, amask256, amaskffff;
  __m128i d_rnd;
  mlib_s32 i, j;

  if (!alp_ind) {
    d_rnd = _mm_set1_epi16(0x0080);
    tdp = (void *)dp;
    if (chan_d == 3)
      tdp = (void *)buffd;
    for (i = 0; i < width / 2; i++) {
      __m128i dd;
      dd = buffz[i];
      dd = _mm_adds_epu16(dd, d_rnd);
      dd = _mm_srli_epi16(dd, 8);
      dd = _mm_packus_epi16(dd, dd);
      _mm_storel_epi64((void *)(tdp + 8 * i), dd);
    }
    if (width & 1) {
      __m128i dd;
      dd = buffz[i];
      dd = _mm_adds_epu16(dd, d_rnd);
      dd = _mm_srli_epi16(dd, 8);
      dd = _mm_packus_epi16(dd, dd);
      *(mlib_s32 *)(tdp + 8 * i) = *(mlib_s32 *)&dd;
    }
    if (chan_d == 3) {
      mlib_s_ImageChannelExtract_U8_43L_D1((void *)buffd, dp, width);
    }
    return;
  }
  width2 = (width + 1) / 2;
  mzero = _mm_setzero_si128();
  mask_7fff = _mm_set1_epi16(0x7FFF);
  mask_8000 = _mm_set1_epi16(0x8000);
  done = _mm_set1_epi16(1 << 15);
  if (alp_ind == -1) {
    mask255 = 0xFF;
    amask = _mm_setr_epi32(0xff00, 0, 0xff00, 0);
    amaskffff = _mm_setr_epi32(0xffff, 0, 0xffff, 0);
    amask256 = _mm_setr_epi32(0x0100, 0, 0x0100, 0);
  } else {
    mask255 = 0xFF000000;
    amask = _mm_setr_epi32(0, 0xff000000, 0, 0xff000000);
    amaskffff = _mm_setr_epi32(0, 0xffff0000, 0, 0xffff0000);
    amask256 = _mm_setr_epi32(0, 0x01000000, 0, 0x01000000);
  }
  dalp = _mm_set1_epi16((1 << 15) * alp + 0.5);
  if (chan_s == 3) {
    if (chan_d == 3) {
      mlib_d64 alp = (param->alpha) * (1.0 / 255);
      mlib_s32 ialp;
      mlib_u8 *pz;
      __m128i emask;
      __m128i dalp, ralp, ss, dd, s0, s1, d0, d1, dr;

      mlib_s_ImageChannelExtract_S16_43L_D1((void *)buffz, (void *)buffd, width);
      ialp = alp * (1 << 15);
      dalp = _mm_set1_epi16(ialp);
      ralp = _mm_set1_epi16((1 << 15) - ialp);
      emask = mlib_emask_m128i[(3 * width) & 15].m128i;
      pz = (void *)buffd;
      tdp = dp;
      for (i = 0; i <= 3 * width - 16; i += 16) {
        s0 = _mm_load_si128((__m128i *) (pz + 2 * i));
        s1 = _mm_load_si128((__m128i *) (pz + 2 * i + 16));
        dd = _mm_loadu_si128((__m128i *) (tdp + i));
        d0 = _mm_unpacklo_epi8(mzero, dd);
        d1 = _mm_unpackhi_epi8(mzero, dd);
        d0 = _mm_add_epi16(_mm_mulhi_epu16(s0, dalp), _mm_mulhi_epu16(d0, ralp));
        d1 = _mm_add_epi16(_mm_mulhi_epu16(s1, dalp), _mm_mulhi_epu16(d1, ralp));
        d0 = _mm_srli_epi16(d0, 7);
        d1 = _mm_srli_epi16(d1, 7);
        dr = _mm_packus_epi16(d0, d1);
        _mm_storeu_si128((__m128i *) (tdp + i), dr);
      }
      if (i < 3 * width) {
        s0 = _mm_load_si128((__m128i *) (pz + 2 * i));
        s1 = _mm_load_si128((__m128i *) (pz + 2 * i + 16));
        dd = _mm_loadu_si128((__m128i *) (tdp + i));
        d0 = _mm_unpacklo_epi8(mzero, dd);
        d1 = _mm_unpackhi_epi8(mzero, dd);
        d0 = _mm_add_epi16(_mm_mulhi_epu16(s0, dalp), _mm_mulhi_epu16(d0, ralp));
        d1 = _mm_add_epi16(_mm_mulhi_epu16(s1, dalp), _mm_mulhi_epu16(d1, ralp));
        d0 = _mm_srli_epi16(d0, 7);
        d1 = _mm_srli_epi16(d1, 7);
        dr = _mm_packus_epi16(d0, d1);
        dr = _mm_or_si128(_mm_and_si128(emask, dr), _mm_andnot_si128(emask, dd));
        _mm_storeu_si128((__m128i *) (tdp + i), dr);
      }
    } else if (blend == MLIB_BLEND_GTK_SRC) {
      mlib_u8 *buffi = (mlib_u8 *)buffz + 1;
      for (i = 0; i < width; i++) {
        tdp[0] = buffi[0];
        tdp[1] = buffi[2];
        tdp[2] = buffi[4];
        tdp[alp_ind] = 255;
        tdp += 4;
        buffi += 8;
      }
    } else {
      mlib_d64 _w0 = param->alpha;
      mlib_d64 _w1s = 1.0 - _w0 * (1.0 / 255);
      __m128i buff[1];
__m128i done; __m128i dalp, ralp, ss, dd, s0, s1, d0, d1, a0, a1, r0, r1, rr, dr; __m128i wi, aa, amask; __m128 af, w0, w1, w1s, w, rw, w0r, w1r, scale; done = _mm_set1_epi16(1 << 15); amask = _mm_set1_epi32(mask255); w0 = _mm_set_ps1(_w0); w1s = _mm_set_ps1(_w1s); scale = _mm_set_ps1(1 << 15); if (alp_ind == -1) { tdp--; for (i = 0; i < width / 4; i++) { BLEND34_SRC_OVER(0); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND34_SRC_OVER(0); buff[0] = dr; } } else { for (i = 0; i < width / 4; i++) { BLEND34_SRC_OVER(3); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND34_SRC_OVER(3); buff[0] = dr; } } for (i = 0; i < (width & 3); i++) { ((mlib_s32 *)tdp)[i] = ((mlib_s32 *)buff)[i]; } } } else if (chan_d == 3) { if (blend != MLIB_BLEND_GTK_SRC) { if (alp_ind == -1) { tdp--; } for (i = 0; i < width; i++) { ((mlib_s32 *)buffd)[i] = *(mlib_s32 *)(tdp + 3 * i); } if (alp_ind == -1) { for (i = 0; i < width2; i++) { __m128i a0, s0, d0, dd; BLEND43_SRC_OVER(0); } mlib_s_ImageChannelExtract_U8_43R_D1((void *) buffd, dp, width); } else { for (i = 0; i < width2; i++) { __m128i a0, s0, d0, dd; BLEND43_SRC_OVER(0xff); } mlib_s_ImageChannelExtract_U8_43L_D1((void *) buffd, dp, width); } } else { mlib_u8 *buffi = (mlib_u8 *)buffz + 1; if (alp_ind == -1) buffi += 2; for (i = 0; i < width; i++) { tdp[0] = buffi[0]; tdp[1] = buffi[2]; tdp[2] = buffi[4]; tdp += 3; buffi += 8; } } } else { /* if (chan_d == 4) */ if (alp_ind == -1) { tdp--; } if (blend == MLIB_BLEND_GTK_SRC) { mlib_u8 *p_alp = (mlib_u8 *)buffz + 1; mlib_s32 tail = ((mlib_s32 *)tdp)[width]; if (alp_ind != -1) p_alp += 6; for (i = 0; i < width2; i++) { __m128i a0, a1, aa, ss, d0, dd; ss = buffz[i]; a0 = _mm_loadl_epi64((void *)((mlib_d64 *) mlib_m_tbl_255DivAlpha + p_alp[0])); a1 = _mm_loadl_epi64((void *)((mlib_d64 *) mlib_m_tbl_255DivAlpha + p_alp[8])); aa = _mm_unpacklo_epi64(a0, a1); aa = _mm_or_si128(amask256, _mm_andnot_si128(amaskffff, aa)); d0 = _mm_mulhi_epu16(ss, aa); dd = _mm_packus_epi16(d0, d0); _mm_storel_epi64((void *)(tdp + 8 * i), dd); p_alp += 16; } ((mlib_s32 *)tdp)[width] = tail; } else { mlib_blend blend = param->blend; mlib_d64 alp = (param->alpha) * (1.0 / 255); __m128i buff[1]; __m128i done; __m128i ss, dd, s0, s1, d0, d1, a0, a1, r0, r1, rr, dr; __m128i wi, aa, amask, a16mask, zero_mask_i; __m128 dalp, div255, alpha, fone; __m128 af, sf, w0, w1, w1s, w, rw, w0r, w1r, scale; __m128 zero_mask, f_rnd; mlib_m128 s0u, s1u, s2u, s3u; done = _mm_set1_epi16(1 << 14); amask = _mm_set1_epi32(mask255); a16mask = _mm_set1_epi32(0xFFFF); dalp = _mm_set_ps1(alp * (1.0 / 256)); fone = _mm_set_ps1(1.0); div255 = _mm_set_ps1(1.0 / 255); scale = _mm_set_ps1(1 << 8); alpha = _mm_set_ps1((float)(param->alpha) + 0.5); f_rnd = _mm_set_ps1(0.6); if (blend == MLIB_BLEND_GTK_SRC_OVER2) { if (alp_ind == -1) { for (i = 0; i < width / 4; i++) { BLEND44(SRC_OVER2, 0); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND44(SRC_OVER2, 0); buff[0] = dr; } } else { for (i = 0; i < width / 4; i++) { BLEND44(SRC_OVER2, 3); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND44(SRC_OVER2, 3); buff[0] = dr; } } } else { if (alp_ind == -1) { for (i = 0; i < width / 4; i++) { BLEND44(SRC_OVER, 0); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND44(SRC_OVER, 0); buff[0] = dr; } } else { for (i = 0; i < width / 4; i++) { BLEND44(SRC_OVER, 3); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND44(SRC_OVER, 3); buff[0] = dr; 
} } } for (i = 0; i < (width & 3); i++) { ((mlib_s32 *)tdp)[i] = ((mlib_s32 *)buff)[i]; } } } }
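/* For orientation, the fixed-point SRC_OVER paths above approximate the usual
 * byte-domain blend; mediaLib scales alpha to 1<<15 and uses _mm_mulhi_epu16
 * instead of dividing by 255. A scalar reference (ours, not mediaLib's): */
#include <stdint.h>

static uint8_t blend_src_over_u8(uint8_t src, uint8_t dst, uint8_t a) {
  /* dst' = round(src*a/255 + dst*(255-a)/255) */
  return (uint8_t)((src * a + dst * (255 - a) + 127) / 255);
}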
void png_read_filter_row_paeth4_sse2(png_row_infop row_info, png_bytep row,
   png_const_bytep prev)
{
   /* Paeth tries to predict pixel d using the pixel to the left of it, a,
    * and two pixels from the previous row, b and c:
    *   prev: c b
    *   row:  a d
    * The Paeth function predicts d to be whichever of a, b, or c is nearest to
    * p=a+b-c.
    *
    * The first pixel has no left context, and so uses an Up filter, p = b.
    * This works naturally with our main loop's p = a+b-c if we force a and c
    * to zero.
    * Here we zero b and d, which become c and a respectively at the start of
    * the loop.
    */
   png_size_t rb;
   const __m128i zero = _mm_setzero_si128();
   __m128i pa, pb, pc, smallest, nearest;
   __m128i c, b = zero,
           a, d = zero;

   png_debug(1, "in png_read_filter_row_paeth4_sse2");

   rb = row_info->rowbytes + 4;
   while (rb > 4) {
      /* It's easiest to do this math (particularly, deal with pc) with 16-bit
       * intermediates.
       */
      c = b; b = _mm_unpacklo_epi8(load4(prev), zero);
      a = d; d = _mm_unpacklo_epi8(load4(row ), zero);

      /* (p-a) == (a+b-c - a) == (b-c) */
      pa = _mm_sub_epi16(b, c);

      /* (p-b) == (a+b-c - b) == (a-c) */
      pb = _mm_sub_epi16(a, c);

      /* (p-c) == (a+b-c - c) == (a+b-c-c) == (b-c)+(a-c) */
      pc = _mm_add_epi16(pa, pb);

      pa = abs_i16(pa);  /* |p-a| */
      pb = abs_i16(pb);  /* |p-b| */
      pc = abs_i16(pc);  /* |p-c| */

      smallest = _mm_min_epi16(pc, _mm_min_epi16(pa, pb));

      /* Paeth breaks ties favoring a over b over c. */
      nearest = if_then_else(_mm_cmpeq_epi16(smallest, pa), a,
                if_then_else(_mm_cmpeq_epi16(smallest, pb), b, c));

      /* Note `_epi8`: we need addition to wrap modulo 256. */
      d = _mm_add_epi8(d, nearest);
      store4(row, _mm_packus_epi16(d, d));

      prev += 4;
      row  += 4;
      rb   -= 4;
   }
}
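/* load4, store4, abs_i16 and if_then_else come from elsewhere in libpng's
 * intel/filter_sse2_intrinsics.c; a plausible SSE2 rendering for readers of
 * this excerpt (a sketch, not necessarily the canonical definitions): */
#include <emmintrin.h>
#include <string.h>

static __m128i load4(const void* p) {
   int tmp;
   memcpy(&tmp, p, sizeof(tmp));   /* unaligned-safe 4-byte load */
   return _mm_cvtsi32_si128(tmp);
}
static void store4(void* p, __m128i v) {
   int tmp = _mm_cvtsi128_si32(v);
   memcpy(p, &tmp, sizeof(tmp));   /* unaligned-safe 4-byte store */
}
static __m128i abs_i16(__m128i x) {
   /* plain SSE2 has no _mm_abs_epi16; max(x, 0-x) is exact for the
    * difference range used here (|x| <= 510, far from INT16_MIN) */
   return _mm_max_epi16(x, _mm_sub_epi16(_mm_setzero_si128(), x));
}
static __m128i if_then_else(__m128i c, __m128i t, __m128i e) {
   return _mm_or_si128(_mm_and_si128(c, t), _mm_andnot_si128(c, e));
}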
void Viterbi::AlignWithOutCellOff(HMMSimd* q, HMMSimd* t, ViterbiMatrix * viterbiMatrix,
                                  int maxres, ViterbiResult* result)
#endif
#endif
{
  // Linear topology of query (and template) HMM:
  // 1. The HMM has L+2 columns. Columns 1 to L contain
  //    a match state, a delete state and an insert state each.
  // 2. The Start state is M0, the virtual match state in column i=0 (j=0). (Therefore X[k][0]=ANY)
  //    This column has only a match state, and only a transition to the next match state.
  // 3. The End state is M(L+1), the virtual match state in column i=L+1 (j=L+1). (Therefore X[k][L+1]=ANY)
  //    Column L has no transitions to the delete state: tr[L][M2D]=tr[L][D2D]=0.
  // 4. Transitions I->D and D->I are ignored, since they do not appear in PsiBlast alignments
  //    (as long as the gap opening penalty d is higher than the best match score S(a,b)).
  //
  // Pairwise alignment of two HMMs:
  // 1. Pair-states for the alignment of two HMMs are
  //    MM (Q:Match T:Match), GD (Q:Gap T:Delete), IM (Q:Insert T:Match), DG (Q:Delete T:Gap), MI (Q:Match T:Insert)
  // 2. Transitions are allowed only between the MM-state and each of the four other states.
  //
  // Saving space:
  // The best score ending in pair state XY, sXY[i][j], is calculated from left to right (j = 1 to t->L)
  // and top to bottom (i = 1 to q->L). To save space, only the last row of scores calculated is kept in memory.
  // (The backtracing matrices are kept entirely in memory [O(t->L*q->L)]).
  // When the calculation has proceeded up to the point where the scores for cell (i,j) are calculated,
  //    sXY[i-1][j'] = sXY[j'] for j' >= j  (A below)
  //    sXY[i][j']   = sXY[j'] for j' <  j  (B below)
  //    sXY[i-1][j-1] = sXY_i_1_j_1         (C below)
  //    sXY[i][j]     = sXY_i_j             (D below)
  //                   j-1
  //                    j
  // i-1:  CAAAAAAAAAAAAAAAAAA
  //  i :  BBBBBBBBBBBBBD
  // Variable declarations
  const float smin = (this->local ?
0 : -FLT_MAX); //used to distinguish between SW and NW algorithms in maximization const simd_float smin_vec = simdf32_set(smin); const simd_float shift_vec = simdf32_set(shift); // const simd_float one_vec = simdf32_set(1); // 00000001 const simd_int mm_vec = simdi32_set(2); //MM 00000010 const simd_int gd_vec = simdi32_set(3); //GD 00000011 const simd_int im_vec = simdi32_set(4); //IM 00000100 const simd_int dg_vec = simdi32_set(5); //DG 00000101 const simd_int mi_vec = simdi32_set(6); //MI 00000110 const simd_int gd_mm_vec = simdi32_set(8); // 00001000 const simd_int im_mm_vec = simdi32_set(16);// 00010000 const simd_int dg_mm_vec = simdi32_set(32);// 00100000 const simd_int mi_mm_vec = simdi32_set(64);// 01000000 #ifdef VITERBI_SS_SCORE HMM * q_s = q->GetHMM(0); const unsigned char * t_index; if(ss_hmm_mode == HMM::PRED_PRED || ss_hmm_mode == HMM::DSSP_PRED ){ t_index = t->pred_index; }else if(ss_hmm_mode == HMM::PRED_DSSP){ t_index = t->dssp_index; } simd_float * ss_score_vec = (simd_float *) ss_score; #endif #ifdef AVX2 const simd_int shuffle_mask_extract = _mm256_setr_epi8(0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 0, 4, 8, 12, -1, -1, -1, -1, -1, -1, -1, -1); #endif #ifdef VITERBI_CELLOFF const __m128i tmp_vec = _mm_set_epi32(0x40000000,0x00400000,0x00004000,0x00000040);//01000000010000000100000001000000 #ifdef AVX2 const simd_int co_vec = _mm256_inserti128_si256(_mm256_castsi128_si256(tmp_vec), tmp_vec, 1); const simd_int float_min_vec = (simd_int) _mm256_set1_ps(-FLT_MAX); const simd_int shuffle_mask_celloff = _mm256_set_epi8( 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 15, 14, 13, 12, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0, 3, 2, 1, 0); #else // SSE case const simd_int co_vec = tmp_vec; const simd_int float_min_vec = (simd_int) simdf32_set(-FLT_MAX); #endif #endif // AVX2 end int i,j; //query and template match state indices simd_int i2_vec = simdi32_set(0); simd_int j2_vec = simdi32_set(0); simd_float sMM_i_j = simdf32_set(0); simd_float sMI_i_j,sIM_i_j,sGD_i_j,sDG_i_j; simd_float Si_vec; simd_float sMM_i_1_j_1; simd_float sMI_i_1_j_1; simd_float sIM_i_1_j_1; simd_float sGD_i_1_j_1; simd_float sDG_i_1_j_1; simd_float score_vec = simdf32_set(-FLT_MAX); simd_int byte_result_vec = simdi32_set(0); // Initialization of top row, i.e. 
cells (0,j) for (j=0; j <= t->L; ++j) { const unsigned int index_pos_j = j * 5; sMM_DG_MI_GD_IM_vec[index_pos_j + 0] = simdf32_set(-j*penalty_gap_template); sMM_DG_MI_GD_IM_vec[index_pos_j + 1] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 2] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 3] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_j + 4] = simdf32_set(-FLT_MAX); } // Viterbi algorithm const int queryLength = q->L; for (i=1; i <= queryLength; ++i) // Loop through query positions i { // If q is compared to t, exclude regions where overlap of q with t < min_overlap residues // Initialize cells sMM_i_1_j_1 = simdf32_set(-(i - 1) * penalty_gap_query); // initialize at (i-1,0) sIM_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i-1,jmin-1) sMI_i_1_j_1 = simdf32_set(-FLT_MAX); sDG_i_1_j_1 = simdf32_set(-FLT_MAX); sGD_i_1_j_1 = simdf32_set(-FLT_MAX); // initialize at (i,jmin-1) const unsigned int index_pos_i = 0 * 5; sMM_DG_MI_GD_IM_vec[index_pos_i + 0] = simdf32_set(-i * penalty_gap_query); // initialize at (i,0) sMM_DG_MI_GD_IM_vec[index_pos_i + 1] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 2] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 3] = simdf32_set(-FLT_MAX); sMM_DG_MI_GD_IM_vec[index_pos_i + 4] = simdf32_set(-FLT_MAX); #ifdef AVX2 unsigned long long * sCO_MI_DG_IM_GD_MM_vec = (unsigned long long *) viterbiMatrix->getRow(i); #else unsigned int *sCO_MI_DG_IM_GD_MM_vec = (unsigned int *) viterbiMatrix->getRow(i); #endif const unsigned int start_pos_tr_i_1 = (i - 1) * 7; const unsigned int start_pos_tr_i = (i) * 7; const simd_float q_m2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 2)); // M2M const simd_float q_m2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 3)); // M2D const simd_float q_d2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 4)); // D2M const simd_float q_d2d = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 5)); // D2D const simd_float q_i2m = simdf32_load((float *) (q->tr + start_pos_tr_i_1 + 6)); // I2m const simd_float q_i2i = simdf32_load((float *) (q->tr + start_pos_tr_i)); // I2I const simd_float q_m2i = simdf32_load((float *) (q->tr + start_pos_tr_i + 1)); // M2I // Find maximum score; global alignment: maxize only over last row and last column const bool findMaxInnerLoop = (local || i == queryLength); const int targetLength = t->L; #ifdef VITERBI_SS_SCORE if(ss_hmm_mode == HMM::NO_SS_INFORMATION){ // set all to log(1.0) = 0.0 for (j = 0; j <= (targetLength*VEC_SIZE); j++) // Loop through template positions j { ss_score[j] = 0.0; } }else { const float * score; if(ss_hmm_mode == HMM::PRED_PRED){ score = &S33[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0][0]; }else if (ss_hmm_mode == HMM::DSSP_PRED){ score = &S73[ (int)q_s->ss_dssp[i]][0][0]; }else{ score = &S37[ (int)q_s->ss_pred[i]][ (int)q_s->ss_conf[i]][0]; } // access SS scores and write them to the ss_score array for (j = 0; j <= (targetLength*VEC_SIZE); j++) // Loop through template positions j { ss_score[j] = ssw * score[t_index[j]]; } } #endif for (j=1; j <= targetLength; ++j) // Loop through template positions j { simd_int index_vec; simd_int res_gt_vec; // cache line optimized reading const unsigned int start_pos_tr_j_1 = (j-1) * 7; const unsigned int start_pos_tr_j = (j) * 7; const simd_float t_m2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+2)); // M2M const simd_float t_m2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+3)); // M2D const simd_float t_d2m = simdf32_load((float *) 
(t->tr+start_pos_tr_j_1+4)); // D2M const simd_float t_d2d = simdf32_load((float *) (t->tr+start_pos_tr_j_1+5)); // D2D const simd_float t_i2m = simdf32_load((float *) (t->tr+start_pos_tr_j_1+6)); // I2m const simd_float t_i2i = simdf32_load((float *) (t->tr+start_pos_tr_j)); // I2i const simd_float t_m2i = simdf32_load((float *) (t->tr+start_pos_tr_j+1)); // M2I // Find max value // CALCULATE_MAX6( sMM_i_j, // smin, // sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M], // sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M], // sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M], // sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M], // sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M], // bMM[i][j] // ); // same as sMM_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][M2M] simd_float mm_m2m_m2m_vec = simdf32_add( simdf32_add(sMM_i_1_j_1, q_m2m), t_m2m); // if mm > min { 2 } res_gt_vec = (simd_int)simdf32_gt(mm_m2m_m2m_vec, smin_vec); byte_result_vec = simdi_and(res_gt_vec, mm_vec); sMM_i_j = simdf32_max(smin_vec, mm_m2m_m2m_vec); // same as sGD_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][D2M] simd_float gd_m2m_d2m_vec = simdf32_add( simdf32_add(sGD_i_1_j_1, q_m2m), t_d2m); // if gd > max { 3 } res_gt_vec = (simd_int)simdf32_gt(gd_m2m_d2m_vec, sMM_i_j); index_vec = simdi_and( res_gt_vec, gd_vec); byte_result_vec = simdi_or( index_vec, byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, gd_m2m_d2m_vec); // same as sIM_i_1_j_1 + q->tr[i-1][I2M] + t->tr[j-1][M2M] simd_float im_m2m_d2m_vec = simdf32_add( simdf32_add(sIM_i_1_j_1, q_i2m), t_m2m); // if im > max { 4 } MAX2(im_m2m_d2m_vec, sMM_i_j, im_vec,byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, im_m2m_d2m_vec); // same as sDG_i_1_j_1 + q->tr[i-1][D2M] + t->tr[j-1][M2M] simd_float dg_m2m_d2m_vec = simdf32_add( simdf32_add(sDG_i_1_j_1, q_d2m), t_m2m); // if dg > max { 5 } MAX2(dg_m2m_d2m_vec, sMM_i_j, dg_vec,byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, dg_m2m_d2m_vec); // same as sMI_i_1_j_1 + q->tr[i-1][M2M] + t->tr[j-1][I2M], simd_float mi_m2m_d2m_vec = simdf32_add( simdf32_add(sMI_i_1_j_1, q_m2m), t_i2m); // if mi > max { 6 } MAX2(mi_m2m_d2m_vec, sMM_i_j, mi_vec, byte_result_vec); sMM_i_j = simdf32_max(sMM_i_j, mi_m2m_d2m_vec); // TODO add secondary structure score // calculate amino acid profile-profile scores Si_vec = log2f4(ScalarProd20Vec((simd_float *) q->p[i],(simd_float *) t->p[j])); #ifdef VITERBI_SS_SCORE Si_vec = simdf32_add(ss_score_vec[j], Si_vec); #endif Si_vec = simdf32_add(Si_vec, shift_vec); sMM_i_j = simdf32_add(sMM_i_j, Si_vec); //+ ScoreSS(q,t,i,j) + shift + (Sstruc==NULL? 
0: Sstruc[i][j]); const unsigned int index_pos_j = (j * 5); const unsigned int index_pos_j_1 = (j - 1) * 5; const simd_float sMM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 0)); const simd_float sGD_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 3)); const simd_float sIM_j_1 = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j_1 + 4)); const simd_float sMM_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 0)); const simd_float sDG_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 1)); const simd_float sMI_j = simdf32_load((float *) (sMM_DG_MI_GD_IM_vec + index_pos_j + 2)); sMM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 0)); sDG_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 1)); sMI_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 2)); sGD_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 3)); sIM_i_1_j_1 = simdf32_load((float *)(sMM_DG_MI_GD_IM_vec + index_pos_j + 4)); // sGD_i_j = max2 // ( // sMM[j-1] + t->tr[j-1][M2D], // MM->GD gap opening in query // sGD[j-1] + t->tr[j-1][D2D], // GD->GD gap extension in query // bGD[i][j] // ); //sMM_DG_GD_MI_IM_vec simd_float mm_gd_vec = simdf32_add(sMM_j_1, t_m2d); // MM->GD gap opening in query simd_float gd_gd_vec = simdf32_add(sGD_j_1, t_d2d); // GD->GD gap extension in query // if mm_gd > gd_dg { 8 } MAX2_SET_MASK(mm_gd_vec, gd_gd_vec,gd_mm_vec, byte_result_vec); sGD_i_j = simdf32_max( mm_gd_vec, gd_gd_vec ); // sIM_i_j = max2 // ( // sMM[j-1] + q->tr[i][M2I] + t->tr[j-1][M2M] , // sIM[j-1] + q->tr[i][I2I] + t->tr[j-1][M2M], // IM->IM gap extension in query // bIM[i][j] // ); simd_float mm_mm_vec = simdf32_add(simdf32_add(sMM_j_1, q_m2i), t_m2m); simd_float im_im_vec = simdf32_add(simdf32_add(sIM_j_1, q_i2i), t_m2m); // IM->IM gap extension in query // if mm_mm > im_im { 16 } MAX2_SET_MASK(mm_mm_vec,im_im_vec, im_mm_vec, byte_result_vec); sIM_i_j = simdf32_max( mm_mm_vec, im_im_vec ); // sDG_i_j = max2 // ( // sMM[j] + q->tr[i-1][M2D], // sDG[j] + q->tr[i-1][D2D], //gap extension (DD) in query // bDG[i][j] // ); simd_float mm_dg_vec = simdf32_add(sMM_j, q_m2d); simd_float dg_dg_vec = simdf32_add(sDG_j, q_d2d); //gap extension (DD) in query // if mm_dg > dg_dg { 32 } MAX2_SET_MASK(mm_dg_vec,dg_dg_vec, dg_mm_vec, byte_result_vec); sDG_i_j = simdf32_max( mm_dg_vec , dg_dg_vec ); // sMI_i_j = max2 // ( // sMM[j] + q->tr[i-1][M2M] + t->tr[j][M2I], // MM->MI gap opening M2I in template // sMI[j] + q->tr[i-1][M2M] + t->tr[j][I2I], // MI->MI gap extension I2I in template // bMI[i][j] // ); simd_float mm_mi_vec = simdf32_add( simdf32_add(sMM_j, q_m2m), t_m2i); // MM->MI gap opening M2I in template simd_float mi_mi_vec = simdf32_add( simdf32_add(sMI_j, q_m2m), t_i2i); // MI->MI gap extension I2I in template // if mm_mi > mi_mi { 64 } MAX2_SET_MASK(mm_mi_vec, mi_mi_vec,mi_mm_vec, byte_result_vec); sMI_i_j = simdf32_max( mm_mi_vec, mi_mi_vec ); // Cell of logic // if (cell_off[i][j]) //shift 10000000100000001000000010000000 -> 01000000010000000100000001000000 //because 10000000000000000000000000000000 = -2147483648 kills cmplt #ifdef VITERBI_CELLOFF #ifdef AVX2 // if(((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x4040404040404040) > 0){ // std::cout << ((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x4040404040404040 ) << std::endl; // } simd_int matrix_vec = _mm256_set1_epi64x(sCO_MI_DG_IM_GD_MM_vec[j]>>1); matrix_vec = _mm256_shuffle_epi8(matrix_vec,shuffle_mask_celloff); #else // if(((sCO_MI_DG_IM_GD_MM_vec[j] 
>>1) & 0x40404040) > 0){ // std::cout << ((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x40404040 ) << std::endl; // } simd_int matrix_vec = simdi32_set(sCO_MI_DG_IM_GD_MM_vec[j]>>1); #endif simd_int cell_off_vec = simdi_and(matrix_vec, co_vec); simd_int res_eq_co_vec = simdi32_gt(co_vec, cell_off_vec ); // shift is because signed can't be checked here simd_float cell_off_float_min_vec = (simd_float) simdi_andnot(res_eq_co_vec, float_min_vec); // inverse // if(((sCO_MI_DG_IM_GD_MM_vec[j] >>1) & 0x4040404040404040) > 0){ // for(int i = 0; i < 8; i++){ // std::cout << i << " " << j << " " << ((float *) &cell_off_float_min_vec )[i] << " "; // } // std::cout << std::endl; // } sMM_i_j = simdf32_add(sMM_i_j,cell_off_float_min_vec); // add the cell off vec to sMM_i_j. Set -FLT_MAX to cell off sGD_i_j = simdf32_add(sGD_i_j,cell_off_float_min_vec); sIM_i_j = simdf32_add(sIM_i_j,cell_off_float_min_vec); sDG_i_j = simdf32_add(sDG_i_j,cell_off_float_min_vec); sMI_i_j = simdf32_add(sMI_i_j,cell_off_float_min_vec); #endif simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 0), sMM_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 1), sDG_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 2), sMI_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 3), sGD_i_j); simdf32_store((float *)(sMM_DG_MI_GD_IM_vec+index_pos_j + 4), sIM_i_j); // write values back to ViterbiMatrix #ifdef AVX2 /* byte_result_vec 000H 000G 000F 000E 000D 000C 000B 000A */ /* abcdefgh 0000 0000 HGFE 0000 0000 0000 0000 DCBA */ const __m256i abcdefgh = _mm256_shuffle_epi8(byte_result_vec, shuffle_mask_extract); /* abcd 0000 0000 0000 DCBA */ const __m128i abcd = _mm256_castsi256_si128(abcdefgh); /* efgh 0000 0000 HGFE 0000 */ const __m128i efgh = _mm256_extracti128_si256(abcdefgh, 1); _mm_storel_epi64((__m128i*)&sCO_MI_DG_IM_GD_MM_vec[j], _mm_or_si128(abcd, efgh)); #else byte_result_vec = _mm_packs_epi32(byte_result_vec, byte_result_vec); byte_result_vec = _mm_packus_epi16(byte_result_vec, byte_result_vec); int int_result = _mm_cvtsi128_si32(byte_result_vec); sCO_MI_DG_IM_GD_MM_vec[j] = int_result; #endif // Find maximum score; global alignment: maxize only over last row and last column // if(sMM_i_j>score && (par.loc || i==q->L)) { i2=i; j2=j; score=sMM_i_j; } if (findMaxInnerLoop){ // new score is higer // output // 0 0 0 MAX simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec); // old score is higher // output // MAX MAX MAX 0 simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec); simd_int curr_pos_j = simdi32_set(j); simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j); simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec); j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo); simd_int curr_pos_i = simdi32_set(i); simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i); simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec); i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo); score_vec=simdf32_max(sMM_i_j,score_vec); } } //end for j // if global alignment: look for best cell in last column if (!local){ // new score is higer // output // 0 0 0 MAX simd_int lookup_mask_hi = (simd_int) simdf32_gt(sMM_i_j,score_vec); // old score is higher // output // MAX MAX MAX 0 simd_int lookup_mask_lo = (simd_int) simdf32_lt(sMM_i_j,score_vec); simd_int curr_pos_j = simdi32_set(j); simd_int new_j_pos_hi = simdi_and(lookup_mask_hi,curr_pos_j); simd_int old_j_pos_lo = simdi_and(lookup_mask_lo,j2_vec); j2_vec = simdi32_add(new_j_pos_hi,old_j_pos_lo); simd_int curr_pos_i = 
simdi32_set(i); simd_int new_i_pos_hi = simdi_and(lookup_mask_hi,curr_pos_i); simd_int old_i_pos_lo = simdi_and(lookup_mask_lo,i2_vec); i2_vec = simdi32_add(new_i_pos_hi,old_i_pos_lo); score_vec = simdf32_max(sMM_i_j,score_vec); } // end for j } // end for i for(int seq_index=0; seq_index < maxres; seq_index++){ result->score[seq_index]=((float*)&score_vec)[seq_index]; result->i[seq_index] = ((int*)&i2_vec)[seq_index]; result->j[seq_index] = ((int*)&j2_vec)[seq_index]; } // printf("Template=%-12.12s i=%-4i j=%-4i score=%6.3f\n",t->name,i2,j2,score); }
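/* The inner-loop bookkeeping above (lookup_mask_hi / lookup_mask_lo) is a
 * branchless per-lane "max with argmax": a compare mask selects, lane by lane,
 * whether the new score and its (i,j) replace the old ones. A compact raw-SSE
 * sketch of the same idea (ours; equivalent except when scores tie exactly,
 * where the original's add-of-two-masks form zeroes the stored index): */
#include <emmintrin.h>
#include <xmmintrin.h>

/* Per float lane: if s > *best, record idx and take s; otherwise keep both. */
static inline void update_best(__m128 s, __m128i idx,
                               __m128 *best, __m128i *best_idx) {
  const __m128i gt = _mm_castps_si128(_mm_cmpgt_ps(s, *best));
  *best_idx = _mm_or_si128(_mm_and_si128(gt, idx),
                           _mm_andnot_si128(gt, *best_idx));
  *best = _mm_max_ps(s, *best);
}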
mlib_status FUNC( MxN) (
    mlib_image *dst, const mlib_image *src, const mlib_s32 **dmask,
    mlib_s32 m, mlib_s32 n, mlib_s32 scale, const void *colormap)
{
  mlib_type stype, dtype;
  const mlib_s32 *dmask0 = dmask[0], *dmask1 = dmask[1], *dmask2 = dmask[2];
  mlib_s32 method = mlib_ImageGetMethod(colormap);
  mlib_u8 *sl, *dl;
  mlib_s32 schan, dchan, sll, dll, sw, sh, dw, dh, num_blk;
  mlib_s32 off, off1, kw, mstep, line_size, kern_size, xsize16, i, j, k;
  __m128i *pbuff, *pb;
  mlib_u8 *p_dim;
  mlib_s16 *kern, *pkern;
  __m128i *dkern;
  mlib_d64 dscale, dscale0, dscale1, dscale2;
  __m128i ss, d0, d1, k0, k1;
  /* was: _mm_xor_si128(_s_zero, _s_zero), which reads the variable in its own
   * initializer (undefined behavior); use the zeroing intrinsic instead */
  __m128i _s_zero = _mm_setzero_si128();
  mlib_s32 step0, half_step0, v0;
  mlib_s32 bit_offset = mlib_ImageGetBitOffset(dst);
  mlib_u8 *p_lut;

  MLIB_IMAGE_GET_ALL_PARAMS(dst, dtype, dchan, dw, dh, dll, dl);
  MLIB_IMAGE_GET_ALL_PARAMS(src, stype, schan, sw, sh, sll, sl);
  p_lut = (mlib_u8 *)mlib_ImageGetLutInversTable(colormap);
  step0 = abs(p_lut[1] - p_lut[0]);
  num_blk = (sw + (m - 1)) / m;
  mstep = m * NCHAN;
  line_size = (mstep * num_blk + 15) & ~15;
  xsize16 = (NCHAN * sw + 15) / 16;
  dscale = 1.0;
  while (scale > 30) {
    dscale *= 1.0 / (1 << 30);
    scale -= 30;
  }
  dscale /= (1 << scale);
  dscale0 = dscale * step0;
  half_step0 = (step0 - 1) >> 1;
  kern_size = n * line_size;
  kern = __mlib_malloc(kern_size * sizeof (mlib_s16));
  if (kern == NULL)
    return (MLIB_FAILURE);
  for (j = 0; j < n; j++) {
    for (i = 0; i < m; i++) {
      pkern = kern + j * line_size + i;
      v0 = half_step0 - (mlib_s32)(dmask0[j * m + i] * dscale0);
      for (k = 0; k < num_blk; k++) {
        pkern[k * mstep] = v0;
      }
    }
  }
  pbuff = __mlib_malloc(xsize16 * sizeof (__m128i) + 16);
  if (pbuff == NULL) {
    __mlib_free(kern);
    return (MLIB_FAILURE);
  }
  pkern = kern;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif
  for (j = 0; j < sh; j++) {
    dkern = (__m128i *)pkern;
    __m128i *sp = (__m128i *)sl;
    pb = pbuff;
#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif
    for (i = 0; i < xsize16; i++) {
      ss = _mm_loadu_si128(sp);
      sp++;
      k0 = _mm_loadu_si128(dkern);
      dkern++;
      k1 = _mm_loadu_si128(dkern);
      dkern++;
      d0 = _mm_unpacklo_epi8(ss, _s_zero);
      d1 = _mm_unpackhi_epi8(ss, _s_zero);
      d0 = _mm_add_epi16(d0, k0);
      d1 = _mm_add_epi16(d1, k1);
      d1 = _mm_packus_epi16(d0, d1);
      _mm_storeu_si128(pb, d1);
      pb++;
    }
    pkern += line_size;
    if (pkern >= kern + kern_size)
      pkern = kern;
    mlib_ImageColorTrue2IndexLine_U8_BIT_1((mlib_u8 *)pbuff, dl, bit_offset, sw, colormap);
    sl += sll;
    dl += dll;
  }
  __mlib_free(pbuff);
  __mlib_free(kern);
  return (MLIB_SUCCESS);
}
void vp9_filter_block1d16_h8_intrin_ssse3(unsigned char *src_ptr, unsigned int src_pixels_per_line, unsigned char *output_ptr, unsigned int output_pitch, unsigned int output_height, int16_t *filter) { __m128i addFilterReg64, filtersReg, srcReg1, srcReg2; __m128i filt1Reg, filt2Reg, filt3Reg, filt4Reg; __m128i firstFilters, secondFilters, thirdFilters, forthFilters; __m128i srcRegFilt1_1, srcRegFilt2_1, srcRegFilt2, srcRegFilt3; unsigned int i; // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64 addFilterReg64 = _mm_set1_epi32((int)0x0400040u); filtersReg = _mm_loadu_si128((__m128i *)filter); // converting the 16 bit (short) to 8 bit (byte) and have the same data // in both lanes of 128 bit register. filtersReg =_mm_packs_epi16(filtersReg, filtersReg); // duplicate only the first 16 bits (first and second byte) // across 128 bit register firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x100u)); // duplicate only the second 16 bits (third and forth byte) // across 128 bit register secondFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x302u)); // duplicate only the third 16 bits (fifth and sixth byte) // across 128 bit register thirdFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x504u)); // duplicate only the forth 16 bits (seventh and eighth byte) // across 128 bit register forthFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi16(0x706u)); filt1Reg = _mm_load_si128((__m128i const *)filt1_global); filt2Reg = _mm_load_si128((__m128i const *)filt2_global); filt3Reg = _mm_load_si128((__m128i const *)filt3_global); filt4Reg = _mm_load_si128((__m128i const *)filt4_global); for (i = 0; i < output_height; i++) { srcReg1 = _mm_loadu_si128((__m128i *)(src_ptr-3)); // filter the source buffer srcRegFilt1_1= _mm_shuffle_epi8(srcReg1, filt1Reg); srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt1_1 = _mm_maddubs_epi16(srcRegFilt1_1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, srcRegFilt2); // filter the source buffer srcRegFilt3= _mm_shuffle_epi8(srcReg1, filt2Reg); srcRegFilt2= _mm_shuffle_epi8(srcReg1, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); // reading the next 16 bytes. 
// (part of it was being read by earlier read) srcReg2 = _mm_loadu_si128((__m128i *)(src_ptr+5)); // add and saturate the results together srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); // filter the source buffer srcRegFilt2_1= _mm_shuffle_epi8(srcReg2, filt1Reg); srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt4Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt2_1 = _mm_maddubs_epi16(srcRegFilt2_1, firstFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, forthFilters); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, srcRegFilt2); // filter the source buffer srcRegFilt3= _mm_shuffle_epi8(srcReg2, filt2Reg); srcRegFilt2= _mm_shuffle_epi8(srcReg2, filt3Reg); // multiply 2 adjacent elements with the filter and add the result srcRegFilt3 = _mm_maddubs_epi16(srcRegFilt3, secondFilters); srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, thirdFilters); // add and saturate the results together srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_min_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, _mm_max_epi16(srcRegFilt3, srcRegFilt2)); srcRegFilt1_1 = _mm_adds_epi16(srcRegFilt1_1, addFilterReg64); srcRegFilt2_1 = _mm_adds_epi16(srcRegFilt2_1, addFilterReg64); // shift by 7 bit each 16 bit srcRegFilt1_1 = _mm_srai_epi16(srcRegFilt1_1, 7); srcRegFilt2_1 = _mm_srai_epi16(srcRegFilt2_1, 7); // shrink to 8 bit each 16 bits, the first lane contain the first // convolve result and the second lane contain the second convolve // result srcRegFilt1_1 = _mm_packus_epi16(srcRegFilt1_1, srcRegFilt2_1); src_ptr+=src_pixels_per_line; // save 16 bytes _mm_store_si128((__m128i*)output_ptr, srcRegFilt1_1); output_ptr+=output_pitch; } }
static int make_frame_planar_yuv_stacked ( lw_video_output_handler_t *vohp, int height, AVFrame *av_frame, PVideoFrame &as_frame ) { as_picture_t dst_picture = { { { NULL } } }; as_picture_t src_picture = { { { NULL } } }; as_assign_planar_yuv( as_frame, &dst_picture ); lw_video_scaler_handler_t *vshp = &vohp->scaler; as_video_output_handler_t *as_vohp = (as_video_output_handler_t *)vohp->private_handler; if( vshp->input_pixel_format == vshp->output_pixel_format ) for( int i = 0; i < 3; i++ ) { src_picture.data [i] = av_frame->data [i]; src_picture.linesize[i] = av_frame->linesize[i]; } else { if( convert_av_pixel_format( vshp->sws_ctx, height, av_frame, &as_vohp->scaled ) < 0 ) return -1; src_picture = as_vohp->scaled; } for( int i = 0; i < 3; i++ ) { const int src_height = height >> (i ? as_vohp->sub_height : 0); const int width = vshp->input_width >> (i ? as_vohp->sub_width : 0); const int width16 = sse2_available > 0 ? (width & ~15) : 0; const int width32 = avx2_available > 0 ? (width & ~31) : 0; const int lsb_offset = src_height * dst_picture.linesize[i]; for( int j = 0; j < src_height; j++ ) { /* Here, if available, use SIMD instructions. * Note: There is assumption that the address of a given data can be divided by 32 or 16. * The destination is always 32 byte alignment unless AviSynth legacy alignment is used. * The source is not always 32 or 16 byte alignment if the frame buffer is from libavcodec directly. */ static const uint8_t LW_ALIGN(32) sp16[32] = { /* saturation protector * For setting all upper 8 bits to zero so that saturation won't make sense. */ 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 ,0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00, 0xFF, 0x00 ,0xFF, 0x00, 0xFF, 0x00 }; uint8_t *dst = dst_picture.data[i] + j * dst_picture.linesize[i]; /* MSB: dst + k, LSB: dst + k + lsb_offset */ const uint8_t *src = src_picture.data[i] + j * src_picture.linesize[i]; /* MSB: src + 2 * k + 1, LSB: src + 2 * k */ const int _width16 = ((intptr_t)src & 15) == 0 ? width16 : 0; /* Don't use SSE2 instructions if set to 0. */ const int _width32 = ((intptr_t)src & 31) == 0 ? width32 : 0; /* Don't use AVX(2) instructions if set to 0. */ #if VC_HAS_AVX2 /* AVX, AVX2 */ for( int k = 0; k < _width32; k += 32 ) { __m256i ymm0 = _mm256_load_si256( (__m256i *)(src + 2 * k ) ); __m256i ymm1 = _mm256_load_si256( (__m256i *)(src + 2 * k + 32) ); __m256i mask = _mm256_load_si256( (__m256i *)sp16 ); __m256i ymm2 = _mm256_packus_epi16( _mm256_and_si256 ( ymm0, mask ), _mm256_and_si256 ( ymm1, mask ) ); __m256i ymm3 = _mm256_packus_epi16( _mm256_srli_epi16( ymm0, 8 ), _mm256_srli_epi16( ymm1, 8 ) ); _mm256_store_si256( (__m256i *)(dst + k + lsb_offset), _mm256_permute4x64_epi64( ymm2, _MM_SHUFFLE( 3, 1, 2, 0 ) ) ); _mm256_store_si256( (__m256i *)(dst + k ), _mm256_permute4x64_epi64( ymm3, _MM_SHUFFLE( 3, 1, 2, 0 ) ) ); } #endif /* SSE2 */ for( int k = _width32; k < _width16; k += 16 ) { __m128i xmm0 = _mm_load_si128( (__m128i *)(src + 2 * k ) ); __m128i xmm1 = _mm_load_si128( (__m128i *)(src + 2 * k + 16) ); __m128i mask = _mm_load_si128( (__m128i *)sp16 ); _mm_store_si128( (__m128i *)(dst + k + lsb_offset), _mm_packus_epi16( _mm_and_si128 ( xmm0, mask ), _mm_and_si128 ( xmm1, mask ) ) ); _mm_store_si128( (__m128i *)(dst + k ), _mm_packus_epi16( _mm_srli_epi16( xmm0, 8 ), _mm_srli_epi16( xmm1, 8 ) ) ); } for( int k = _width16; k < width; k++ ) { *(dst + k + lsb_offset) = *(src + 2 * k ); *(dst + k ) = *(src + 2 * k + 1); } } } return 0; }
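/* Consumers of the stacked format reassemble each 16-bit sample from the two
 * 8-bit planes written above: the MSB plane at dst and the LSB plane at
 * dst + lsb_offset. For reference (our sketch, names illustrative): */
#include <stdint.h>

static inline uint16_t read_stacked_sample(const uint8_t *msb,
                                           const uint8_t *lsb, int k) {
  return (uint16_t)((msb[k] << 8) | lsb[k]);
}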
void vp9_filter_block1d4_h8_intrin_ssse3(unsigned char *src_ptr,
                                         unsigned int src_pixels_per_line,
                                         unsigned char *output_ptr,
                                         unsigned int output_pitch,
                                         unsigned int output_height,
                                         int16_t *filter) {
  __m128i firstFilters, secondFilters, shuffle1, shuffle2;
  __m128i srcRegFilt1, srcRegFilt2, srcRegFilt3, srcRegFilt4;
  __m128i addFilterReg64, filtersReg, srcReg, minReg;
  unsigned int i;

  // create a register with 0,64,0,64,0,64,0,64,0,64,0,64,0,64,0,64
  addFilterReg64 = _mm_set1_epi32((int)0x0400040u);
  filtersReg = _mm_loadu_si128((__m128i *)filter);
  // convert the 16 bit (short) coefficients to 8 bit (byte) and replicate
  // the same data in both lanes of the 128 bit register
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // duplicate only the first 16 bits in the filter into the first lane
  firstFilters = _mm_shufflelo_epi16(filtersReg, 0);
  // duplicate only the third 16 bits in the filter into the first lane
  secondFilters = _mm_shufflelo_epi16(filtersReg, 0xAAu);
  // duplicate only the second 16 bits in the filter into the second lane
  // firstFilters: k0 k1 k0 k1 k0 k1 k0 k1 k2 k3 k2 k3 k2 k3 k2 k3
  firstFilters = _mm_shufflehi_epi16(firstFilters, 0x55u);
  // duplicate only the fourth 16 bits in the filter into the second lane
  // secondFilters: k4 k5 k4 k5 k4 k5 k4 k5 k6 k7 k6 k7 k6 k7 k6 k7
  secondFilters = _mm_shufflehi_epi16(secondFilters, 0xFFu);

  // load the local shuffle masks
  shuffle1 = _mm_load_si128((__m128i const *)filt1_4_h8);
  shuffle2 = _mm_load_si128((__m128i const *)filt2_4_h8);

  for (i = 0; i < output_height; i++) {
    srcReg = _mm_loadu_si128((__m128i *)(src_ptr - 3));

    // filter the source buffer
    srcRegFilt1 = _mm_shuffle_epi8(srcReg, shuffle1);
    srcRegFilt2 = _mm_shuffle_epi8(srcReg, shuffle2);

    // multiply 2 adjacent elements with the filter and add the result
    srcRegFilt1 = _mm_maddubs_epi16(srcRegFilt1, firstFilters);
    srcRegFilt2 = _mm_maddubs_epi16(srcRegFilt2, secondFilters);

    // extract the higher half of the lane
    srcRegFilt3 = _mm_srli_si128(srcRegFilt1, 8);
    srcRegFilt4 = _mm_srli_si128(srcRegFilt2, 8);

    minReg = _mm_min_epi16(srcRegFilt3, srcRegFilt2);

    // add and saturate all the results together
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt4);
    srcRegFilt3 = _mm_max_epi16(srcRegFilt3, srcRegFilt2);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, minReg);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, srcRegFilt3);
    srcRegFilt1 = _mm_adds_epi16(srcRegFilt1, addFilterReg64);

    // shift each 16 bit value right by 7 bits
    srcRegFilt1 = _mm_srai_epi16(srcRegFilt1, 7);

    // shrink each 16 bit value to 8 bits
    srcRegFilt1 = _mm_packus_epi16(srcRegFilt1, srcRegFilt1);

    src_ptr += src_pixels_per_line;

    // save only 4 bytes
    *((int *)&output_ptr[0]) = _mm_cvtsi128_si32(srcRegFilt1);

    output_ptr += output_pitch;
  }
}
void LW_FUNC_ALIGN convert_lw48_to_yuy2_sse41( int thread_id, int thread_num, void *param1, void *param2 ) { /* LW48 -> YUY2 using SSE4.1 */ COLOR_PROC_INFO *cpip = (COLOR_PROC_INFO *)param1; int start = (cpip->h * thread_id ) / thread_num; int end = (cpip->h * (thread_id + 1)) / thread_num; int w = cpip->w; BYTE *ycp_line = (BYTE *)cpip->ycp + start * cpip->line_size; BYTE *pixel_line = (BYTE *)cpip->pixelp + start * w * 2; __m128i x0, x1, x2, x3, x5, x6, x7; static const char LW_ALIGN(16) SHUFFLE_Y[16] = { 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11 }; for( int y = start; y < end; y++ ) { BYTE *ycp = ycp_line; BYTE *yuy2_ptr = pixel_line; for( int x = 0, i_step = 0; x < w; x += i_step, ycp += i_step*6, yuy2_ptr += i_step*2 ) { x5 = _mm_loadu_si128((__m128i *)(ycp + 0)); x6 = _mm_loadu_si128((__m128i *)(ycp + 16)); x7 = _mm_loadu_si128((__m128i *)(ycp + 32)); x0 = _mm_blend_epi16(x5, x6, 0x80+0x10+0x02); x0 = _mm_blend_epi16(x0, x7, 0x20+0x04); x1 = _mm_blend_epi16(x5, x6, 0x40+0x20+0x01); x1 = _mm_blend_epi16(x1, x7, 0x10+0x08); x0 = _mm_shuffle_epi8(x0, _mm_load_si128((__m128i*)SHUFFLE_Y)); x1 = _mm_alignr_epi8(x1, x1, 2); x1 = _mm_shuffle_epi32(x1, _MM_SHUFFLE(1,2,3,0)); x0 = _mm_srli_epi16(x0, 8); x1 = _mm_srli_epi16(x1, 8); x5 = _mm_loadu_si128((__m128i *)(ycp + 48)); x6 = _mm_loadu_si128((__m128i *)(ycp + 64)); x7 = _mm_loadu_si128((__m128i *)(ycp + 80)); x2 = _mm_blend_epi16(x5, x6, 0x80+0x10+0x02); x2 = _mm_blend_epi16(x2, x7, 0x20+0x04); x3 = _mm_blend_epi16(x5, x6, 0x40+0x20+0x01); x3 = _mm_blend_epi16(x3, x7, 0x10+0x08); x2 = _mm_shuffle_epi8(x2, _mm_load_si128((__m128i*)SHUFFLE_Y)); x3 = _mm_alignr_epi8(x3, x3, 2); x3 = _mm_shuffle_epi32(x3, _MM_SHUFFLE(1,2,3,0)); x2 = _mm_srli_epi16(x2, 8); x3 = _mm_srli_epi16(x3, 8); x0 = _mm_packus_epi16(x0, x2); x1 = _mm_packus_epi16(x1, x3); _mm_storeu_si128((__m128i*)(yuy2_ptr + 0), _mm_unpacklo_epi8(x0, x1)); _mm_storeu_si128((__m128i*)(yuy2_ptr + 16), _mm_unpackhi_epi8(x0, x1)); int remain = w - x; i_step = (remain >= 16); i_step = (i_step<<4) + (remain & ((~(0-i_step)) & 0x0f)); } ycp_line += cpip->line_size; pixel_line += w*2; } }
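/* The branch-free tail computation at the end of the x loop above is dense:
 * (i_step << 4) contributes 16 while a full 16-pixel block remains, and
 * remain & 0x0f supplies the short tail otherwise. A plain restatement
 * (ours, not part of the original): */
static inline int lw48_i_step(int remain) {
  return (remain >= 16) ? 16 : remain;
}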