static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
                    int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 8;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound = _mm_set1_epi16(1 << 7);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
      const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
      const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
      const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
      const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
      const __m128i v4 = _mm_adds_epu16(v2, v3);
      const __m128i v5 = _mm_adds_epu16(v4, kRound);
      const __m128i v6 = _mm_srli_epi16(v5, 8);
      const __m128i v7 = _mm_packus_epi16(v6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], v7);
    }
  }
  width -= x;
  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
}
static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
                   __m128i *sum_1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
  const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);

  const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
  const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
  const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
  const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);

  const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
  const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
  const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
  const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);

  __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
  // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
  __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);

  __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
  *sum_0 = sum_u16;

  shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);
  shift_right = _mm_srli_si128(diff_sq_1_u16, 2);

  sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);
  *sum_1 = sum_u16;
}
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0 = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  // Horizontal reduction: fold the eight 16-bit column sums down to one lane.
  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}
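/*
 * A scalar model of what vp9_avg_8x8_sse2() above computes: the rounded mean
 * of an 8x8 block of bytes. The helper name avg_8x8_c is hypothetical, for
 * illustration only. The total is at most 64 * 255 = 16320, so the saturating
 * adds in the SIMD version never actually saturate and this plain sum matches
 * it exactly.
 */
static unsigned int avg_8x8_c(const uint8_t *s, int p) {
  unsigned int sum = 0;
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c)
      sum += s[r * p + c];
  return (sum + 32) >> 6;  /* add half of 64 for round-to-nearest */
}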
// Load values from 'a' and 'b'. Compute the difference squared and sum
// neighboring values such that:
// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
// Values to the left and right of the row are set to 0.
// The values are returned in 'sum' as *unsigned* 16 bit values.
static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
  const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
  const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);

  const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
  const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);

  const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
  const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);

  // Shift all the values one place to the left/right so we can efficiently sum
  // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
  const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
  const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);

  // It becomes necessary to treat the values as unsigned at this point. The
  // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this
  // point forward since the filter is only applied to smooth small pixel
  // changes. Once the value has saturated to uint16_t it is well outside the
  // useful range.
  __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

  *sum = sum_u16;
}
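/*
 * Scalar model of sum_8() above (hypothetical name, illustration only):
 * out[i] = d[i-1] + d[i] + d[i+1] with d[i] = (a[i]-b[i])^2 and out-of-range
 * neighbors treated as 0. Because every term is non-negative, chaining the
 * saturating adds the way the SIMD code does is equivalent to clamping the
 * full three-term sum once, which is what this reference does.
 */
static void sum_8_c(const uint8_t *a, const uint8_t *b, uint16_t out[8]) {
  uint32_t d[8];
  int i;
  for (i = 0; i < 8; ++i) {
    const int diff = a[i] - b[i];
    d[i] = (uint32_t)(diff * diff);
  }
  for (i = 0; i < 8; ++i) {
    uint32_t s = d[i];
    if (i > 0) s += d[i - 1];
    if (i < 7) s += d[i + 1];
    out[i] = (s > 65535) ? 65535 : (uint16_t)s;  /* mirrors _mm_adds_epu16 */
  }
}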
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
//! \brief
//! Divide 8 16-bit uints by 255:
//! x := ((x + 1) + (x >> 8)) >> 8
//! See: http://www.alfredklomp.com/programming/sse-intrinsics/
inline __m128i
_mm_div255_epu16(__m128i x)
{
  return _mm_srli_epi16(
      _mm_adds_epu16(_mm_adds_epu16(x, _mm_set1_epi16(1)),
                     _mm_srli_epi16(x, 8)),
      8);
}
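/*
 * A quick scalar check of the identity used above (assumed test harness, not
 * part of the original source): evaluated in 32-bit arithmetic,
 * ((x + 1) + (x >> 8)) >> 8 equals x / 255 for every x in [0, 65534] (at
 * x = 65535 it is off by one, a value that never occurs when x is a product
 * of two bytes, which is at most 255 * 255 = 65025). The saturating adds in
 * the SSE version additionally clamp the quotient to 255 once
 * x + 1 + (x >> 8) overflows 16 bits (x >= 65280), keeping the result
 * byte-sized.
 */
#include <assert.h>
#include <stdint.h>

static void check_div255(void) {
  uint32_t x;
  for (x = 0; x <= 65534; ++x)
    assert(((x + 1 + (x >> 8)) >> 8) == x / 255);
}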
static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
                       const __m128i mul_constants_0,
                       const __m128i mul_constants_1, const int strength,
                       const int rounding, const int weight) {
  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
  const __m128i weight_u16 = _mm_set1_epi16(weight);
  const __m128i sixteen = _mm_set1_epi16(16);
  __m128i input_0, input_1;

  input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
  input_0 = _mm_adds_epu16(input_0, rounding_u16);

  input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
  input_1 = _mm_adds_epu16(input_1, rounding_u16);

  input_0 = _mm_srl_epi16(input_0, strength_u128);
  input_1 = _mm_srl_epi16(input_1, strength_u128);

  input_0 = _mm_min_epu16(input_0, sixteen);
  input_1 = _mm_min_epu16(input_1, sixteen);
  input_0 = _mm_sub_epi16(sixteen, input_0);
  input_1 = _mm_sub_epi16(sixteen, input_1);

  *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
  *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
}
static void GF_FUNC_ALIGN VS_CC proc_9_10_sse2(uint8_t *buff, int bstride,
                                               int width, int height,
                                               int stride, uint8_t *d,
                                               const uint8_t *s, int th) {
  const uint16_t *srcp = (uint16_t *)s;
  uint16_t *dstp = (uint16_t *)d;
  stride /= 2;
  bstride /= 2;

  uint16_t *p0 = (uint16_t *)buff + 8;
  uint16_t *p1 = p0 + bstride;
  uint16_t *p2 = p1 + bstride;
  uint16_t *orig = p0, *end = p2;

  line_copy16(p0, srcp + stride, width, 1);
  line_copy16(p1, srcp, width, 1);

  int16_t threshold = (int16_t)th;
  __m128i zero = _mm_setzero_si128();
  __m128i xth = _mm_set1_epi16(threshold);

  for (int y = 0; y < height; y++) {
    srcp += stride * (y < height - 1 ? 1 : -1);
    line_copy16(p2, srcp, width, 1);
    uint16_t *coordinates[] = COORDINATES;

    for (int x = 0; x < width; x += 8) {
      __m128i sum = zero;
      for (int i = 0; i < 8; i++) {
        __m128i xmm0 = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
        sum = _mm_adds_epu16(sum, xmm0);
      }
      sum = _mm_srai_epi16(sum, 3);

      __m128i src = _mm_load_si128((__m128i *)(p1 + x));
      __m128i limit = _mm_adds_epu16(src, xth);
      sum = MM_MAX_EPU16(sum, src);
      sum = MM_MIN_EPU16(sum, limit);
      _mm_store_si128((__m128i *)(dstp + x), sum);
    }
    dstp += stride;
    p0 = p1;
    p1 = p2;
    p2 = (p2 == end) ? orig : p2 + bstride;
  }
}
// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
                                   uint16_t *count, uint32_t *accumulator) {
  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
  const __m128i zero = _mm_setzero_si128();
  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
  __m128i pred_0_u32, pred_1_u32;
  __m128i accum_0_u32, accum_1_u32;

  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
  _mm_storeu_si128((__m128i *)count, count_u16);

  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);

  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);

  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));

  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);

  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
}
static inline __m128i max_epu16(__m128i a, __m128i b) {
  a = _mm_subs_epu16(a, b);
  b = _mm_adds_epu16(b, a);
  return b;
}
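/*
 * Why the subs/adds pair in max_epu16() works: _mm_subs_epu16 leaves
 * max(a - b, 0) in each lane, and b + max(a - b, 0) is b when a <= b and
 * exactly a otherwise, so the final add can never overflow and the
 * saturation in _mm_adds_epu16 is never triggered. Scalar model for a
 * single lane (illustrative only):
 */
static inline uint16_t max_u16_scalar(uint16_t a, uint16_t b) {
  const uint16_t diff = (a > b) ? (uint16_t)(a - b) : 0;  /* subs_epu16 */
  return (uint16_t)(b + diff);                            /* adds_epu16 */
}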
__m128i test_mm_adds_epu16(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_adds_epu16
  // DAG: call <8 x i16> @llvm.x86.sse2.paddus.w
  //
  // ASM-LABEL: test_mm_adds_epu16
  // ASM: paddusw
  return _mm_adds_epu16(A, B);
}
SIMDValue SIMDUint16x8Operation::OpAddSaturate(const SIMDValue& aValue,
                                               const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    // a + b saturated
    x86Result.m128i_value =
        _mm_adds_epu16(tmpaValue.m128i_value, tmpbValue.m128i_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
void blend_sse2(const Uint8* alpha, const Uint32 size, const Uint8* source0,
                const Uint8* source1, Uint8* dest)
{
    __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
    Uint32 i;

    for (i = 0; i < (size / 4); i++)
    {
        t0 = _mm_load_si128((__m128i*)&source0[i * 16]);
        t1 = _mm_load_si128((__m128i*)&source1[i * 16]);
        // Load four alpha bytes; use _mm_castps_si128 instead of a raw
        // C-style cast so the reinterpretation is portable across compilers.
        t2 = _mm_castps_si128(_mm_load_ss((const float*)&alpha[i * 4]));
        t2 = _mm_unpacklo_epi8(t2, t2);
        t2 = _mm_unpacklo_epi16(t2, t2);

        t3 = _mm_unpacklo_epi8(t0, t0);
        t4 = _mm_unpacklo_epi8(t1, t1);
        t5 = _mm_unpacklo_epi32(t2, t2);
        t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);
        t7 = _mm_mulhi_epu16(t3, t6);
        t8 = _mm_mulhi_epu16(t4, t5);
        t9 = _mm_adds_epu16(t7, t8);
        t9 = _mm_srli_epi16(t9, 8);

        t3 = _mm_unpackhi_epi8(t0, t0);
        t4 = _mm_unpackhi_epi8(t1, t1);
        t5 = _mm_unpackhi_epi32(t2, t2);
        t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);
        t7 = _mm_mulhi_epu16(t3, t6);
        t8 = _mm_mulhi_epu16(t4, t5);
        t10 = _mm_adds_epu16(t7, t8);
        t10 = _mm_srli_epi16(t10, 8);

        t10 = _mm_packus_epi16(t9, t10);
        _mm_stream_si128((__m128i*)&dest[i * 16], t10);
    }
}
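/*
 * Approximate scalar model of one color channel of blend_sse2() above
 * (hypothetical helper, for illustration). Duplicating a byte v into both
 * halves of a 16-bit lane yields v * 257, so each _mm_mulhi_epu16 product
 * approximates v * weight / 255 scaled by 256, and the final >> 8 brings the
 * blend back to byte range: roughly dest = (s0 * (255 - a) + s1 * a) / 255.
 */
static uint8_t blend_channel_c(uint8_t s0, uint8_t s1, uint8_t a) {
  const uint32_t w0 = ((uint32_t)s0 * 257u * (0xFFFFu - a * 257u)) >> 16;
  const uint32_t w1 = ((uint32_t)s1 * 257u * ((uint32_t)a * 257u)) >> 16;
  uint32_t sum = w0 + w1;         /* t9/t10 = _mm_adds_epu16(t7, t8) */
  if (sum > 65535u) sum = 65535u;
  return (uint8_t)(sum >> 8);     /* _mm_srli_epi16(..., 8) then pack */
}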
__m64 _m_paddusw(__m64 _MM1, __m64 _MM2)
{
    __m128i lhs = {0}, rhs = {0};
    // MSVC-specific union fields: widen both MMX operands into the low
    // 64 bits of an SSE register.
    lhs.m128i_i64[0] = _MM1.m64_i64;
    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_adds_epu16(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y             + 26149 * v - 14234) >> 6
// G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
// B = (19077 * y + 33050 * u             - 17685) >> 6
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
                                     const __m128i* const U0,
                                     const __m128i* const V0,
                                     __m128i* const R,
                                     __m128i* const G,
                                     __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419 = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708 = _mm_set1_epi16(8708);

  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);  // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);  // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);  // range: [0, 34238]
}
// Average the value based on the number of values summed (9 for pixels away
// from the border, 4 for pixels in corners, and 6 for other edge values).
//
// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
// by weight.
static __m128i average_8(__m128i sum, const __m128i mul_constants,
                         const int strength, const int rounding,
                         const int weight) {
  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
  const __m128i weight_u16 = _mm_set1_epi16(weight);
  const __m128i sixteen = _mm_set1_epi16(16);

  // modifier * 3 / index;
  sum = _mm_mulhi_epu16(sum, mul_constants);

  sum = _mm_adds_epu16(sum, rounding_u16);
  sum = _mm_srl_epi16(sum, strength_u128);

  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
  // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385
  // So this needs to use the epu16 version which did not come until SSE4.
  sum = _mm_min_epu16(sum, sixteen);

  sum = _mm_sub_epi16(sixteen, sum);

  return _mm_mullo_epi16(sum, weight_u16);
}
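/*
 * Scalar model of average_8() for a single lane (illustrative only).
 * mul_constants holds 16-bit fixed-point reciprocals, so the mulhi step
 * computes roughly sum * 3 / index as the comment above describes.
 */
static uint16_t average_lane_c(uint16_t sum, uint16_t mul, int strength,
                               int rounding, int weight) {
  uint32_t m = ((uint32_t)sum * mul) >> 16;        /* _mm_mulhi_epu16 */
  m += (uint32_t)rounding;
  if (m > 65535u) m = 65535u;                      /* _mm_adds_epu16 */
  m >>= strength;                                  /* _mm_srl_epi16 */
  if (m > 16u) m = 16u;                            /* _mm_min_epu16 */
  return (uint16_t)((16u - m) * (uint32_t)weight); /* invert, then weight */
}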
void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
                                      const uint8_t *b, unsigned int width,
                                      unsigned int height, int strength,
                                      int weight, uint32_t *accumulator,
                                      uint16_t *count) {
  unsigned int h;
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;

  assert(strength >= 0);
  assert(strength <= 6);

  assert(weight >= 0);
  assert(weight <= 2);

  assert(width == 8 || width == 16);

  if (width == 8) {
    __m128i sum_row_a, sum_row_b, sum_row_c;
    __m128i mul_constants = _mm_setr_epi16(
        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

    sum_8(a, b, &sum_row_a);
    sum_8(a + stride, b + width, &sum_row_b);
    sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_c, b, count, accumulator);

    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;

    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);

    for (h = 0; h < height - 2; ++h) {
      sum_8(a, b + width, &sum_row_c);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
      sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
      accumulate_and_store_8(sum_row_a, b, count, accumulator);

      a += stride;
      b += width;
      count += width;
      accumulator += width;

      sum_row_a = sum_row_b;
      sum_row_b = sum_row_c;
    }

    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_a, b, count, accumulator);
  } else {  // width == 16
    __m128i sum_row_a_0, sum_row_a_1;
    __m128i sum_row_b_0, sum_row_b_1;
    __m128i sum_row_c_0, sum_row_c_1;
    __m128i mul_constants_0 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
            mul_constants_1 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

    sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
    sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);

    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);

    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;

    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
    for (h = 0; h < height - 2; ++h) {
      sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);

      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);

      average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1,
                 strength, rounding, weight);
      accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);

      a += stride;
      b += width;
      count += width;
      accumulator += width;

      sum_row_a_0 = sum_row_b_0;
      sum_row_a_1 = sum_row_b_1;
      sum_row_b_0 = sum_row_c_0;
      sum_row_b_1 = sum_row_c_1;
    }

    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
  }
}
int global_sse2_word(int queryLength, unsigned short *profile,
                     const unsigned char *dbSeq, int dbLength,
                     unsigned short gapOpen, unsigned short gapExtend,
                     unsigned short ceiling, struct f_struct *f_str)
{
  int i, j;
  int score;
  int scale;
  int temp;
  int distance;
  int offset;
  int position;
  int cmp;
  int iter;

  __m128i *pvH;
  __m128i *pvE;
  __m128i vE, vF, vH;
  __m128i vHNext;
  __m128i vFPrev;
  __m128i vGapOpen;
  __m128i vGapExtend;
  __m128i vCeiling;
  __m128i vScale;
  __m128i vScaleAmt;
  __m128i vScaleTmp;
  __m128i vTemp;
  __m128i vNull;
  __m128i *pvScore;

  scale = 0;
  iter = (queryLength + 7) / 8;
  offset = (queryLength - 1) % iter;
  position = 7 - (queryLength - 1) / iter;

  pvH = (__m128i *)f_str->workspace;
  pvE = pvH + iter;

  /* Load gap opening penalty to all elements of a constant */
  /* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapOpen = _mm_setzero_si128();
  vGapOpen = _mm_insert_epi16(vGapOpen, gapOpen, 0);
  vGapOpen = _mm_shufflelo_epi16(vGapOpen, 0);
  vGapOpen = _mm_shuffle_epi32(vGapOpen, 0);

  /* Load gap extension penalty to all elements of a constant */
  /* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapExtend = _mm_setzero_si128();
  vGapExtend = _mm_insert_epi16(vGapExtend, gapExtend, 0);
  vGapExtend = _mm_shufflelo_epi16(vGapExtend, 0);
  vGapExtend = _mm_shuffle_epi32(vGapExtend, 0);

  /* Generate the ceiling before scaling */
  /* transferred from Apple Devel smith_waterman_sse2.c fix */
  vTemp = _mm_setzero_si128();
  vTemp = _mm_insert_epi16(vTemp, ceiling, 0);
  vTemp = _mm_shufflelo_epi16(vTemp, 0);
  vTemp = _mm_shuffle_epi32(vTemp, 0);
  vCeiling = _mm_cmpeq_epi16(vTemp, vTemp);
  vCeiling = _mm_srli_epi16(vCeiling, 1);
  vCeiling = _mm_subs_epi16(vCeiling, vTemp);
  vCeiling = _mm_subs_epi16(vCeiling, vGapOpen);

  vNull = _mm_cmpeq_epi16(vTemp, vTemp);
  vNull = _mm_slli_epi16(vNull, 15);
  vScaleAmt = _mm_xor_si128(vNull, vNull);

  /* Zero out the storage vector */
  vTemp = _mm_adds_epi16(vNull, vGapOpen);
  for (i = 0; i < iter; i++) {
    _mm_store_si128(pvH + i, vTemp);
    _mm_store_si128(pvE + i, vNull);
  }

  /* initialize F */
  vF = vNull;
  vFPrev = vNull;

  /* load and scale H for the next round */
  vTemp = _mm_srli_si128(vGapOpen, 14);
  vH = _mm_load_si128(pvH + iter - 1);
  vH = _mm_adds_epi16(vH, vTemp);

  for (i = 0; i < dbLength; ++i) {
    /* fetch first data asap. */
    pvScore = (__m128i *)profile + dbSeq[i] * iter;

    vF = vNull;
    vH = _mm_max_epi16(vH, vFPrev);
    for (j = 0; j < iter; j++) {
      /* correct H from the previous columns F */
      vHNext = _mm_load_si128(pvH + j);
      vHNext = _mm_max_epi16(vHNext, vFPrev);

      /* load and correct E value */
      vE = _mm_load_si128(pvE + j);
      vTemp = _mm_subs_epi16(vHNext, vGapOpen);
      vE = _mm_max_epi16(vE, vTemp);
      _mm_store_si128(pvE + j, vE);

      /* add score to vH */
      vH = _mm_adds_epi16(vH, *pvScore++);

      /* get max from vH, vE and vF */
      vH = _mm_max_epi16(vH, vE);
      vH = _mm_max_epi16(vH, vF);
      _mm_store_si128(pvH + j, vH);

      /* update vF value */
      vH = _mm_subs_epi16(vH, vGapOpen);
      vF = _mm_max_epi16(vF, vH);

      /* load the next h values */
      vH = vHNext;
    }

    /* check if we need to scale before the next round */
    vTemp = _mm_cmpgt_epi16(vF, vCeiling);
    cmp = _mm_movemask_epi8(vTemp);

    /* broadcast F values */
    vF = _mm_xor_si128(vF, vNull);

    vTemp = _mm_slli_si128(vF, 2);
    vTemp = _mm_subs_epu16(vTemp, vScaleAmt);
    vF = max_epu16(vF, vTemp);

    vTemp = _mm_slli_si128(vF, 4);
    vScaleTmp = _mm_slli_si128(vScaleAmt, 2);
    vScaleTmp = _mm_adds_epu16(vScaleTmp, vScaleAmt);
    vTemp = _mm_subs_epu16(vTemp, vScaleTmp);
    vF = max_epu16(vF, vTemp);

    vTemp = _mm_slli_si128(vScaleTmp, 4);
    vScaleTmp = _mm_adds_epu16(vScaleTmp, vTemp);
    vTemp = _mm_slli_si128(vF, 8);
    vTemp = _mm_subs_epu16(vTemp, vScaleTmp);
    vF = max_epu16(vF, vTemp);

    /* scale if necessary */
    if (cmp != 0x0000) {
      __m128i vScale1;
      __m128i vScale2;

      vScale = _mm_slli_si128(vF, 2);
      vScale = _mm_subs_epu16(vScale, vGapOpen);
      vScale = _mm_subs_epu16(vScale, vScaleAmt);

      vTemp = _mm_slli_si128(vScale, 2);
      vTemp = _mm_subs_epu16(vScale, vTemp);
      vScaleAmt = _mm_adds_epu16(vScaleAmt, vTemp);
      vTemp = _mm_slli_si128(vScale, 2);
      vTemp = _mm_subs_epu16(vTemp, vScale);
      vScaleAmt = _mm_subs_epu16(vScaleAmt, vTemp);

      /* rescale the previous F */
      vF = _mm_subs_epu16(vF, vScale);

      /* check if we can continue in signed 16-bits */
      vTemp = _mm_xor_si128(vF, vNull);
      vTemp = _mm_cmpgt_epi16(vTemp, vCeiling);
      cmp = _mm_movemask_epi8(vTemp);
      if (cmp != 0x0000) {
        return OVERFLOW_SCORE;
      }

      vTemp = _mm_adds_epi16(vCeiling, vCeiling);
      vScale1 = _mm_subs_epu16(vScale, vTemp);
      vScale2 = _mm_subs_epu16(vScale, vScale1);

      /* scale all the vectors */
      for (j = 0; j < iter; j++) {
        /* load H and E */
        vH = _mm_load_si128(pvH + j);
        vE = _mm_load_si128(pvE + j);

        /* get max from vH, vE and vF */
        vH = _mm_subs_epi16(vH, vScale1);
        vH = _mm_subs_epi16(vH, vScale2);
        vE = _mm_subs_epi16(vE, vScale1);
        vE = _mm_subs_epi16(vE, vScale2);

        /* save the H and E */
        _mm_store_si128(pvH + j, vH);
        _mm_store_si128(pvE + j, vE);
      }

      vScale = vScaleAmt;
      for (j = 0; j < position; ++j) {
        vScale = _mm_slli_si128(vScale, 2);
      }

      /* calculate the final scaling amount */
      vTemp = _mm_xor_si128(vTemp, vTemp);
      vScale1 = _mm_unpacklo_epi16(vScale, vTemp);
      vScale2 = _mm_unpackhi_epi16(vScale, vTemp);
      vScale = _mm_add_epi32(vScale1, vScale2);
      vTemp = _mm_srli_si128(vScale, 8);
      vScale = _mm_add_epi32(vScale, vTemp);
      vTemp = _mm_srli_si128(vScale, 4);
      vScale = _mm_add_epi32(vScale, vTemp);
      scale = (int)(unsigned short)_mm_extract_epi16(vScale, 0);
      temp = (int)(unsigned short)_mm_extract_epi16(vScale, 1);
      scale = scale + (temp << 16);
    }

    /* scale the F value for the next round */
    vFPrev = _mm_slli_si128(vF, 2);
    vFPrev = _mm_subs_epu16(vFPrev, vScaleAmt);
    vFPrev = _mm_xor_si128(vFPrev, vNull);

    /* load and scale H for the next round */
    vH = _mm_load_si128(pvH + iter - 1);
    vH = _mm_xor_si128(vH, vNull);
    vH = _mm_slli_si128(vH, 2);
    vH = _mm_subs_epu16(vH, vScaleAmt);
    vH = _mm_insert_epi16(vH, gapOpen, 0);
    vH = _mm_xor_si128(vH, vNull);
  }

  vH = _mm_load_si128(pvH + offset);
  vH = _mm_max_epi16(vH, vFPrev);
  for (j = 0; j < position; ++j) {
    vH = _mm_slli_si128(vH, 2);
  }
  score = (int)(signed short)_mm_extract_epi16(vH, 7);
  score = score + SHORT_BIAS;

  /* return largest score */
  distance = (queryLength + dbLength) * gapExtend;
  score = score - (gapOpen * 2) - distance + scale;

  return score;
}
void av1_highbd_jnt_convolve_2d_copy_sse4_1(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0,
    int w, int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;

  const int bits =
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  const int do_average = conv_params->do_average;
  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);
  const __m128i zero = _mm_setzero_si128();
  int i, j;

  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi32(offset);
  const __m128i offset_const_16b = _mm_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
  const __m128i clip_pixel_to_bd =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  assert(bits <= 4);

  if (!(w % 8)) {
    for (i = 0; i < h; i += 1) {
      for (j = 0; j < w; j += 8) {
        const __m128i src_16bit =
            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
        const __m128i res = _mm_sll_epi16(src_16bit, left_shift);
        if (do_average) {
          const __m128i data_0 =
              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));

          const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
          const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);

          const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero);
          const __m128i res_unsigned_lo =
              _mm_add_epi32(res_32b_lo, offset_const);

          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
          const __m128i res_unsigned_hi =
              _mm_add_epi32(res_32b_hi, offset_const);

          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
              &comp_avg_res_lo, &offset_const, &rounding_const,
              rounding_shift);
          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
              &comp_avg_res_hi, &offset_const, &rounding_const,
              rounding_shift);

          const __m128i res_16b =
              _mm_packus_epi32(round_result_lo, round_result_hi);
          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
        } else {
          const __m128i res_unsigned_16b =
              _mm_adds_epu16(res, offset_const_16b);

          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]),
                          res_unsigned_16b);
        }
      }
    }
  } else if (!(w % 4)) {
    for (i = 0; i < h; i += 2) {
      for (j = 0; j < w; j += 4) {
        const __m128i src_row_0 =
            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
        const __m128i src_row_1 =
            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
        const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1);
        const __m128i res = _mm_sll_epi16(src_10, left_shift);
        if (do_average) {
          const __m128i data_0 =
              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
          const __m128i data_1 = _mm_loadl_epi64(
              (__m128i *)(&dst[i * dst_stride + j + dst_stride]));

          const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
          const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);

          const __m128i res_32b = _mm_unpacklo_epi16(res, zero);
          const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const);

          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
          const __m128i res_unsigned_hi =
              _mm_add_epi32(res_32b_hi, offset_const);

          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
              &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
              &comp_avg_res_lo, &offset_const, &rounding_const,
              rounding_shift);
          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
              &comp_avg_res_hi, &offset_const, &rounding_const,
              rounding_shift);

          const __m128i res_16b =
              _mm_packus_epi32(round_result_lo, round_result_hi);
          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

          const __m128i res_1 = _mm_srli_si128(res_clip, 8);

          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
          _mm_storel_epi64(
              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
        } else {
          const __m128i res_unsigned_16b =
              _mm_adds_epu16(res, offset_const_16b);

          const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8);

          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]),
                           res_unsigned_16b);
          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                           res_1);
        }
      }
    }
  }
}
template <typename PixelType>  // assumed template header: the dispatch below
                               // depends on the pixel type's size
static FORCE_INLINE __m128i mm_adds_epu(const __m128i &a, const __m128i &b) {
    if (sizeof(PixelType) == 1)
        return _mm_adds_epu8(a, b);
    else
        return _mm_adds_epu16(a, b);
}
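/*
 * Usage note (assuming the template parameter restored above): with
 * PixelType = uint8_t the branch resolves at compile time to _mm_adds_epu8
 * (paddusb); any wider pixel type selects _mm_adds_epu16 (paddusw).
 *
 *   __m128i r8  = mm_adds_epu<uint8_t>(a, b);   // 16 x 8-bit lanes
 *   __m128i r16 = mm_adds_epu<uint16_t>(a, b);  // 8 x 16-bit lanes
 */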
static void GF_FUNC_ALIGN VS_CC proc_8bit_sse2(uint8_t *buff, int bstride,
                                               int width, int height,
                                               int stride, uint8_t *dstp,
                                               const uint8_t *srcp, edge_t *eh,
                                               uint16_t plane_max) {
  uint8_t *p0 = buff + 16;
  uint8_t *p1 = p0 + bstride;
  uint8_t *p2 = p1 + bstride;
  uint8_t *p3 = p2 + bstride;
  uint8_t *p4 = p3 + bstride;
  uint8_t *orig = p0;
  uint8_t *end = p4;

  line_copy8(p0, srcp + 2 * stride, width, 2);
  line_copy8(p1, srcp + stride, width, 2);
  line_copy8(p2, srcp, width, 2);
  srcp += stride;
  line_copy8(p3, srcp, width, 2);

  uint8_t th_min = eh->min > 0xFF ? 0xFF : (uint8_t)eh->min;
  uint8_t th_max = eh->max > 0xFF ? 0xFF : (uint8_t)eh->max;

  __m128i zero = _mm_setzero_si128();
  __m128i ab = _mm_set1_epi16(15);
  __m128i max = _mm_set1_epi8((int8_t)th_max);
  __m128i min = _mm_set1_epi8((int8_t)th_min);

  for (int y = 0; y < height; y++) {
    srcp += stride * (y < height - 2 ? 1 : -1);
    line_copy8(p4, srcp, width, 2);
    uint8_t *posh[] = { p2 - 2, p2 - 1, p2 + 1, p2 + 2 };
    uint8_t *posv[] = { p0, p1, p3, p4 };

    for (int x = 0; x < width; x += 16) {
      __m128i sumx[2] = { zero, zero };
      __m128i sumy[2] = { zero, zero };

      for (int i = 0; i < 4; i++) {
        __m128i xmm0, xmm1, xmul;
        xmul = _mm_load_si128((__m128i *)ar_mulx[i]);
        xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
        xmm1 = _mm_unpackhi_epi8(xmm0, zero);
        xmm0 = _mm_unpacklo_epi8(xmm0, zero);
        sumx[0] = _mm_add_epi16(sumx[0], _mm_mullo_epi16(xmm0, xmul));
        sumx[1] = _mm_add_epi16(sumx[1], _mm_mullo_epi16(xmm1, xmul));

        xmul = _mm_load_si128((__m128i *)ar_muly[i]);
        xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
        xmm1 = _mm_unpackhi_epi8(xmm0, zero);
        xmm0 = _mm_unpacklo_epi8(xmm0, zero);
        sumy[0] = _mm_add_epi16(sumy[0], _mm_mullo_epi16(xmm0, xmul));
        sumy[1] = _mm_add_epi16(sumy[1], _mm_mullo_epi16(xmm1, xmul));
      }

      for (int i = 0; i < 2; i++) {
        __m128i xmax, xmin, mull, mulh;
        sumx[i] = mm_abs_epi16(sumx[i]);
        sumy[i] = mm_abs_epi16(sumy[i]);
        xmax = _mm_max_epi16(sumx[i], sumy[i]);
        xmin = _mm_min_epi16(sumx[i], sumy[i]);

        mull = _mm_srli_epi32(
            _mm_madd_epi16(ab, _mm_unpacklo_epi16(xmax, zero)), 4);
        mulh = _mm_srli_epi32(
            _mm_madd_epi16(ab, _mm_unpackhi_epi16(xmax, zero)), 4);
        xmax = mm_cast_epi32(mull, mulh);

        mull = _mm_srli_epi32(
            _mm_madd_epi16(ab, _mm_unpacklo_epi16(xmin, zero)), 5);
        mulh = _mm_srli_epi32(
            _mm_madd_epi16(ab, _mm_unpackhi_epi16(xmin, zero)), 5);
        xmin = mm_cast_epi32(mull, mulh);

        sumx[i] = _mm_adds_epu16(xmax, xmin);
        sumx[i] = _mm_srli_epi16(sumx[i], eh->rshift);
      }

      __m128i out = _mm_packus_epi16(sumx[0], sumx[1]);
      __m128i temp = _mm_min_epu8(out, max);
      temp = _mm_cmpeq_epi8(temp, max);
      out = _mm_or_si128(temp, out);

      temp = _mm_max_epu8(out, min);
      temp = _mm_cmpeq_epi8(temp, min);
      out = _mm_andnot_si128(temp, out);

      _mm_store_si128((__m128i *)(dstp + x), out);
    }
    dstp += stride;
    p0 = p1;
    p1 = p2;
    p2 = p3;
    p3 = p4;
    p4 = (p4 == end) ? orig : p4 + bstride;
  }
}
void mlib_s_ImageBlendLine(
    mlib_work_image *param,
    mlib_u8 *dp,
    __m128i *buffz,
    __m128i *buffd)
{
    mlib_blend blend = param->blend;
    mlib_s32 chan_d = param->chan_d;
    mlib_s32 chan_s = param->channels;
    mlib_d64 alp = (param->alpha) * (1.0 / 255);
    mlib_s32 width = GetElemSubStruct(current, width);
    mlib_u8 *tdp = dp;
    mlib_s32 width2, y_step, next_step = 2;
    mlib_s32 alp_ind = param->alp_ind, mask255;
    __m128i aa, dalp, done;
    __m128i mzero, mask_7fff, mask_8000, amask, amask256, amaskffff;
    __m128i d_rnd;
    mlib_s32 i, j;

    if (!alp_ind) {
        d_rnd = _mm_set1_epi16(0x0080);
        tdp = (void *)dp;
        if (chan_d == 3)
            tdp = (void *)buffd;
        for (i = 0; i < width / 2; i++) {
            __m128i dd;
            dd = buffz[i];
            dd = _mm_adds_epu16(dd, d_rnd);
            dd = _mm_srli_epi16(dd, 8);
            dd = _mm_packus_epi16(dd, dd);
            _mm_storel_epi64((void *)(tdp + 8 * i), dd);
        }
        if (width & 1) {
            __m128i dd;
            dd = buffz[i];
            dd = _mm_adds_epu16(dd, d_rnd);
            dd = _mm_srli_epi16(dd, 8);
            dd = _mm_packus_epi16(dd, dd);
            *(mlib_s32 *)(tdp + 8 * i) = *(mlib_s32 *)&dd;
        }
        if (chan_d == 3) {
            mlib_s_ImageChannelExtract_U8_43L_D1((void *)buffd, dp, width);
        }
        return;
    }

    width2 = (width + 1) / 2;
    mzero = _mm_setzero_si128();
    mask_7fff = _mm_set1_epi16(0x7FFF);
    mask_8000 = _mm_set1_epi16(0x8000);
    done = _mm_set1_epi16(1 << 15);

    if (alp_ind == -1) {
        mask255 = 0xFF;
        amask = _mm_setr_epi32(0xff00, 0, 0xff00, 0);
        amaskffff = _mm_setr_epi32(0xffff, 0, 0xffff, 0);
        amask256 = _mm_setr_epi32(0x0100, 0, 0x0100, 0);
    } else {
        mask255 = 0xFF000000;
        amask = _mm_setr_epi32(0, 0xff000000, 0, 0xff000000);
        amaskffff = _mm_setr_epi32(0, 0xffff0000, 0, 0xffff0000);
        amask256 = _mm_setr_epi32(0, 0x01000000, 0, 0x01000000);
    }
    dalp = _mm_set1_epi16((1 << 15) * alp + 0.5);

    if (chan_s == 3) {
        if (chan_d == 3) {
            mlib_d64 alp = (param->alpha) * (1.0 / 255);
            mlib_s32 ialp;
            mlib_u8 *pz;
            __m128i emask;
            __m128i dalp, ralp, ss, dd, s0, s1, d0, d1, dr;

            mlib_s_ImageChannelExtract_S16_43L_D1((void *)buffz,
                (void *)buffd, width);

            ialp = alp * (1 << 15);
            dalp = _mm_set1_epi16(ialp);
            ralp = _mm_set1_epi16((1 << 15) - ialp);
            emask = mlib_emask_m128i[(3 * width) & 15].m128i;
            pz = (void *)buffd;
            tdp = dp;

            for (i = 0; i <= 3 * width - 16; i += 16) {
                s0 = _mm_load_si128((__m128i *)(pz + 2 * i));
                s1 = _mm_load_si128((__m128i *)(pz + 2 * i + 16));
                dd = _mm_loadu_si128((__m128i *)(tdp + i));
                d0 = _mm_unpacklo_epi8(mzero, dd);
                d1 = _mm_unpackhi_epi8(mzero, dd);
                d0 = _mm_add_epi16(_mm_mulhi_epu16(s0, dalp),
                    _mm_mulhi_epu16(d0, ralp));
                d1 = _mm_add_epi16(_mm_mulhi_epu16(s1, dalp),
                    _mm_mulhi_epu16(d1, ralp));
                d0 = _mm_srli_epi16(d0, 7);
                d1 = _mm_srli_epi16(d1, 7);
                dr = _mm_packus_epi16(d0, d1);
                _mm_storeu_si128((__m128i *)(tdp + i), dr);
            }
            if (i < 3 * width) {
                s0 = _mm_load_si128((__m128i *)(pz + 2 * i));
                s1 = _mm_load_si128((__m128i *)(pz + 2 * i + 16));
                dd = _mm_loadu_si128((__m128i *)(tdp + i));
                d0 = _mm_unpacklo_epi8(mzero, dd);
                d1 = _mm_unpackhi_epi8(mzero, dd);
                d0 = _mm_add_epi16(_mm_mulhi_epu16(s0, dalp),
                    _mm_mulhi_epu16(d0, ralp));
                d1 = _mm_add_epi16(_mm_mulhi_epu16(s1, dalp),
                    _mm_mulhi_epu16(d1, ralp));
                d0 = _mm_srli_epi16(d0, 7);
                d1 = _mm_srli_epi16(d1, 7);
                dr = _mm_packus_epi16(d0, d1);
                dr = _mm_or_si128(_mm_and_si128(emask, dr),
                    _mm_andnot_si128(emask, dd));
                _mm_storeu_si128((__m128i *)(tdp + i), dr);
            }
        } else if (blend == MLIB_BLEND_GTK_SRC) {
            mlib_u8 *buffi = (mlib_u8 *)buffz + 1;

            for (i = 0; i < width; i++) {
                tdp[0] = buffi[0];
                tdp[1] = buffi[2];
                tdp[2] = buffi[4];
                tdp[alp_ind] = 255;
                tdp += 4;
                buffi += 8;
            }
        } else {
            mlib_d64 _w0 = param->alpha;
            mlib_d64 _w1s = 1.0 - _w0 * (1.0 / 255);
            __m128i buff[1];
            __m128i done;
            __m128i dalp, ralp, ss, dd, s0, s1, d0, d1, a0, a1, r0, r1, rr, dr;
            __m128i wi, aa, amask;
            __m128 af, w0, w1, w1s, w, rw, w0r, w1r, scale;

            done = _mm_set1_epi16(1 << 15);
            amask = _mm_set1_epi32(mask255);
            w0 = _mm_set_ps1(_w0);
            w1s = _mm_set_ps1(_w1s);
            scale = _mm_set_ps1(1 << 15);

            if (alp_ind == -1) {
                tdp--;
                for (i = 0; i < width / 4; i++) {
                    BLEND34_SRC_OVER(0);
                    _mm_storeu_si128((__m128i *)tdp, dr);
                    tdp += 16;
                }
                if (width & 3) {
                    BLEND34_SRC_OVER(0);
                    buff[0] = dr;
                }
            } else {
                for (i = 0; i < width / 4; i++) {
                    BLEND34_SRC_OVER(3);
                    _mm_storeu_si128((__m128i *)tdp, dr);
                    tdp += 16;
                }
                if (width & 3) {
                    BLEND34_SRC_OVER(3);
                    buff[0] = dr;
                }
            }
            for (i = 0; i < (width & 3); i++) {
                ((mlib_s32 *)tdp)[i] = ((mlib_s32 *)buff)[i];
            }
        }
    } else if (chan_d == 3) {
        if (blend != MLIB_BLEND_GTK_SRC) {
            if (alp_ind == -1) {
                tdp--;
            }
            for (i = 0; i < width; i++) {
                ((mlib_s32 *)buffd)[i] = *(mlib_s32 *)(tdp + 3 * i);
            }
            if (alp_ind == -1) {
                for (i = 0; i < width2; i++) {
                    __m128i a0, s0, d0, dd;
                    BLEND43_SRC_OVER(0);
                }
                mlib_s_ImageChannelExtract_U8_43R_D1((void *)buffd, dp, width);
            } else {
                for (i = 0; i < width2; i++) {
                    __m128i a0, s0, d0, dd;
                    BLEND43_SRC_OVER(0xff);
                }
                mlib_s_ImageChannelExtract_U8_43L_D1((void *)buffd, dp, width);
            }
        } else {
            mlib_u8 *buffi = (mlib_u8 *)buffz + 1;

            if (alp_ind == -1)
                buffi += 2;
            for (i = 0; i < width; i++) {
                tdp[0] = buffi[0];
                tdp[1] = buffi[2];
                tdp[2] = buffi[4];
                tdp += 3;
                buffi += 8;
            }
        }
    } else { /* if (chan_d == 4) */
        if (alp_ind == -1) {
            tdp--;
        }
        if (blend == MLIB_BLEND_GTK_SRC) {
            mlib_u8 *p_alp = (mlib_u8 *)buffz + 1;
            mlib_s32 tail = ((mlib_s32 *)tdp)[width];

            if (alp_ind != -1)
                p_alp += 6;
            for (i = 0; i < width2; i++) {
                __m128i a0, a1, aa, ss, d0, dd;
                ss = buffz[i];
                a0 = _mm_loadl_epi64((void *)((mlib_d64 *)
                    mlib_m_tbl_255DivAlpha + p_alp[0]));
                a1 = _mm_loadl_epi64((void *)((mlib_d64 *)
                    mlib_m_tbl_255DivAlpha + p_alp[8]));
                aa = _mm_unpacklo_epi64(a0, a1);
                aa = _mm_or_si128(amask256, _mm_andnot_si128(amaskffff, aa));
                d0 = _mm_mulhi_epu16(ss, aa);
                dd = _mm_packus_epi16(d0, d0);
                _mm_storel_epi64((void *)(tdp + 8 * i), dd);
                p_alp += 16;
            }
            ((mlib_s32 *)tdp)[width] = tail;
        } else {
            mlib_blend blend = param->blend;
            mlib_d64 alp = (param->alpha) * (1.0 / 255);
            __m128i buff[1];
            __m128i done;
            __m128i ss, dd, s0, s1, d0, d1, a0, a1, r0, r1, rr, dr;
            __m128i wi, aa, amask, a16mask, zero_mask_i;
            __m128 dalp, div255, alpha, fone;
            __m128 af, sf, w0, w1, w1s, w, rw, w0r, w1r, scale;
            __m128 zero_mask, f_rnd;
            mlib_m128 s0u, s1u, s2u, s3u;

            done = _mm_set1_epi16(1 << 14);
            amask = _mm_set1_epi32(mask255);
            a16mask = _mm_set1_epi32(0xFFFF);
            dalp = _mm_set_ps1(alp * (1.0 / 256));
            fone = _mm_set_ps1(1.0);
            div255 = _mm_set_ps1(1.0 / 255);
            scale = _mm_set_ps1(1 << 8);
            alpha = _mm_set_ps1((float)(param->alpha) + 0.5);
            f_rnd = _mm_set_ps1(0.6);

            if (blend == MLIB_BLEND_GTK_SRC_OVER2) {
                if (alp_ind == -1) {
                    for (i = 0; i < width / 4; i++) {
                        BLEND44(SRC_OVER2, 0);
                        _mm_storeu_si128((__m128i *)tdp, dr);
                        tdp += 16;
                    }
                    if (width & 3) {
                        BLEND44(SRC_OVER2, 0);
                        buff[0] = dr;
                    }
                } else {
                    for (i = 0; i < width / 4; i++) {
                        BLEND44(SRC_OVER2, 3);
                        _mm_storeu_si128((__m128i *)tdp, dr);
                        tdp += 16;
                    }
                    if (width & 3) {
                        BLEND44(SRC_OVER2, 3);
                        buff[0] = dr;
                    }
                }
            } else {
                if (alp_ind == -1) {
                    for (i = 0; i < width / 4; i++) {
                        BLEND44(SRC_OVER, 0);
                        _mm_storeu_si128((__m128i *)tdp, dr);
                        tdp += 16;
                    }
                    if (width & 3) {
                        BLEND44(SRC_OVER, 0);
                        buff[0] = dr;
                    }
                } else {
                    for (i = 0; i < width / 4; i++) {
                        BLEND44(SRC_OVER, 3);
                        _mm_storeu_si128((__m128i *)tdp, dr);
                        tdp += 16;
                    }
                    if (width & 3) {
                        BLEND44(SRC_OVER, 3);
                        buff[0] = dr;
                    }
                }
            }
            for (i = 0; i < (width & 3); i++) {
                ((mlib_s32 *)tdp)[i] = ((mlib_s32 *)buff)[i];
            }
        }
    }
}
int main(int argc, char *argv[]) {
  struct px *ff_r, *ff_w;
  struct hdr *hdr_r = NULL, *hdr_w = NULL;
  unsigned int jobs = 1;
  int ch;

  while ((ch = getopt(argc, argv, "j:h")) != -1) {
    switch (ch) {
    case 'j':
      errno = 0;
      if ((jobs = strtoul(optarg, NULL, 0)) == 0) {
        if (errno != 0)
          err(EXIT_FAILURE, "strtoul");
        errx(EXIT_FAILURE, "invalid jobs");
      }
      break;
    case 'h':
    default:
      usage();
    }
  }
  argc -= optind;
  argv += optind;

  setshmff(&hdr_r, &ff_r, 0);
  setshmff(&hdr_w, &ff_w, 1);
  memmove(hdr_w, hdr_r, sizeof *hdr_r);

  size_t px_n = hdr_r->width * hdr_r->height;
  size_t off = 0;
  int child = fork_jobs(jobs, &off, &px_n);

#ifdef SSE
  /* Lanes holding red, green and blue get UINT16_MAX; the alpha lanes get
   * zero, so the saturating subtract inverts the colors and zeroes alpha. */
  __m128i op = _mm_set_epi16(0, UINT16_MAX, UINT16_MAX, UINT16_MAX,
                             0, UINT16_MAX, UINT16_MAX, UINT16_MAX);
  __m128i al = _mm_set_epi16(UINT16_MAX, 0, 0, 0, UINT16_MAX, 0, 0, 0);
  size_t p_len = px_n - off;
  /* number of full 16-byte vectors covering the 8-byte pixels */
  size_t i = (sizeof(*ff_r) * (p_len)) / sizeof(op);
  for (; i > 0; i--) {
    __m128i P = _mm_loadu_si128((__m128i *)&ff_r[off]);
    __m128i C = _mm_subs_epu16(op, P); /* invert r, g, b; alpha -> 0 */
    C = _mm_adds_epu16(C, al);         /* force alpha lanes to UINT16_MAX */
    _mm_storeu_si128((__m128i *)&ff_w[off], C);
    off += 2; /* two pixels per vector */
  }
#endif
  for (size_t p = off; p < px_n; p++) {
    /* invert colors */
    ff_w[p].red = UINT16_MAX - ff_r[p].red;
    ff_w[p].green = UINT16_MAX - ff_r[p].green;
    ff_w[p].blue = UINT16_MAX - ff_r[p].blue;
    ff_w[p].alpha = ff_r[p].alpha;
  }

  return catch_jobs(jobs, child);
}
void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
                                             const uint8_t *pred8, int width,
                                             int height, const uint8_t *ref8,
                                             const int ref_stride) {
  const __m128i one = _mm_set1_epi16(1);
  const int stride = ref_stride << 3;
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);

  if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
        __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
        __m128i t0, t1, t2, t3;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t2 = _mm_unpacklo_epi16(s4, s5);
        t3 = _mm_unpacklo_epi16(s6, s7);
        t0 = _mm_unpacklo_epi32(t0, t1);
        t2 = _mm_unpacklo_epi32(t2, t3);
        t0 = _mm_unpacklo_epi64(t0, t2);

        p0 = _mm_adds_epu16(t0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);

        _mm_storeu_si128((__m128i *)(comp_pred), p0);
        comp_pred += 8;
        pred += 8;
        ref += 8 * 8;
      }
      ref += stride - (width << 3);
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
        __m128i t0, t1;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t0 = _mm_unpacklo_epi32(t0, t1);

        p0 = _mm_adds_epu16(t0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);

        _mm_storel_epi64((__m128i *)(comp_pred), p0);
        comp_pred += 4;
        pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width << 3);
    }
  }
}