Example #1
static void MultRow(uint8_t* const ptr, const uint8_t* const alpha,
                    int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 8;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound = _mm_set1_epi16(1 << 7);
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i v1 = _mm_unpacklo_epi8(v0, zero);
      const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]);
      const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero);
      const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0);
      const __m128i v2 = _mm_mulhi_epu16(v1, alpha2);
      const __m128i v3 = _mm_mullo_epi16(v1, alpha1);
      const __m128i v4 = _mm_adds_epu16(v2, v3);
      const __m128i v5 = _mm_adds_epu16(v4, kRound);
      const __m128i v6 = _mm_srli_epi16(v5, 8);
      const __m128i v7 = _mm_packus_epi16(v6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], v7);
    }
  }
  width -= x;
  if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse);
}
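A scalar sketch of one lane of the loop above (my own illustration, not from the source): the mulhi/mullo pair approximates (v * a) / 255 with adds and shifts only.

static uint8_t MultOne(uint8_t v, uint8_t a) {
  const uint32_t prod = (uint32_t)v * a;      /* exact, at most 255 * 255 */
  const uint32_t hi = (prod * 257u) >> 16;    /* == _mm_mulhi_epu16(v, a * 0x0101) */
  return (uint8_t)((prod + hi + 128u) >> 8);  /* round, then divide by 256 */
}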
Example #2
static void sum_16(const uint8_t *a, const uint8_t *b, __m128i *sum_0,
                   __m128i *sum_1) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i a_u8 = _mm_loadu_si128((const __m128i *)a);
  const __m128i b_u8 = _mm_loadu_si128((const __m128i *)b);

  const __m128i a_0_u16 = _mm_cvtepu8_epi16(a_u8);
  const __m128i a_1_u16 = _mm_unpackhi_epi8(a_u8, zero);
  const __m128i b_0_u16 = _mm_cvtepu8_epi16(b_u8);
  const __m128i b_1_u16 = _mm_unpackhi_epi8(b_u8, zero);

  const __m128i diff_0_s16 = _mm_sub_epi16(a_0_u16, b_0_u16);
  const __m128i diff_1_s16 = _mm_sub_epi16(a_1_u16, b_1_u16);
  const __m128i diff_sq_0_u16 = _mm_mullo_epi16(diff_0_s16, diff_0_s16);
  const __m128i diff_sq_1_u16 = _mm_mullo_epi16(diff_1_s16, diff_1_s16);

  __m128i shift_left = _mm_slli_si128(diff_sq_0_u16, 2);
  // Use _mm_alignr_epi8() to "shift in" diff_sq_u16[8].
  __m128i shift_right = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 2);

  __m128i sum_u16 = _mm_adds_epu16(diff_sq_0_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

  *sum_0 = sum_u16;

  shift_left = _mm_alignr_epi8(diff_sq_1_u16, diff_sq_0_u16, 14);
  shift_right = _mm_srli_si128(diff_sq_1_u16, 2);

  sum_u16 = _mm_adds_epu16(diff_sq_1_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

  *sum_1 = sum_u16;
}
Example #3
unsigned int vp9_avg_8x8_sse2(const uint8_t *s, int p) {
  __m128i s0, s1, u0;
  unsigned int avg = 0;
  u0  = _mm_setzero_si128();
  s0 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s)), u0);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 2 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 3 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 4 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 5 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 6 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);
  s1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(s + 7 * p)), u0);
  s0 = _mm_adds_epu16(s0, s1);

  s0 = _mm_adds_epu16(s0, _mm_srli_si128(s0, 8));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 32));
  s0 = _mm_adds_epu16(s0, _mm_srli_epi64(s0, 16));
  avg = _mm_extract_epi16(s0, 0);
  return (avg + 32) >> 6;
}
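A scalar reference for the routine above (a sketch for clarity, not part of libvpx): sum the 8x8 block and round to the nearest average.

static unsigned int avg_8x8_c(const uint8_t *s, int p) {
  unsigned int sum = 0;
  int r, c;
  for (r = 0; r < 8; ++r)
    for (c = 0; c < 8; ++c) sum += s[r * p + c];
  return (sum + 32) >> 6;  /* 64 pixels: add half the divisor before shifting */
}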
Example #4
// Load values from 'a' and 'b'. Compute the difference squared and sum
// neighboring values such that:
// sum[1] = (a[0]-b[0])^2 + (a[1]-b[1])^2 + (a[2]-b[2])^2
// Values to the left and right of the row are set to 0.
// The values are returned in 'sum' as *unsigned* 16 bit values.
static void sum_8(const uint8_t *a, const uint8_t *b, __m128i *sum) {
  const __m128i a_u8 = _mm_loadl_epi64((const __m128i *)a);
  const __m128i b_u8 = _mm_loadl_epi64((const __m128i *)b);

  const __m128i a_u16 = _mm_cvtepu8_epi16(a_u8);
  const __m128i b_u16 = _mm_cvtepu8_epi16(b_u8);

  const __m128i diff_s16 = _mm_sub_epi16(a_u16, b_u16);
  const __m128i diff_sq_u16 = _mm_mullo_epi16(diff_s16, diff_s16);

  // Shift all the values one place to the left/right so we can efficiently sum
  // diff_sq_u16[i - 1] + diff_sq_u16[i] + diff_sq_u16[i + 1].
  const __m128i shift_left = _mm_slli_si128(diff_sq_u16, 2);
  const __m128i shift_right = _mm_srli_si128(diff_sq_u16, 2);

  // It becomes necessary to treat the values as unsigned at this point. The
  // 255^2 fits in uint16_t but not int16_t. Use saturating adds from this point
  // forward since the filter is only applied to smooth small pixel changes.
  // Once the value has saturated to uint16_t it is well outside the useful
  // range.
  __m128i sum_u16 = _mm_adds_epu16(diff_sq_u16, shift_left);
  sum_u16 = _mm_adds_epu16(sum_u16, shift_right);

  *sum = sum_u16;
}
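A scalar equivalent of sum_8 (a sketch, names are mine): each output lane is the squared difference at i plus its immediate neighbors, with out-of-range neighbors treated as 0 and the total clamped like the saturating adds.

static void sum_8_c(const uint8_t *a, const uint8_t *b, uint16_t sum[8]) {
  uint32_t sq[8];
  int i;
  for (i = 0; i < 8; ++i) {
    const int d = a[i] - b[i];
    sq[i] = (uint32_t)(d * d);
  }
  for (i = 0; i < 8; ++i) {
    uint32_t s = sq[i];
    if (i > 0) s += sq[i - 1];
    if (i < 7) s += sq[i + 1];
    sum[i] = s > 65535 ? 65535 : (uint16_t)s;
  }
}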
Example #5
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) {
  int x = 0;
  if (!inverse) {
    const int kSpan = 2;
    const __m128i zero = _mm_setzero_si128();
    const __m128i kRound =
        _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7);
    const __m128i kMult =
        _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101);
    const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0);
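    // Note: kMult is 0 in the alpha lanes, so scale0 zeroes them; kOne64 puts
    // 1 << 8 there instead, so the alpha byte passes through unscaled.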
    const int w2 = width & ~(kSpan - 1);
    for (x = 0; x < w2; x += kSpan) {
      const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]);
      const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero);
      const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3));
      const __m128i tmp2 = _mm_srli_epi64(tmp1, 16);
      const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult);
      const __m128i scale1 = _mm_or_si128(tmp2, kOne64);
      const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0);
      const __m128i argb3 = _mm_mullo_epi16(argb1, scale1);
      const __m128i argb4 = _mm_adds_epu16(argb2, argb3);
      const __m128i argb5 = _mm_adds_epu16(argb4, kRound);
      const __m128i argb6 = _mm_srli_epi16(argb5, 8);
      const __m128i argb7 = _mm_packus_epi16(argb6, zero);
      _mm_storel_epi64((__m128i*)&ptr[x], argb7);
    }
  }
  width -= x;
  if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse);
}
Example #6
//! \brief
//! Divide eight 16-bit unsigned ints by 255:
//! x := ((x + 1) + (x >> 8)) >> 8.
//! See: http://www.alfredklomp.com/programming/sse-intrinsics/
//!
inline __m128i
_mm_div255_epu16(__m128i x)
{
    return _mm_srli_epi16(_mm_adds_epu16(
        _mm_adds_epu16(x, _mm_set1_epi16(1)),
        _mm_srli_epi16(x, 8)), 8);
}
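A minimal exhaustive check of that identity (my sketch, assuming a hosted C environment): the formula matches x / 255 exactly for every x up to 65534; at x == 65535 the saturating adds clamp instead of wrapping.

#include <assert.h>
#include <stdint.h>

void check_div255(void)
{
    uint32_t x;
    for (x = 0; x <= 65534; ++x)
        assert((((x + 1) + (x >> 8)) >> 8) == x / 255);
}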
Example #7
static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16,
                       const __m128i mul_constants_0,
                       const __m128i mul_constants_1, const int strength,
                       const int rounding, const int weight) {
  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
  const __m128i weight_u16 = _mm_set1_epi16(weight);
  const __m128i sixteen = _mm_set1_epi16(16);
  __m128i input_0, input_1;

  input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0);
  input_0 = _mm_adds_epu16(input_0, rounding_u16);

  input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1);
  input_1 = _mm_adds_epu16(input_1, rounding_u16);

  input_0 = _mm_srl_epi16(input_0, strength_u128);
  input_1 = _mm_srl_epi16(input_1, strength_u128);

  input_0 = _mm_min_epu16(input_0, sixteen);
  input_1 = _mm_min_epu16(input_1, sixteen);
  input_0 = _mm_sub_epi16(sixteen, input_0);
  input_1 = _mm_sub_epi16(sixteen, input_1);

  *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16);
  *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16);
}
Example #8
static void GF_FUNC_ALIGN VS_CC
proc_9_10_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *d, const uint8_t *s, int th)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *orig = p0, *end = p2;

    line_copy16(p0, srcp + stride, width, 1);
    line_copy16(p1, srcp, width, 1);

    int16_t threshold = (int16_t)th;

    __m128i zero = _mm_setzero_si128();
    __m128i xth  = _mm_set1_epi16(threshold);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 1 ? 1 : -1);
        line_copy16(p2, srcp, width, 1);
        uint16_t *coordinates[] = COORDINATES;

        for (int x = 0; x < width; x += 8) {
            __m128i sum = zero;
            for (int i = 0; i < 8; i++) {
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(coordinates[i] + x));
                sum = _mm_adds_epu16(sum, xmm0);
            }
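            // Average of the 8 neighbours; the max/min pair below then clamps
            // the result to [src, src + th].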
            sum = _mm_srai_epi16(sum, 3);

            __m128i src = _mm_load_si128((__m128i *)(p1 + x));
            __m128i limit = _mm_adds_epu16(src, xth);
            
            sum = MM_MAX_EPU16(sum, src);
            sum = MM_MIN_EPU16(sum, limit);
            
            _mm_store_si128((__m128i *)(dstp + x), sum);
        }
        
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = (p2 == end) ? orig : p2 + bstride;
    }
}
Example #9
// Add 'sum_u16' to 'count'. Multiply by 'pred' and add to 'accumulator.'
static void accumulate_and_store_8(const __m128i sum_u16, const uint8_t *pred,
                                   uint16_t *count, uint32_t *accumulator) {
  const __m128i pred_u8 = _mm_loadl_epi64((const __m128i *)pred);
  const __m128i zero = _mm_setzero_si128();
  __m128i count_u16 = _mm_loadu_si128((const __m128i *)count);
  __m128i pred_u16 = _mm_cvtepu8_epi16(pred_u8);
  __m128i pred_0_u32, pred_1_u32;
  __m128i accum_0_u32, accum_1_u32;

  count_u16 = _mm_adds_epu16(count_u16, sum_u16);
  _mm_storeu_si128((__m128i *)count, count_u16);

  pred_u16 = _mm_mullo_epi16(sum_u16, pred_u16);

  pred_0_u32 = _mm_cvtepu16_epi32(pred_u16);
  pred_1_u32 = _mm_unpackhi_epi16(pred_u16, zero);

  accum_0_u32 = _mm_loadu_si128((const __m128i *)accumulator);
  accum_1_u32 = _mm_loadu_si128((const __m128i *)(accumulator + 4));

  accum_0_u32 = _mm_add_epi32(pred_0_u32, accum_0_u32);
  accum_1_u32 = _mm_add_epi32(pred_1_u32, accum_1_u32);

  _mm_storeu_si128((__m128i *)accumulator, accum_0_u32);
  _mm_storeu_si128((__m128i *)(accumulator + 4), accum_1_u32);
}
Example #10
static inline __m128i
max_epu16(__m128i a, __m128i b)
{
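  /* SSE2 has no unsigned 16-bit max. Emulate it with saturating ops:
     subs gives max(a - b, 0), and b + max(a - b, 0) == max(a, b). */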
  a = _mm_subs_epu16 (a, b);
  b = _mm_adds_epu16 (b, a);
  return b;
}
Example #11
__m128i test_mm_adds_epu16(__m128i A, __m128i B) {
  // DAG-LABEL: test_mm_adds_epu16
  // DAG: call <8 x i16> @llvm.x86.sse2.paddus.w
  //
  // ASM-LABEL: test_mm_adds_epu16
  // ASM: paddusw
  return _mm_adds_epu16(A, B);
}
Example #12
    SIMDValue SIMDUint16x8Operation::OpAddSaturate(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        x86Result.m128i_value = _mm_adds_epu16(tmpaValue.m128i_value, tmpbValue.m128i_value); // a + b saturated

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
Example #13
void blend_sse2(const Uint8* alpha, const Uint32 size, const Uint8* source0,
	const Uint8* source1, Uint8* dest)
{
	__m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10;
	Uint32 i;

	for (i = 0; i < (size / 4); i++)
	{
		t0 = _mm_load_si128((__m128i*)&source0[i * 16]);
		t1 = _mm_load_si128((__m128i*)&source1[i * 16]);
		t2 = _mm_castps_si128(_mm_load_ss((const float*)&alpha[i * 4]));

		t2 = _mm_unpacklo_epi8(t2, t2);
		t2 = _mm_unpacklo_epi16(t2, t2);

		t3 = _mm_unpacklo_epi8(t0, t0);
		t4 = _mm_unpacklo_epi8(t1, t1);

		t5 = _mm_unpacklo_epi32(t2, t2);
		t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);

		t7 = _mm_mulhi_epu16(t3, t6);
		t8 = _mm_mulhi_epu16(t4, t5);

		t9 = _mm_adds_epu16(t7, t8);
		t9 = _mm_srli_epi16(t9, 8);

		t3 = _mm_unpackhi_epi8(t0, t0);
		t4 = _mm_unpackhi_epi8(t1, t1);

		t5 = _mm_unpackhi_epi32(t2, t2);
		t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5);

		t7 = _mm_mulhi_epu16(t3, t6);
		t8 = _mm_mulhi_epu16(t4, t5);

		t10 = _mm_adds_epu16(t7, t8);
		t10 = _mm_srli_epi16(t10, 8);

		t10 = _mm_packus_epi16(t9, t10);

		_mm_stream_si128((__m128i*)&dest[i * 16], t10);
	}
}
Example #14
__m64 _m_paddusw(__m64 _MM1, __m64 _MM2)
{
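    /* Emulates the MMX paddusw intrinsic: widen both 64-bit operands into
       the low half of SSE2 registers, add with unsigned saturation there,
       and return the low 64 bits. */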
    __m128i lhs = {0}, rhs = {0};
    lhs.m128i_i64[0] = _MM1.m64_i64;

    rhs.m128i_i64[0] = _MM2.m64_i64;

    lhs = _mm_adds_epu16(lhs, rhs);

    _MM1.m64_i64 = lhs.m128i_i64[0];
    return _MM1;
}
Example #15
// These constants are 14b fixed-point version of ITU-R BT.601 constants.
// R = (19077 * y             + 26149 * v - 14234) >> 6
// G = (19077 * y -  6419 * u - 13320 * v +  8708) >> 6
// B = (19077 * y + 33050 * u             - 17685) >> 6
static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0,
                                     const __m128i* const U0,
                                     const __m128i* const V0,
                                     __m128i* const R,
                                     __m128i* const G,
                                     __m128i* const B) {
  const __m128i k19077 = _mm_set1_epi16(19077);
  const __m128i k26149 = _mm_set1_epi16(26149);
  const __m128i k14234 = _mm_set1_epi16(14234);
  // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic
  const __m128i k33050 = _mm_set1_epi16((short)33050);
  const __m128i k17685 = _mm_set1_epi16(17685);
  const __m128i k6419  = _mm_set1_epi16(6419);
  const __m128i k13320 = _mm_set1_epi16(13320);
  const __m128i k8708  = _mm_set1_epi16(8708);

  const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077);

  const __m128i R0 = _mm_mulhi_epu16(*V0, k26149);
  const __m128i R1 = _mm_sub_epi16(Y1, k14234);
  const __m128i R2 = _mm_add_epi16(R1, R0);

  const __m128i G0 = _mm_mulhi_epu16(*U0, k6419);
  const __m128i G1 = _mm_mulhi_epu16(*V0, k13320);
  const __m128i G2 = _mm_add_epi16(Y1, k8708);
  const __m128i G3 = _mm_add_epi16(G0, G1);
  const __m128i G4 = _mm_sub_epi16(G2, G3);

  // be careful with the saturated *unsigned* arithmetic here!
  const __m128i B0 = _mm_mulhi_epu16(*U0, k33050);
  const __m128i B1 = _mm_adds_epu16(B0, Y1);
  const __m128i B2 = _mm_subs_epu16(B1, k17685);

  // use logical shift for B2, which can be larger than 32767
  *R = _mm_srai_epi16(R2, 6);   // range: [-14234, 30815]
  *G = _mm_srai_epi16(G4, 6);   // range: [-10953, 27710]
  *B = _mm_srli_epi16(B2, 6);   // range: [0, 34238]
}
Example #16
// Average the value based on the number of values summed (9 for pixels away
// from the border, 4 for pixels in corners, and 6 for other edge values).
//
// Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply
// by weight.
static __m128i average_8(__m128i sum, const __m128i mul_constants,
                         const int strength, const int rounding,
                         const int weight) {
  // _mm_srl_epi16 uses the lower 64 bit value for the shift.
  const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength);
  const __m128i rounding_u16 = _mm_set1_epi16(rounding);
  const __m128i weight_u16 = _mm_set1_epi16(weight);
  const __m128i sixteen = _mm_set1_epi16(16);

  // modifier * 3 / index;
  sum = _mm_mulhi_epu16(sum, mul_constants);

  sum = _mm_adds_epu16(sum, rounding_u16);
  sum = _mm_srl_epi16(sum, strength_u128);

  // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4
  // >> 16 (i.e. NEIGHBOR_CONSTANT_4 - 1), which is 49151 / 0xbfff, or -16385
  // when read as signed. So this needs the epu16 (unsigned) version, which
  // was not available until SSE4.1.
  sum = _mm_min_epu16(sum, sixteen);

  sum = _mm_sub_epi16(sixteen, sum);

  return _mm_mullo_epi16(sum, weight_u16);
}
Example #17
void vp9_temporal_filter_apply_sse4_1(const uint8_t *a, unsigned int stride,
                                      const uint8_t *b, unsigned int width,
                                      unsigned int height, int strength,
                                      int weight, uint32_t *accumulator,
                                      uint16_t *count) {
  unsigned int h;
  const int rounding = strength > 0 ? 1 << (strength - 1) : 0;

  assert(strength >= 0);
  assert(strength <= 6);

  assert(weight >= 0);
  assert(weight <= 2);

  assert(width == 8 || width == 16);

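  // The first row, middle rows, and last row use different mul_constants
  // because border pixels sum fewer neighbors (4 or 6 instead of 9); see
  // average_8 above.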
  if (width == 8) {
    __m128i sum_row_a, sum_row_b, sum_row_c;
    __m128i mul_constants = _mm_setr_epi16(
        NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
        NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

    sum_8(a, b, &sum_row_a);
    sum_8(a + stride, b + width, &sum_row_b);
    sum_row_c = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_c = average_8(sum_row_c, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_c, b, count, accumulator);

    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;

    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                   NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);

    for (h = 0; h < height - 2; ++h) {
      sum_8(a, b + width, &sum_row_c);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
      sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_c);
      sum_row_a =
          average_8(sum_row_a, mul_constants, strength, rounding, weight);
      accumulate_and_store_8(sum_row_a, b, count, accumulator);

      a += stride;
      b += width;
      count += width;
      accumulator += width;

      sum_row_a = sum_row_b;
      sum_row_b = sum_row_c;
    }

    mul_constants = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                   NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_a = _mm_adds_epu16(sum_row_a, sum_row_b);
    sum_row_a = average_8(sum_row_a, mul_constants, strength, rounding, weight);
    accumulate_and_store_8(sum_row_a, b, count, accumulator);

  } else {  // width == 16
    __m128i sum_row_a_0, sum_row_a_1;
    __m128i sum_row_b_0, sum_row_b_1;
    __m128i sum_row_c_0, sum_row_c_1;
    __m128i mul_constants_0 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6),
            mul_constants_1 = _mm_setr_epi16(
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);

    sum_16(a, b, &sum_row_a_0, &sum_row_a_1);
    sum_16(a + stride, b + width, &sum_row_b_0, &sum_row_b_1);

    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);

    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);

    a += stride + stride;
    b += width;
    count += width;
    accumulator += width;

    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_9,
                                     NEIGHBOR_CONSTANT_9, NEIGHBOR_CONSTANT_6);
    for (h = 0; h < height - 2; ++h) {
      sum_16(a, b + width, &sum_row_c_0, &sum_row_c_1);

      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
      sum_row_a_0 = _mm_adds_epu16(sum_row_a_0, sum_row_c_0);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);
      sum_row_a_1 = _mm_adds_epu16(sum_row_a_1, sum_row_c_1);

      average_16(&sum_row_a_0, &sum_row_a_1, mul_constants_0, mul_constants_1,
                 strength, rounding, weight);
      accumulate_and_store_16(sum_row_a_0, sum_row_a_1, b, count, accumulator);

      a += stride;
      b += width;
      count += width;
      accumulator += width;

      sum_row_a_0 = sum_row_b_0;
      sum_row_a_1 = sum_row_b_1;
      sum_row_b_0 = sum_row_c_0;
      sum_row_b_1 = sum_row_c_1;
    }

    mul_constants_0 = _mm_setr_epi16(NEIGHBOR_CONSTANT_4, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6);
    mul_constants_1 = _mm_setr_epi16(NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_6,
                                     NEIGHBOR_CONSTANT_6, NEIGHBOR_CONSTANT_4);
    sum_row_c_0 = _mm_adds_epu16(sum_row_a_0, sum_row_b_0);
    sum_row_c_1 = _mm_adds_epu16(sum_row_a_1, sum_row_b_1);

    average_16(&sum_row_c_0, &sum_row_c_1, mul_constants_0, mul_constants_1,
               strength, rounding, weight);
    accumulate_and_store_16(sum_row_c_0, sum_row_c_1, b, count, accumulator);
  }
}
Example #18
int
global_sse2_word(int                  queryLength,
                 unsigned short      *profile,
                 const unsigned char *dbSeq,
                 int                  dbLength,
                 unsigned short       gapOpen,
                 unsigned short       gapExtend,
                 unsigned short       ceiling,
                 struct f_struct     *f_str)
{
  int     i, j;

  int     score;
  int     scale;
  int     temp;
  int     distance;

  int     offset;
  int     position;

  int     cmp;
  int     iter;
    
  __m128i *pvH;
  __m128i *pvE;

  __m128i vE, vF, vH;
  __m128i vHNext;
  __m128i vFPrev;

  __m128i vGapOpen;
  __m128i vGapExtend;
  __m128i vCeiling;

  __m128i vScale;
  __m128i vScaleAmt;
  __m128i vScaleTmp;

  __m128i vTemp;
  __m128i vNull;

  __m128i *pvScore;

  scale = 0;
  iter = (queryLength + 7) / 8;
  offset = (queryLength - 1) % iter;
  position = 7 - (queryLength - 1) / iter;

  pvH = (__m128i *)f_str->workspace;
  pvE = pvH + iter;

  /* Load gap opening penalty to all elements of a constant */
  vGapOpen = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);
  vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
  vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);

  /* Load gap extension penalty to all elements of a constant */
  vGapExtend = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);
  vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
  vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);

  /* Generate the ceiling before scaling */
  vTemp = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vTemp = _mm_insert_epi16 (vTemp, ceiling, 0);
  vTemp = _mm_shufflelo_epi16 (vTemp, 0);
  vTemp = _mm_shuffle_epi32 (vTemp, 0);
  vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp);
  vCeiling = _mm_srli_epi16 (vCeiling, 1);
  vCeiling = _mm_subs_epi16 (vCeiling, vTemp);
  vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen);

  vNull = _mm_cmpeq_epi16 (vTemp, vTemp);
  vNull = _mm_slli_epi16 (vNull, 15);
  vScaleAmt = _mm_xor_si128 (vNull, vNull);

  /* Zero out the storage vector */
  vTemp = _mm_adds_epi16 (vNull, vGapOpen);
  for (i = 0; i < iter; i++) {
    _mm_store_si128 (pvH + i, vTemp);
    _mm_store_si128 (pvE + i, vNull);
  }

  /* initialize F */
  vF = vNull;
  vFPrev = vNull;

  /* load and scale H for the next round */
  vTemp = _mm_srli_si128 (vGapOpen, 14);
  vH = _mm_load_si128 (pvH + iter - 1);
  vH = _mm_adds_epi16 (vH, vTemp);

  for (i = 0; i < dbLength; ++i) {
    /* fetch first data asap. */
    pvScore = (__m128i *) profile + dbSeq[i] * iter;

    vF = vNull;

    vH = _mm_max_epi16 (vH, vFPrev);
    for (j = 0; j < iter; j++) {
      /* correct H with the previous column's F */
      vHNext = _mm_load_si128 (pvH + j);
      vHNext = _mm_max_epi16 (vHNext, vFPrev);

      /* load and correct E value */
      vE = _mm_load_si128 (pvE + j);
      vTemp = _mm_subs_epi16 (vHNext, vGapOpen);
      vE = _mm_max_epi16 (vE, vTemp);
      _mm_store_si128 (pvE + j, vE);

      /* add score to vH */
      vH = _mm_adds_epi16 (vH, *pvScore++);

      /* get max from vH, vE and vF */
      vH = _mm_max_epi16 (vH, vE);
      vH = _mm_max_epi16 (vH, vF);
      _mm_store_si128 (pvH + j, vH);

      /* update vF value */
      vH = _mm_subs_epi16 (vH, vGapOpen);
      vF = _mm_max_epi16 (vF, vH);

      /* load the next h values */
      vH = vHNext;
    }

    /* check if we need to scale before the next round */
    vTemp = _mm_cmpgt_epi16 (vF, vCeiling);
    cmp  = _mm_movemask_epi8 (vTemp);

    /* broadcast F values */
    vF = _mm_xor_si128 (vF, vNull);

    vTemp  = _mm_slli_si128 (vF, 2);
    vTemp = _mm_subs_epu16 (vTemp, vScaleAmt);
    vF = max_epu16 (vF, vTemp);

    vTemp  = _mm_slli_si128 (vF, 4);
    vScaleTmp = _mm_slli_si128 (vScaleAmt, 2);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);

    vTemp = _mm_slli_si128 (vScaleTmp, 4);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp);
    vTemp  = _mm_slli_si128 (vF, 8);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);

    /* scale if necessary */
    if (cmp != 0x0000) {
      __m128i vScale1;
      __m128i vScale2;

      vScale = _mm_slli_si128 (vF, 2);
      vScale = _mm_subs_epu16 (vScale, vGapOpen);
      vScale = _mm_subs_epu16 (vScale, vScaleAmt);

      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vScale, vTemp);
      vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp);
      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vTemp, vScale);
      vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp);

      /* rescale the previous F */
      vF = _mm_subs_epu16 (vF, vScale);

      /* check if we can continue in signed 16-bits */
      vTemp = _mm_xor_si128 (vF, vNull);
      vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling);
      cmp  = _mm_movemask_epi8 (vTemp);
      if (cmp != 0x0000) {
        return OVERFLOW_SCORE;
      }

      vTemp   = _mm_adds_epi16 (vCeiling, vCeiling);
      vScale1 = _mm_subs_epu16 (vScale, vTemp);
      vScale2 = _mm_subs_epu16 (vScale, vScale1);

      /* scale all the vectors */
      for (j = 0; j < iter; j++) {
        /* load H and E */
        vH = _mm_load_si128 (pvH + j);
        vE = _mm_load_si128 (pvE + j);

        /* get max from vH, vE and vF */
        vH = _mm_subs_epi16 (vH, vScale1);
        vH = _mm_subs_epi16 (vH, vScale2);
        vE = _mm_subs_epi16 (vE, vScale1);
        vE = _mm_subs_epi16 (vE, vScale2);

        /* save the H and E */
        _mm_store_si128 (pvH + j, vH);
        _mm_store_si128 (pvE + j, vE);
      }

      vScale = vScaleAmt;
      for (j = 0; j < position; ++j) {
        vScale = _mm_slli_si128 (vScale, 2);
      }

      /* calculate the final scaling amount */
      vTemp   = _mm_xor_si128 (vTemp, vTemp);
      vScale1 = _mm_unpacklo_epi16 (vScale, vTemp);
      vScale2 = _mm_unpackhi_epi16 (vScale, vTemp);
      vScale  = _mm_add_epi32 (vScale1, vScale2);
      vTemp = _mm_srli_si128 (vScale, 8);
      vScale = _mm_add_epi32 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 4);
      vScale = _mm_add_epi32 (vScale, vTemp);
      scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0);
      temp  = (int) (unsigned short) _mm_extract_epi16 (vScale, 1);
      scale = scale + (temp << 16);
    }

    /* scale the F value for the next round */
    vFPrev = _mm_slli_si128 (vF, 2);
    vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt);
    vFPrev = _mm_xor_si128 (vFPrev, vNull);

    /* load and scale H for the next round */
    vH = _mm_load_si128 (pvH + iter - 1);
    vH = _mm_xor_si128 (vH, vNull);
    vH = _mm_slli_si128 (vH, 2);
    vH = _mm_subs_epu16 (vH, vScaleAmt);
    vH = _mm_insert_epi16 (vH, gapOpen, 0);
    vH = _mm_xor_si128 (vH, vNull);
  }

  vH = _mm_load_si128 (pvH + offset);
  vH = _mm_max_epi16 (vH, vFPrev);
  for (j = 0; j < position; ++j) {
    vH = _mm_slli_si128 (vH, 2);
  }
  score = (int) (signed short) _mm_extract_epi16 (vH, 7);
  score = score + SHORT_BIAS;

  /* return largest score */
  distance = (queryLength + dbLength) * gapExtend;
  score = score - (gapOpen * 2) - distance + scale;

  return score;
}
Example #19
void av1_highbd_jnt_convolve_2d_copy_sse4_1(
    const uint16_t *src, int src_stride, uint16_t *dst0, int dst_stride0, int w,
    int h, const InterpFilterParams *filter_params_x,
    const InterpFilterParams *filter_params_y, const int subpel_x_q4,
    const int subpel_y_q4, ConvolveParams *conv_params, int bd) {
  CONV_BUF_TYPE *dst = conv_params->dst;
  int dst_stride = conv_params->dst_stride;
  (void)filter_params_x;
  (void)filter_params_y;
  (void)subpel_x_q4;
  (void)subpel_y_q4;

  const int bits =
      FILTER_BITS * 2 - conv_params->round_1 - conv_params->round_0;
  const __m128i left_shift = _mm_cvtsi32_si128(bits);
  const int do_average = conv_params->do_average;
  const int use_jnt_comp_avg = conv_params->use_jnt_comp_avg;
  const int w0 = conv_params->fwd_offset;
  const int w1 = conv_params->bck_offset;
  const __m128i wt0 = _mm_set1_epi32(w0);
  const __m128i wt1 = _mm_set1_epi32(w1);
  const __m128i zero = _mm_setzero_si128();
  int i, j;

  const int offset_0 =
      bd + 2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const int offset = (1 << offset_0) + (1 << (offset_0 - 1));
  const __m128i offset_const = _mm_set1_epi32(offset);
  const __m128i offset_const_16b = _mm_set1_epi16(offset);
  const int rounding_shift =
      2 * FILTER_BITS - conv_params->round_0 - conv_params->round_1;
  const __m128i rounding_const = _mm_set1_epi32((1 << rounding_shift) >> 1);
  const __m128i clip_pixel_to_bd =
      _mm_set1_epi16(bd == 10 ? 1023 : (bd == 12 ? 4095 : 255));

  assert(bits <= 4);

  if (!(w % 8)) {
    for (i = 0; i < h; i += 1) {
      for (j = 0; j < w; j += 8) {
        const __m128i src_16bit =
            _mm_loadu_si128((__m128i *)(&src[i * src_stride + j]));
        const __m128i res = _mm_sll_epi16(src_16bit, left_shift);
        if (do_average) {
          const __m128i data_0 =
              _mm_loadu_si128((__m128i *)(&dst[i * dst_stride + j]));

          const __m128i data_ref_0_lo = _mm_unpacklo_epi16(data_0, zero);
          const __m128i data_ref_0_hi = _mm_unpackhi_epi16(data_0, zero);

          const __m128i res_32b_lo = _mm_unpacklo_epi16(res, zero);
          const __m128i res_unsigned_lo =
              _mm_add_epi32(res_32b_lo, offset_const);

          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
              &data_ref_0_lo, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);

          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
          const __m128i res_unsigned_hi =
              _mm_add_epi32(res_32b_hi, offset_const);

          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
              &data_ref_0_hi, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_16b =
              _mm_packus_epi32(round_result_lo, round_result_hi);
          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

          _mm_store_si128((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
        } else {
          const __m128i res_unsigned_16b =
              _mm_adds_epu16(res, offset_const_16b);

          _mm_store_si128((__m128i *)(&dst[i * dst_stride + j]),
                          res_unsigned_16b);
        }
      }
    }
  } else if (!(w % 4)) {
    for (i = 0; i < h; i += 2) {
      for (j = 0; j < w; j += 4) {
        const __m128i src_row_0 =
            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j]));
        const __m128i src_row_1 =
            _mm_loadl_epi64((__m128i *)(&src[i * src_stride + j + src_stride]));
        const __m128i src_10 = _mm_unpacklo_epi64(src_row_0, src_row_1);

        const __m128i res = _mm_sll_epi16(src_10, left_shift);

        if (do_average) {
          const __m128i data_0 =
              _mm_loadl_epi64((__m128i *)(&dst[i * dst_stride + j]));
          const __m128i data_1 = _mm_loadl_epi64(
              (__m128i *)(&dst[i * dst_stride + j + dst_stride]));

          const __m128i data_ref_0 = _mm_unpacklo_epi16(data_0, zero);
          const __m128i data_ref_1 = _mm_unpacklo_epi16(data_1, zero);

          const __m128i res_32b = _mm_unpacklo_epi16(res, zero);
          const __m128i res_unsigned_lo = _mm_add_epi32(res_32b, offset_const);

          const __m128i res_32b_hi = _mm_unpackhi_epi16(res, zero);
          const __m128i res_unsigned_hi =
              _mm_add_epi32(res_32b_hi, offset_const);

          const __m128i comp_avg_res_lo = highbd_comp_avg_sse4_1(
              &data_ref_0, &res_unsigned_lo, &wt0, &wt1, use_jnt_comp_avg);
          const __m128i comp_avg_res_hi = highbd_comp_avg_sse4_1(
              &data_ref_1, &res_unsigned_hi, &wt0, &wt1, use_jnt_comp_avg);

          const __m128i round_result_lo = highbd_convolve_rounding_sse2(
              &comp_avg_res_lo, &offset_const, &rounding_const, rounding_shift);
          const __m128i round_result_hi = highbd_convolve_rounding_sse2(
              &comp_avg_res_hi, &offset_const, &rounding_const, rounding_shift);

          const __m128i res_16b =
              _mm_packus_epi32(round_result_lo, round_result_hi);
          const __m128i res_clip = _mm_min_epi16(res_16b, clip_pixel_to_bd);

          const __m128i res_1 = _mm_srli_si128(res_clip, 8);

          _mm_storel_epi64((__m128i *)(&dst0[i * dst_stride0 + j]), res_clip);
          _mm_storel_epi64(
              (__m128i *)(&dst0[i * dst_stride0 + j + dst_stride0]), res_1);
        } else {
          const __m128i res_unsigned_16b =
              _mm_adds_epu16(res, offset_const_16b);

          const __m128i res_1 = _mm_srli_si128(res_unsigned_16b, 8);

          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j]),
                           res_unsigned_16b);
          _mm_storel_epi64((__m128i *)(&dst[i * dst_stride + j + dst_stride]),
                           res_1);
        }
      }
    }
  }
}
Example #20
static FORCE_INLINE __m128i mm_adds_epu(const __m128i &a, const __m128i &b) {
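    // sizeof(PixelType) is a compile-time constant, so this branch is folded
    // away and only the matching saturating add remains.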
    if (sizeof(PixelType) == 1)
        return _mm_adds_epu8(a, b);
    else
        return _mm_adds_epu16(a, b);
}
Example #21
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(uint8_t *buff, int bstride, int width, int height, int stride,
               uint8_t *dstp, const uint8_t *srcp, edge_t *eh,
               uint16_t plane_max)
{
    uint8_t* p0 = buff + 16;
    uint8_t* p1 = p0 + bstride;
    uint8_t* p2 = p1 + bstride;
    uint8_t* p3 = p2 + bstride;
    uint8_t* p4 = p3 + bstride;
    uint8_t* orig = p0;
    uint8_t* end = p4;

    line_copy8(p0, srcp + 2 * stride, width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    uint8_t th_min = eh->min > 0xFF ? 0xFF : (uint8_t)eh->min;
    uint8_t th_max = eh->max > 0xFF ? 0xFF : (uint8_t)eh->max;

    __m128i zero = _mm_setzero_si128();
    __m128i ab = _mm_set1_epi16(15);
    __m128i max = _mm_set1_epi8((int8_t)th_max);
    __m128i min = _mm_set1_epi8((int8_t)th_min);

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);
        uint8_t* posh[] = {p2 - 2, p2 - 1, p2 + 1, p2 + 2};
        uint8_t* posv[] = {p0, p1, p3, p4};

        for (int x = 0; x < width; x += 16) {
            __m128i sumx[2] = {zero, zero};
            __m128i sumy[2] = {zero, zero};

            for (int i = 0; i < 4; i++) {
                __m128i xmm0, xmm1, xmul;
                xmul = _mm_load_si128((__m128i *)ar_mulx[i]);
                xmm0 = _mm_loadu_si128((__m128i *)(posh[i] + x));
                xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                sumx[0] = _mm_add_epi16(sumx[0], _mm_mullo_epi16(xmm0, xmul));
                sumx[1] = _mm_add_epi16(sumx[1], _mm_mullo_epi16(xmm1, xmul));

                xmul = _mm_load_si128((__m128i *)ar_muly[i]);
                xmm0 = _mm_load_si128((__m128i *)(posv[i] + x));
                xmm1 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                sumy[0] = _mm_add_epi16(sumy[0], _mm_mullo_epi16(xmm0, xmul));
                sumy[1] = _mm_add_epi16(sumy[1], _mm_mullo_epi16(xmm1, xmul));
            }

            for (int i = 0; i < 2; i++) {
                __m128i xmax, xmin, mull, mulh;
                sumx[i] = mm_abs_epi16(sumx[i]);
                sumy[i] = mm_abs_epi16(sumy[i]);
                xmax = _mm_max_epi16(sumx[i], sumy[i]);
                xmin = _mm_min_epi16(sumx[i], sumy[i]);

                mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmax, zero)), 4);
                mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmax, zero)), 4);
                xmax = mm_cast_epi32(mull, mulh);

                mull = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpacklo_epi16(xmin, zero)), 5);
                mulh = _mm_srli_epi32(_mm_madd_epi16(ab, _mm_unpackhi_epi16(xmin, zero)), 5);
                xmin = mm_cast_epi32(mull, mulh);

                sumx[i] = _mm_adds_epu16(xmax, xmin);
                sumx[i] = _mm_srli_epi16(sumx[i], eh->rshift);
            }

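            // Hysteresis-style thresholding: lanes >= th_max are forced to
            // 255, lanes <= th_min are zeroed, everything else passes through.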
            __m128i out = _mm_packus_epi16(sumx[0], sumx[1]);
            __m128i temp = _mm_min_epu8(out, max);
            temp = _mm_cmpeq_epi8(temp, max);
            out = _mm_or_si128(temp, out);

            temp = _mm_max_epu8(out, min);
            temp = _mm_cmpeq_epi8(temp, min);
            out = _mm_andnot_si128(temp, out);

            _mm_store_si128((__m128i*)(dstp + x), out);
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
Example #22
void
mlib_s_ImageBlendLine(
    mlib_work_image * param,
    mlib_u8 *dp,
    __m128i * buffz,
    __m128i * buffd)
{
	mlib_blend blend = param->blend;
	mlib_s32 chan_d = param->chan_d;
	mlib_s32 chan_s = param->channels;
	mlib_d64 alp = (param->alpha) * (1.0 / 255);
	mlib_s32 width = GetElemSubStruct(current, width);
	mlib_u8 *tdp = dp;
	mlib_s32 width2, y_step, next_step = 2;
	mlib_s32 alp_ind = param->alp_ind, mask255;
	__m128i aa, dalp, done;
	__m128i mzero, mask_7fff, mask_8000, amask, amask256, amaskffff;
	__m128i d_rnd;
	mlib_s32 i, j;

	if (!alp_ind) {
		d_rnd = _mm_set1_epi16(0x0080);

		tdp = (void *)dp;
		if (chan_d == 3)
			tdp = (void *)buffd;

		for (i = 0; i < width / 2; i++) {
			__m128i dd;

			dd = buffz[i];
			dd = _mm_adds_epu16(dd, d_rnd);
			dd = _mm_srli_epi16(dd, 8);
			dd = _mm_packus_epi16(dd, dd);
			_mm_storel_epi64((void *)(tdp + 8 * i), dd);
		}
		if (width & 1) {
			__m128i dd;

			dd = buffz[i];
			dd = _mm_adds_epu16(dd, d_rnd);
			dd = _mm_srli_epi16(dd, 8);
			dd = _mm_packus_epi16(dd, dd);
			*(mlib_s32 *)(tdp + 8 * i) = *(mlib_s32 *)&dd;
		}

		if (chan_d == 3) {
			mlib_s_ImageChannelExtract_U8_43L_D1((void *)buffd, dp,
			    width);
		}
		return;
	}

	width2 = (width + 1) / 2;

	mzero = _mm_setzero_si128();
	mask_7fff = _mm_set1_epi16(0x7FFF);
	mask_8000 = _mm_set1_epi16(0x8000);
	done = _mm_set1_epi16(1 << 15);
	if (alp_ind == -1) {
		mask255 = 0xFF;
		amask = _mm_setr_epi32(0xff00, 0, 0xff00, 0);
		amaskffff = _mm_setr_epi32(0xffff, 0, 0xffff, 0);
		amask256 = _mm_setr_epi32(0x0100, 0, 0x0100, 0);
	} else {
		mask255 = 0xFF000000;
		amask = _mm_setr_epi32(0, 0xff000000, 0, 0xff000000);
		amaskffff = _mm_setr_epi32(0, 0xffff0000, 0, 0xffff0000);
		amask256 = _mm_setr_epi32(0, 0x01000000, 0, 0x01000000);
	}
	dalp = _mm_set1_epi16((1 << 15) * alp + 0.5);

	if (chan_s == 3) {
		if (chan_d == 3) {
			mlib_d64 alp = (param->alpha) * (1.0 / 255);
			mlib_s32 ialp;
			mlib_u8 *pz;
			__m128i emask;
			__m128i dalp, ralp, ss, dd, s0, s1, d0, d1, dr;

			mlib_s_ImageChannelExtract_S16_43L_D1((void *)buffz,
			    (void *)buffd, width);

			ialp = alp * (1 << 15);
			dalp = _mm_set1_epi16(ialp);
			ralp = _mm_set1_epi16((1 << 15) - ialp);
			emask = mlib_emask_m128i[(3 * width) & 15].m128i;

			pz = (void *)buffd;
			tdp = dp;
			for (i = 0; i <= 3 * width - 16; i += 16) {
				s0 = _mm_load_si128((__m128i *) (pz + 2 * i));
				s1 = _mm_load_si128((__m128i *) (pz + 2 * i +
				    16));
				dd = _mm_loadu_si128((__m128i *) (tdp + i));
				d0 = _mm_unpacklo_epi8(mzero, dd);
				d1 = _mm_unpackhi_epi8(mzero, dd);
				d0 = _mm_add_epi16(_mm_mulhi_epu16(s0, dalp),
				    _mm_mulhi_epu16(d0, ralp));
				d1 = _mm_add_epi16(_mm_mulhi_epu16(s1, dalp),
				    _mm_mulhi_epu16(d1, ralp));
				d0 = _mm_srli_epi16(d0, 7);
				d1 = _mm_srli_epi16(d1, 7);
				dr = _mm_packus_epi16(d0, d1);
				_mm_storeu_si128((__m128i *) (tdp + i), dr);
			}

			if (i < 3 * width) {
				s0 = _mm_load_si128((__m128i *) (pz + 2 * i));
				s1 = _mm_load_si128((__m128i *) (pz + 2 * i +
				    16));
				dd = _mm_loadu_si128((__m128i *) (tdp + i));
				d0 = _mm_unpacklo_epi8(mzero, dd);
				d1 = _mm_unpackhi_epi8(mzero, dd);
				d0 = _mm_add_epi16(_mm_mulhi_epu16(s0, dalp),
				    _mm_mulhi_epu16(d0, ralp));
				d1 = _mm_add_epi16(_mm_mulhi_epu16(s1, dalp),
				    _mm_mulhi_epu16(d1, ralp));
				d0 = _mm_srli_epi16(d0, 7);
				d1 = _mm_srli_epi16(d1, 7);
				dr = _mm_packus_epi16(d0, d1);

				dr = _mm_or_si128(_mm_and_si128(emask, dr),
				    _mm_andnot_si128(emask, dd));

				_mm_storeu_si128((__m128i *) (tdp + i), dr);
			}
		} else if (blend == MLIB_BLEND_GTK_SRC) {
			mlib_u8 *buffi = (mlib_u8 *)buffz + 1;

			for (i = 0; i < width; i++) {
				tdp[0] = buffi[0];
				tdp[1] = buffi[2];
				tdp[2] = buffi[4];
				tdp[alp_ind] = 255;
				tdp += 4;
				buffi += 8;
			}
		} else {
			mlib_d64 _w0 = param->alpha;
			mlib_d64 _w1s = 1.0 - _w0 * (1.0 / 255);
			__m128i buff[1];
			__m128i done;
			__m128i dalp, ralp, ss, dd, s0, s1, d0, d1, a0, a1, r0,
			    r1, rr, dr;
			__m128i wi, aa, amask;
			__m128 af, w0, w1, w1s, w, rw, w0r, w1r, scale;

			done = _mm_set1_epi16(1 << 15);
			amask = _mm_set1_epi32(mask255);

			w0 = _mm_set_ps1(_w0);
			w1s = _mm_set_ps1(_w1s);
			scale = _mm_set_ps1(1 << 15);

			if (alp_ind == -1) {
				tdp--;
				for (i = 0; i < width / 4; i++) {
					BLEND34_SRC_OVER(0);
					_mm_storeu_si128((__m128i *) tdp, dr);
					tdp += 16;
				}
				if (width & 3) {
					BLEND34_SRC_OVER(0);
					buff[0] = dr;
				}
			} else {
				for (i = 0; i < width / 4; i++) {
					BLEND34_SRC_OVER(3);
					_mm_storeu_si128((__m128i *) tdp, dr);
					tdp += 16;
				}
				if (width & 3) {
					BLEND34_SRC_OVER(3);
					buff[0] = dr;
				}
			}
			for (i = 0; i < (width & 3); i++) {
				((mlib_s32 *)tdp)[i] = ((mlib_s32 *)buff)[i];
			}
		}
	} else if (chan_d == 3) {
		if (blend != MLIB_BLEND_GTK_SRC) {
			if (alp_ind == -1) {
				tdp--;
			}
			for (i = 0; i < width; i++) {
				((mlib_s32 *)buffd)[i] =
				    *(mlib_s32 *)(tdp + 3 * i);
			}

			if (alp_ind == -1) {
				for (i = 0; i < width2; i++) {
					__m128i a0, s0, d0, dd;

					BLEND43_SRC_OVER(0);
				}
				mlib_s_ImageChannelExtract_U8_43R_D1((void *)
				    buffd, dp, width);
			} else {
				for (i = 0; i < width2; i++) {
					__m128i a0, s0, d0, dd;

					BLEND43_SRC_OVER(0xff);
				}
				mlib_s_ImageChannelExtract_U8_43L_D1((void *)
				    buffd, dp, width);
			}
		} else {
			mlib_u8 *buffi = (mlib_u8 *)buffz + 1;

			if (alp_ind == -1)
				buffi += 2;
			for (i = 0; i < width; i++) {
				tdp[0] = buffi[0];
				tdp[1] = buffi[2];
				tdp[2] = buffi[4];
				tdp += 3;
				buffi += 8;
			}
		}
	} else {	/* if (chan_d == 4) */

		if (alp_ind == -1) {
			tdp--;
		}
		if (blend == MLIB_BLEND_GTK_SRC) {
			mlib_u8 *p_alp = (mlib_u8 *)buffz + 1;
			mlib_s32 tail = ((mlib_s32 *)tdp)[width];

			if (alp_ind != -1)
				p_alp += 6;
			for (i = 0; i < width2; i++) {
				__m128i a0, a1, aa, ss, d0, dd;

				ss = buffz[i];
				a0 = _mm_loadl_epi64((void *)((mlib_d64 *)
				    mlib_m_tbl_255DivAlpha + p_alp[0]));
				a1 = _mm_loadl_epi64((void *)((mlib_d64 *)
				    mlib_m_tbl_255DivAlpha + p_alp[8]));
				aa = _mm_unpacklo_epi64(a0, a1);
				aa = _mm_or_si128(amask256,
				    _mm_andnot_si128(amaskffff, aa));
				d0 = _mm_mulhi_epu16(ss, aa);
				dd = _mm_packus_epi16(d0, d0);
				_mm_storel_epi64((void *)(tdp + 8 * i), dd);
				p_alp += 16;
			}

			((mlib_s32 *)tdp)[width] = tail;
		} else {
			mlib_blend blend = param->blend;
			mlib_d64 alp = (param->alpha) * (1.0 / 255);
			__m128i buff[1];
			__m128i done;
			__m128i ss, dd, s0, s1, d0, d1, a0, a1, r0, r1, rr, dr;
			__m128i wi, aa, amask, a16mask, zero_mask_i;
			__m128 dalp, div255, alpha, fone;
			__m128 af, sf, w0, w1, w1s, w, rw, w0r, w1r, scale;
			__m128 zero_mask, f_rnd;
			mlib_m128 s0u, s1u, s2u, s3u;

			done = _mm_set1_epi16(1 << 14);
			amask = _mm_set1_epi32(mask255);
			a16mask = _mm_set1_epi32(0xFFFF);

			dalp = _mm_set_ps1(alp * (1.0 / 256));
			fone = _mm_set_ps1(1.0);
			div255 = _mm_set_ps1(1.0 / 255);
			scale = _mm_set_ps1(1 << 8);
			alpha = _mm_set_ps1((float)(param->alpha) + 0.5);
			f_rnd = _mm_set_ps1(0.6);

			if (blend == MLIB_BLEND_GTK_SRC_OVER2) {
				if (alp_ind == -1) {
					for (i = 0; i < width / 4; i++) {
						BLEND44(SRC_OVER2, 0);
						_mm_storeu_si128((__m128i *)
						    tdp, dr);
						tdp += 16;
					}
					if (width & 3) {
						BLEND44(SRC_OVER2, 0);
						buff[0] = dr;
					}
				} else {
					for (i = 0; i < width / 4; i++) {
						BLEND44(SRC_OVER2, 3);
						_mm_storeu_si128((__m128i *)
						    tdp, dr);
						tdp += 16;
					}
					if (width & 3) {
						BLEND44(SRC_OVER2, 3);
						buff[0] = dr;
					}
				}
			} else {
				if (alp_ind == -1) {
					for (i = 0; i < width / 4; i++) {
						BLEND44(SRC_OVER, 0);
						_mm_storeu_si128((__m128i *)
						    tdp, dr);
						tdp += 16;
					}
					if (width & 3) {
						BLEND44(SRC_OVER, 0);
						buff[0] = dr;
					}
				} else {
					for (i = 0; i < width / 4; i++) {
						BLEND44(SRC_OVER, 3);
						_mm_storeu_si128((__m128i *)
						    tdp, dr);
						tdp += 16;
					}
					if (width & 3) {
						BLEND44(SRC_OVER, 3);
						buff[0] = dr;
					}
				}
			}

			for (i = 0; i < (width & 3); i++) {
				((mlib_s32 *)tdp)[i] = ((mlib_s32 *)buff)[i];
			}
		}
	}
}
Example #23
int
main(int argc, char *argv[])
{
	struct px *ff_r, *ff_w;
	struct hdr *hdr_r = NULL, *hdr_w = NULL;
	unsigned int jobs = 1;
	int ch;

	while ((ch = getopt(argc, argv, "j:h")) != -1) {
		switch (ch) {
		case 'j':
			errno = 0;
			if ((jobs = strtoul(optarg, NULL, 0)) == 0) {
				if (errno != 0)
					err(EXIT_FAILURE, "strtoul");
				errx(EXIT_FAILURE, "invalid jobs");
			}
			break;
		case 'h':
		default:
			usage();
		}
	}
	argc -= optind;
	argv += optind;

	setshmff(&hdr_r, &ff_r, 0);
	setshmff(&hdr_w, &ff_w, 1);
	memmove(hdr_w, hdr_r, sizeof *hdr_r);

	size_t px_n = hdr_r->width * hdr_r->height;
	size_t off = 0;

	int child = fork_jobs(jobs, &off, &px_n);

#ifdef SSE
	__m128i op = _mm_set_epi16(0, UINT16_MAX, UINT16_MAX, UINT16_MAX,
	                           0, UINT16_MAX, UINT16_MAX, UINT16_MAX);

	__m128i al = _mm_set_epi16(UINT16_MAX, 0, 0, 0, UINT16_MAX, 0, 0, 0);
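	/* Colour lanes compute 0xFFFF - value (the subtraction cannot
	 * underflow there). In the alpha lanes op is 0, so subs_epu16 clamps
	 * to 0 and adding al forces alpha to UINT16_MAX; note the scalar tail
	 * below preserves the source alpha instead. */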

	size_t p_len = px_n - off;
	size_t i = (sizeof(*ff_r) * (p_len)) / sizeof(op);

	for (; i > 0; i--) {
		__m128i P = _mm_loadu_si128((__m128i *)&ff_r[off]);
		__m128i C = _mm_subs_epu16(op, P);
		C = _mm_adds_epu16(C, al);
		_mm_storeu_si128((__m128i *)&ff_w[off], C);
		off += 2;
	}
#endif

	for (size_t p = off; p < px_n; p++) {
		/* invert colors */
		ff_w[p].red   = UINT16_MAX - ff_r[p].red;
		ff_w[p].green = UINT16_MAX - ff_r[p].green;
		ff_w[p].blue  = UINT16_MAX - ff_r[p].blue;
		ff_w[p].alpha = ff_r[p].alpha;
	}

	return catch_jobs(jobs, child);
}
Example #24
void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
                                             const uint8_t *pred8, int width,
                                             int height, const uint8_t *ref8,
                                             const int ref_stride) {
  const __m128i one = _mm_set1_epi16(1);
  const int stride = ref_stride << 3;
  int i, j;
  uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);

  if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i s4 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 32));
        __m128i s5 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 40));
        __m128i s6 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 48));
        __m128i s7 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 56));
        __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
        __m128i t0, t1, t2, t3;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t2 = _mm_unpacklo_epi16(s4, s5);
        t3 = _mm_unpacklo_epi16(s6, s7);
        t0 = _mm_unpacklo_epi32(t0, t1);
        t2 = _mm_unpacklo_epi32(t2, t3);
        t0 = _mm_unpacklo_epi64(t0, t2);

        p0 = _mm_adds_epu16(t0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);

        _mm_storeu_si128((__m128i *)(comp_pred), p0);
        comp_pred += 8;
        pred += 8;
        ref += 8 * 8;
      }
      ref += stride - (width << 3);
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        __m128i s0 = _mm_cvtsi32_si128(*(const uint32_t *)ref);
        __m128i s1 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 8));
        __m128i s2 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 16));
        __m128i s3 = _mm_cvtsi32_si128(*(const uint32_t *)(ref + 24));
        __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
        __m128i t0, t1;

        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t0 = _mm_unpacklo_epi32(t0, t1);

        p0 = _mm_adds_epu16(t0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);

        _mm_storel_epi64((__m128i *)(comp_pred), p0);
        comp_pred += 4;
        pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width << 3);
    }
  }
}