Example #1
static WEBP_INLINE uint32_t Average4(uint32_t a0, uint32_t a1,
                                     uint32_t a2, uint32_t a3) {
  const __m128i avg1 = Average2_128i(a0, a1);
  const __m128i avg2 = Average2_128i(a2, a3);
  const __m128i sum = _mm_add_epi16(avg2, avg1);
  const __m128i avg3 = _mm_srli_epi16(sum, 1);
  const __m128i A0 = _mm_packus_epi16(avg3, avg3);
  const uint32_t output = _mm_cvtsi128_si32(A0);
  return output;
}
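For reference, a scalar sketch of the per-channel arithmetic above, assuming Average2_128i returns the truncating per-channel average ((x + y) >> 1) of two packed 32-bit pixels widened to 16-bit lanes:

#include <stdint.h>

// Scalar sketch: average of two truncating averages, per 8-bit channel.
static uint32_t Average4_C(uint32_t a0, uint32_t a1, uint32_t a2, uint32_t a3) {
  uint32_t out = 0;
  for (int shift = 0; shift < 32; shift += 8) {
    const uint32_t avg01 = (((a0 >> shift) & 0xff) + ((a1 >> shift) & 0xff)) >> 1;
    const uint32_t avg23 = (((a2 >> shift) & 0xff) + ((a3 >> shift) & 0xff)) >> 1;
    out |= ((avg01 + avg23) >> 1) << shift;
  }
  return out;
}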
Example #2
static WEBP_INLINE uint32_t Average3(uint32_t a0, uint32_t a1, uint32_t a2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i avg1 = Average2_128i(a0, a2);
  const __m128i A1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(a1), zero);
  const __m128i sum = _mm_add_epi16(avg1, A1);
  const __m128i avg2 = _mm_srli_epi16(sum, 1);
  const __m128i A2 = _mm_packus_epi16(avg2, avg2);
  const uint32_t output = _mm_cvtsi128_si32(A2);
  return output;
}
Example #3
static INLINE unsigned int masked_sad8xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    const __m128i src = _mm_unpacklo_epi64(
        _mm_loadl_epi64((const __m128i *)src_ptr),
        _mm_loadl_epi64((const __m128i *)&src_ptr[src_stride]));
    const __m128i a0 = _mm_loadl_epi64((const __m128i *)a_ptr);
    const __m128i a1 = _mm_loadl_epi64((const __m128i *)&a_ptr[a_stride]);
    const __m128i b0 = _mm_loadl_epi64((const __m128i *)b_ptr);
    const __m128i b1 = _mm_loadl_epi64((const __m128i *)&b_ptr[b_stride]);
    const __m128i m =
        _mm_unpacklo_epi64(_mm_loadl_epi64((const __m128i *)m_ptr),
                           _mm_loadl_epi64((const __m128i *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data_l = _mm_unpacklo_epi8(a0, b0);
    const __m128i mask_l = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_l = _mm_maddubs_epi16(data_l, mask_l);
    pred_l = xx_roundn_epu16(pred_l, AOM_BLEND_A64_ROUND_BITS);

    const __m128i data_r = _mm_unpacklo_epi8(a1, b1);
    const __m128i mask_r = _mm_unpackhi_epi8(m, m_inv);
    __m128i pred_r = _mm_maddubs_epi16(data_r, mask_r);
    pred_r = xx_roundn_epu16(pred_r, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_l, pred_r);
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  int32_t sad =
      _mm_cvtsi128_si32(res) + _mm_cvtsi128_si32(_mm_srli_si128(res, 8));
  return (sad + 31) >> 6;
}
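A scalar sketch of the blend-then-SAD this routine vectorizes, assuming AOM_BLEND_A64_ROUND_BITS == 6 (mask values in [0, 64]); width is 8 for the function above, and the final scaling mirrors its return statement:

#include <stdint.h>
#include <stdlib.h>  // abs

static unsigned int masked_sad_c(const uint8_t *src, int src_stride,
                                 const uint8_t *a, int a_stride,
                                 const uint8_t *b, int b_stride,
                                 const uint8_t *m, int m_stride,
                                 int width, int height) {
  unsigned int sad = 0;
  for (int y = 0; y < height; ++y) {
    for (int x = 0; x < width; ++x) {
      // blend a and b under the 6-bit mask, rounding to nearest
      const int pred = (a[x] * m[x] + b[x] * (64 - m[x]) + 32) >> 6;
      sad += abs(pred - src[x]);
    }
    src += src_stride; a += a_stride; b += b_stride; m += m_stride;
  }
  return (sad + 31) >> 6;  // same final scaling as the SSSE3 version
}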
Example #4
void
png_read_filter_row_paeth3_sse(png_row_infop row_info, png_bytep row,
   png_const_bytep prev_row)
{
   png_size_t i;
   png_bytep rp = row;
   png_const_bytep prp = prev_row;
   __m128i npix = _mm_cvtsi32_si128(*(uint32_t*)rp);
   __m128i ppix = _mm_setzero_si128();           // Same as 'a' in C version.
   __m128i prppix = _mm_setzero_si128();         // Same as 'c' in C version.
   const __m128i zero = _mm_setzero_si128();

   for (i = 0; i < row_info->rowbytes; i += 3, rp += 3, prp += 3)
   {
      __m128i prpix = _mm_cvtsi32_si128(*(uint32_t*)prp);  // Same as 'b' in C ver.
      __m128i pix, pa, pb, pc, temp;

      prpix = _mm_unpacklo_epi8(prpix, zero);
      temp = _mm_sub_epi16(prpix, prppix);  // p = b - c
      pc = _mm_sub_epi16(ppix, prppix);     // pc = a - c

#ifndef __SSSE3__
      pa = _mm_max_epi16(temp, _mm_sub_epi16(prppix, prpix));
      pb = _mm_max_epi16(pc, _mm_sub_epi16(prppix, ppix));
      temp = _mm_add_epi16(temp, pc);
      pc = _mm_max_epi16(temp, _mm_sub_epi16(zero, temp));
#else
      pa = _mm_abs_epi16(temp);             // pa = abs(p)
      pb = _mm_abs_epi16(pc);               // pb = abs(pc)
      temp = _mm_add_epi16(temp, pc);
      pc = _mm_abs_epi16(temp);             // pc = abs(p + pc)
#endif

      temp = _mm_cmplt_epi16(pb, pa);       // if (pb < pa) pa = pb, a = b
      pa = _mm_andnot_si128(temp, pa);
      pa = _mm_or_si128(pa, _mm_and_si128(temp, pb));
      ppix = _mm_andnot_si128(temp, ppix);
      ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prpix));

      pix = npix;
      npix = _mm_cvtsi32_si128(*(uint32_t*)(rp + 3));
      temp = _mm_cmplt_epi16(pc, pa);       // if (pc < pa) a = c
      ppix = _mm_andnot_si128(temp, ppix);
      ppix = _mm_or_si128(ppix, _mm_and_si128(temp, prppix));

      pix = _mm_unpacklo_epi8(pix, zero);
      prppix = prpix;
      ppix = _mm_add_epi16(ppix, pix);

      ppix = _mm_slli_epi16(ppix, 8);
      ppix = _mm_srli_epi16(ppix, 8);
      pix = _mm_packus_epi16(ppix, zero);
      *(uint32_t*)rp = _mm_cvtsi128_si32(pix);
   }
}
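For reference, the scalar Paeth predictor this routine vectorizes, as defined in the PNG specification; the cmplt/andnot/or sequences above implement the same comparisons branchlessly:

#include <stdlib.h>  // abs

static unsigned char paeth_predictor(int a, int b, int c) {
  const int p = a + b - c;    // initial estimate
  const int pa = abs(p - a);  // distance to a (left)
  const int pb = abs(p - b);  // distance to b (above)
  const int pc = abs(p - c);  // distance to c (upper left)
  if (pa <= pb && pa <= pc) return (unsigned char)a;
  if (pb <= pc) return (unsigned char)b;
  return (unsigned char)c;
}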
Example #5
File: dec_sse2.c  Project: 8l/insieme
static void TransformAC3(const int16_t* in, uint8_t* dst) {
    static const int kC1 = 20091 + (1 << 16);
    static const int kC2 = 35468;
    const __m128i A = _mm_set1_epi16(in[0] + 4);
    const __m128i c4 = _mm_set1_epi16(MUL(in[4], kC2));
    const __m128i d4 = _mm_set1_epi16(MUL(in[4], kC1));
    const int c1 = MUL(in[1], kC2);
    const int d1 = MUL(in[1], kC1);
    const __m128i CD = _mm_set_epi16(0, 0, 0, 0, -d1, -c1, c1, d1);
    const __m128i B = _mm_adds_epi16(A, CD);
    const __m128i m0 = _mm_adds_epi16(B, d4);
    const __m128i m1 = _mm_adds_epi16(B, c4);
    const __m128i m2 = _mm_subs_epi16(B, c4);
    const __m128i m3 = _mm_subs_epi16(B, d4);
    const __m128i zero = _mm_setzero_si128();
    // Load the source pixels.
    __m128i dst0 = _mm_cvtsi32_si128(*(int*)(dst + 0 * BPS));
    __m128i dst1 = _mm_cvtsi32_si128(*(int*)(dst + 1 * BPS));
    __m128i dst2 = _mm_cvtsi32_si128(*(int*)(dst + 2 * BPS));
    __m128i dst3 = _mm_cvtsi32_si128(*(int*)(dst + 3 * BPS));
    // Convert to 16b.
    dst0 = _mm_unpacklo_epi8(dst0, zero);
    dst1 = _mm_unpacklo_epi8(dst1, zero);
    dst2 = _mm_unpacklo_epi8(dst2, zero);
    dst3 = _mm_unpacklo_epi8(dst3, zero);
    // Add the inverse transform.
    dst0 = _mm_adds_epi16(dst0, _mm_srai_epi16(m0, 3));
    dst1 = _mm_adds_epi16(dst1, _mm_srai_epi16(m1, 3));
    dst2 = _mm_adds_epi16(dst2, _mm_srai_epi16(m2, 3));
    dst3 = _mm_adds_epi16(dst3, _mm_srai_epi16(m3, 3));
    // Unsigned saturate to 8b.
    dst0 = _mm_packus_epi16(dst0, dst0);
    dst1 = _mm_packus_epi16(dst1, dst1);
    dst2 = _mm_packus_epi16(dst2, dst2);
    dst3 = _mm_packus_epi16(dst3, dst3);
    // Store the results.
    *(int*)(dst + 0 * BPS) = _mm_cvtsi128_si32(dst0);
    *(int*)(dst + 1 * BPS) = _mm_cvtsi128_si32(dst1);
    *(int*)(dst + 2 * BPS) = _mm_cvtsi128_si32(dst2);
    *(int*)(dst + 3 * BPS) = _mm_cvtsi128_si32(dst3);
}
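The MUL helper is not shown in this snippet; a sketch of the assumed definition (the usual 16.16 fixed-point multiply in WebP's inverse transform, keeping only the integer part):

#define MUL(a, b) (((a) * (b)) >> 16)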
Example #6
opus_val32 celt_inner_prod_sse2(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    opus_int  i, dataSize16;
    opus_int32 sum;

    __m128i inVec1_76543210, inVec1_FEDCBA98, acc1;
    __m128i inVec2_76543210, inVec2_FEDCBA98, acc2;

    sum = 0;
    dataSize16 = N & ~15;

    acc1 = _mm_setzero_si128();
    acc2 = _mm_setzero_si128();

    for (i=0;i<dataSize16;i+=16)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_FEDCBA98 = _mm_loadu_si128((__m128i *)(&x[i + 8]));
        inVec2_FEDCBA98 = _mm_loadu_si128((__m128i *)(&y[i + 8]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);
        inVec1_FEDCBA98 = _mm_madd_epi16(inVec1_FEDCBA98, inVec2_FEDCBA98);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        acc2 = _mm_add_epi32(acc2, inVec1_FEDCBA98);
    }

    acc1 = _mm_add_epi32( acc1, acc2 );

    if (N - i >= 8)
    {
        inVec1_76543210 = _mm_loadu_si128((__m128i *)(&x[i + 0]));
        inVec2_76543210 = _mm_loadu_si128((__m128i *)(&y[i + 0]));

        inVec1_76543210 = _mm_madd_epi16(inVec1_76543210, inVec2_76543210);

        acc1 = _mm_add_epi32(acc1, inVec1_76543210);
        i += 8;
    }

    acc1 = _mm_add_epi32(acc1, _mm_unpackhi_epi64( acc1, acc1));
    acc1 = _mm_add_epi32(acc1, _mm_shufflelo_epi16( acc1, 0x0E));
    sum += _mm_cvtsi128_si32(acc1);

    for (;i<N;i++) {
        sum = silk_SMLABB(sum, x[i], y[i]);
    }

    return sum;
}
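A scalar sketch of what the intrinsics compute, assuming the fixed-point build where opus_val16/opus_val32 are 16-/32-bit integers and silk_SMLABB(sum, x, y) accumulates the product of the bottom 16 bits:

opus_val32 celt_inner_prod_scalar(const opus_val16 *x, const opus_val16 *y,
      int N)
{
    opus_val32 sum = 0;
    for (int i = 0; i < N; i++)
        sum += (opus_val32)x[i] * y[i];
    return sum;
}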
Example #7
static WEBP_INLINE uint32_t ClampedAddSubtractFull(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i C2 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i V1 = _mm_add_epi16(C0, C1);
  const __m128i V2 = _mm_sub_epi16(V1, C2);
  const __m128i b = _mm_packus_epi16(V2, V2);
  const uint32_t output = _mm_cvtsi128_si32(b);
  return output;
}
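Per channel this computes c0 + c1 - c2 clamped to [0, 255], with _mm_packus_epi16 supplying the clamping; a scalar sketch:

#include <stdint.h>

static uint32_t ClampedAddSubtractFull_C(uint32_t c0, uint32_t c1, uint32_t c2) {
  uint32_t out = 0;
  for (int shift = 0; shift < 32; shift += 8) {
    int v = (int)((c0 >> shift) & 0xff) + (int)((c1 >> shift) & 0xff) -
            (int)((c2 >> shift) & 0xff);
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    out |= (uint32_t)v << shift;
  }
  return out;
}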
Example #8
File: main.cpp  Project: CCJY/coliru
void scanCharDataContentwithSTTNI(SAX2Processor* saxProcessor) { 
    unsigned int length = yylim - yycur; 
    unsigned char* data = (unsigned char*)yycur; 
    if( *data == '<' || *data == '&' || *data == ']') return; 
    unsigned int dataLen = 0; 
    // initialize the one-byte encoding rule and the nonCharData rule 
    const __m128i asciiCharData = _mm_set_epi8(0,0,0,0,0,0,0x7F,0x5E,0x5C,0x3D, 0x3B,0x27,0x25,0x20,0,0); 
    const __m128i nonCharData = _mm_set_epi8(0,0,0,0,0,0,0,0,0,0,0,0x5D,0x3C,0x26,0x0D,0x0A); 

    do { 
        // special new line processing for '\x0A', '\x0D'
        if( *data == '\x0A' ) { 
            saxProcessor->newLine((char*)data); 
            data++; length--; 
        } else if( *data == '\x0D' ) { 
            saxProcessor->newLine((char*)data); 
            if( *(data+1) == '\x0A' ) { 
                data += 2; length -= 2; yycur++; 
            } else { 
                *data = '\x0A'; data++; length--; 
            } 
        }

        while( length > 0 ) { 
            if( length >= 16 ) dataLen = 16; 
            else dataLen = length; 
            const __m128i mData = _mm_loadu_si128((__m128i*)data); 
            // locate the character data part with the nonCharData characters 
            int index = _mm_cmpestri(nonCharData, 5, mData, dataLen, _SIDD_CMP_EQUAL_ANY); 
            if( index == 0 ) break;
            if( index > dataLen ) index = dataLen; 
            bool shouldBreak = index < dataLen ? true : false; 
            // check the one byte encoding rule(ASCII) 
            unsigned int mask = _mm_cvtsi128_si32(_mm_cmpestrm(asciiCharData, 10, 
                mData, index, _SIDD_CMP_RANGES|_SIDD_MASKED_NEGATIVE_POLARITY)); 
            // if not all hit ASCII, continue to check other Unicode rules 
            if( mask == 0 || recogUnicodeRange(mData, index, ~mask)) { 
                data += index; 
                length -= index; 
                if( shouldBreak ) break;
            } else { 
                break;
            } 
        }

        unsigned int passLen = (char*)data - yycur; 
        if( passLen == 0 ) break; 
        // report Character Data to user 
        saxProcessor->reportCharDataContent(yycur, passLen); 
        yycur += passLen; 
        YYSWITCHBUFFER; 
    } while( length >= STTNISTRLENLIMIT && (*data == '\x0A' || *data == '\x0D') ); 
} 
Example #9
static void transpose4x4_to_dst(const uint8_t *src, ptrdiff_t src_stride,
                                uint8_t *dst, ptrdiff_t dst_stride) {
  __m128i A = _mm_cvtsi32_si128(*(const int *)src);
  __m128i B = _mm_cvtsi32_si128(*(const int *)(src + src_stride));
  __m128i C = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 2));
  __m128i D = _mm_cvtsi32_si128(*(const int *)(src + src_stride * 3));
  // 00 10 01 11 02 12 03 13
  const __m128i tr0_0 = _mm_unpacklo_epi8(A, B);
  // 20 30 21 31 22 32 23 33
  const __m128i tr0_1 = _mm_unpacklo_epi8(C, D);
  // 00 10 20 30 01 11 21 31 02 12 22 32 03 13 23 33
  A = _mm_unpacklo_epi16(tr0_0, tr0_1);
  B = _mm_srli_si128(A, 4);
  C = _mm_srli_si128(A, 8);
  D = _mm_srli_si128(A, 12);

  *(int *)(dst) = _mm_cvtsi128_si32(A);
  *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(B);
  *(int *)(dst + dst_stride * 2) = _mm_cvtsi128_si32(C);
  *(int *)(dst + dst_stride * 3) = _mm_cvtsi128_si32(D);
}
Example #10
static void write2pixelsAccum(__m128i *u, int bd, uint16_t *dst) {
  __m128i v = _mm_loadl_epi64((__m128i const *)dst);
  const __m128i ones = _mm_set1_epi16(1);

  highbdRndingPacks(u);
  highbd_clip(u, 1, bd);

  v = _mm_add_epi16(v, u[0]);
  v = _mm_add_epi16(v, ones);
  v = _mm_srai_epi16(v, 1);
  *(uint32_t *)dst = _mm_cvtsi128_si32(v);
}
Example #11
    static void ScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
                                        const uint8* u_buf,
                                        const uint8* v_buf,
                                        uint8* rgb_buf,
                                        int width,
                                        int source_dx) {
        __m128i xmm0, xmmY1, xmmY2;
        __m128  xmmY;
        uint8 u, v, y;
        int x = 0;

        while (width >= 2) {
            u = u_buf[x >> 17];
            v = v_buf[x >> 17];
            y = y_buf[x >> 16];
            x += source_dx;

            xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
                                  _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
            xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
            xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

            y = y_buf[x >> 16];
            x += source_dx;

            xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
            xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

            xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
                                  0x44);
            xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
            xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

            _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
            rgb_buf += 8;
            width -= 2;
        }

        if (width) {
            u = u_buf[x >> 17];
            v = v_buf[x >> 17];
            y = y_buf[x >> 16];

            xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
                                  _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
            xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
            xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
            xmmY1 = _mm_srai_epi16(xmmY1, 6);
            xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
            *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
        }
    }
Example #12
// Compute the sum of all pixel differences of this MB.
static INLINE int sum_diff_16x1(__m128i acc_diff) {
  const __m128i k_1 = _mm_set1_epi16(1);
  const __m128i acc_diff_lo =
      _mm_srai_epi16(_mm_unpacklo_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_hi =
      _mm_srai_epi16(_mm_unpackhi_epi8(acc_diff, acc_diff), 8);
  const __m128i acc_diff_16 = _mm_add_epi16(acc_diff_lo, acc_diff_hi);
  const __m128i hg_fe_dc_ba = _mm_madd_epi16(acc_diff_16, k_1);
  const __m128i hgfe_dcba =
      _mm_add_epi32(hg_fe_dc_ba, _mm_srli_si128(hg_fe_dc_ba, 8));
  const __m128i hgfedcba =
      _mm_add_epi32(hgfe_dcba, _mm_srli_si128(hgfe_dcba, 4));
  return _mm_cvtsi128_si32(hgfedcba);
}
Example #13
unsigned int aom_get_mb_ss_sse2(const int16_t *src) {
  __m128i vsum = _mm_setzero_si128();
  int i;

  for (i = 0; i < 32; ++i) {
    const __m128i v = _mm_loadu_si128((const __m128i *)src);
    vsum = _mm_add_epi32(vsum, _mm_madd_epi16(v, v));
    src += 8;
  }

  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 8));
  vsum = _mm_add_epi32(vsum, _mm_srli_si128(vsum, 4));
  return _mm_cvtsi128_si32(vsum);
}
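The 32 iterations of 8 lanes cover the 256 residuals of a 16x16 macroblock, so a scalar equivalent is simply:

#include <stdint.h>

unsigned int aom_get_mb_ss_scalar(const int16_t *src) {
  unsigned int sum = 0;
  for (int i = 0; i < 256; ++i) sum += (unsigned int)(src[i] * src[i]);
  return sum;
}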
Example #14
// pixelsNum 0: write all 4 pixels
//           1/2/3: residual pixels 1/2/3
static void writePixel(__m128i *u, int width, int pixelsNum, uint16_t *dst,
                       int dst_stride) {
  if (2 == width) {
    if (0 == pixelsNum) {
      *(int *)dst = _mm_cvtsi128_si32(u[0]);
      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
      *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
      *(int *)(dst + 3 * dst_stride) = _mm_cvtsi128_si32(u[3]);
    } else if (1 == pixelsNum) {
      *(int *)dst = _mm_cvtsi128_si32(u[0]);
    } else if (2 == pixelsNum) {
      *(int *)dst = _mm_cvtsi128_si32(u[0]);
      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
    } else if (3 == pixelsNum) {
      *(int *)dst = _mm_cvtsi128_si32(u[0]);
      *(int *)(dst + dst_stride) = _mm_cvtsi128_si32(u[1]);
      *(int *)(dst + 2 * dst_stride) = _mm_cvtsi128_si32(u[2]);
    }
  } else {
    if (0 == pixelsNum) {
      _mm_storel_epi64((__m128i *)dst, u[0]);
      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
      _mm_storel_epi64((__m128i *)(dst + 3 * dst_stride), u[3]);
    } else if (1 == pixelsNum) {
      _mm_storel_epi64((__m128i *)dst, u[0]);
    } else if (2 == pixelsNum) {
      _mm_storel_epi64((__m128i *)dst, u[0]);
      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
    } else if (3 == pixelsNum) {
      _mm_storel_epi64((__m128i *)dst, u[0]);
      _mm_storel_epi64((__m128i *)(dst + dst_stride), u[1]);
      _mm_storel_epi64((__m128i *)(dst + 2 * dst_stride), u[2]);
    }
  }
}
Example #15
static INLINE int variance_final_from_32bit_sum_avx2(__m256i vsse, __m128i vsum,
                                                     unsigned int *const sse) {
  // extract the low lane and add it to the high lane
  const __m128i sse_reg_128 = mm256_add_hi_lo_epi32(vsse);

  // unpack sse and sum registers and add
  const __m128i sse_sum_lo = _mm_unpacklo_epi32(sse_reg_128, vsum);
  const __m128i sse_sum_hi = _mm_unpackhi_epi32(sse_reg_128, vsum);
  const __m128i sse_sum = _mm_add_epi32(sse_sum_lo, sse_sum_hi);

  // perform the final summation and extract the results
  const __m128i res = _mm_add_epi32(sse_sum, _mm_srli_si128(sse_sum, 8));
  *((int *)sse) = _mm_cvtsi128_si32(res);
  return _mm_extract_epi32(res, 1);
}
Example #16
uint32_t
halfsiphash(const unsigned char key[16], const unsigned char *m, size_t len) {
	xmmi k,v02,v20,v13,v11,v33,mi;
	uint32_t last7;
	uint32_t lo, hi;
	size_t i, blocks;

	k = _mm_loadu_si128((xmmi *)(key + 0));
	v02 = siphash_init[0].v;
	v13 = siphash_init[1].v;
	v02 = _mm_xor_si128(v02, _mm_unpacklo_epi64(k, k));
	v13 = _mm_xor_si128(v13, _mm_unpackhi_epi64(k, k));

	last7 = (len & 0xff) << 24;

	for (i = 0, blocks = (len & ~3); i < blocks; i += 4) {
		mi = _mm_loadl_epi64((xmmi *)(m + i));
		v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
		sipcompress()
		sipcompress()
		v02 = _mm_xor_si128(v02, mi);
	}

	switch (len - blocks) {
		case 3: last7 |= (uint32_t)m[i + 2] << 16;
		case 2: last7 |= (uint32_t)m[i + 1] <<  8;
		case 1: last7 |= (uint32_t)m[i + 0]      ;
		case 0:
		default:;
	};

	mi  = _mm_unpacklo_epi32(_mm_cvtsi32_si128(last7),_mm_cvtsi32_si128(0));
	v13 = _mm_xor_si128(v13, _mm_slli_si128(mi, 8));
	sipcompress()
	sipcompress()
	v02 = _mm_xor_si128(v02, mi);
	v02 = _mm_xor_si128(v02, siphash_final.v);
	sipcompress()
	sipcompress()
	sipcompress()
	sipcompress()

	v02 = _mm_xor_si128(v02, v13);
	v02 = _mm_xor_si128(v02, _mm_shuffle_epi32(v02, _MM_SHUFFLE(1,0,3,2)));
	lo  = _mm_cvtsi128_si32(v02);
	return lo;
}
Example #17
File: norm.cpp  Project: cyberCBM/DetectO
int normL1_(const uchar* a, const uchar* b, int n)
{
    int j = 0, d = 0;
#if CV_SSE
    __m128i d0 = _mm_setzero_si128();

    for( ; j <= n - 16; j += 16 )
    {
        __m128i t0 = _mm_loadu_si128((const __m128i*)(a + j));
        __m128i t1 = _mm_loadu_si128((const __m128i*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }

    for( ; j <= n - 4; j += 4 )
    {
        __m128i t0 = _mm_cvtsi32_si128(*(const int*)(a + j));
        __m128i t1 = _mm_cvtsi32_si128(*(const int*)(b + j));

        d0 = _mm_add_epi32(d0, _mm_sad_epu8(t0, t1));
    }
    d = _mm_cvtsi128_si32(_mm_add_epi32(d0, _mm_unpackhi_epi64(d0, d0)));
#elif CV_NEON
    uint32x4_t v_sum = vdupq_n_u32(0);
    for ( ; j <= n - 16; j += 16)
    {
        uint8x16_t v_dst = vabdq_u8(vld1q_u8(a + j), vld1q_u8(b + j));
        uint16x8_t v_low = vmovl_u8(vget_low_u8(v_dst)), v_high = vmovl_u8(vget_high_u8(v_dst));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_low_u16(v_low), vget_low_u16(v_high)));
        v_sum = vaddq_u32(v_sum, vaddl_u16(vget_high_u16(v_low), vget_high_u16(v_high)));
    }

    uint CV_DECL_ALIGNED(16) buf[4];
    vst1q_u32(buf, v_sum);
    d = buf[0] + buf[1] + buf[2] + buf[3];
#endif
    {
        for( ; j <= n - 4; j += 4 )
        {
            d += std::abs(a[j] - b[j]) + std::abs(a[j+1] - b[j+1]) +
            std::abs(a[j+2] - b[j+2]) + std::abs(a[j+3] - b[j+3]);
        }
    }
    for( ; j < n; j++ )
        d += std::abs(a[j] - b[j]);
    return d;
}
Example #18
        inline int rand() {
            __m128i split;
            __m128i multi;
            __m128i adder;
            __m128i mmask;
            __m128i smask;
            __m128i store;

            DATA(multi)={0x000343FD,0x000043FD,0x000343FD,0x00010DCD};
            DATA(adder)={0x00269EC3,0x009E9EC3,0x00D19EC3,0x00000001};
            DATA(mmask)={0xFFFFFFFF,0x00000000,0xFFFFFFFF,0x00000000};
            DATA(smask)={0x00007FFF,0x00007FFF,0x00007FFF,0x00007FFF};
            #undef DATA

            adder = _mm_load_si128   ((__m128i*)data_adder);
            multi = _mm_load_si128   ((__m128i*)data_multi);
            mmask = _mm_load_si128   ((__m128i*)data_mmask);
            smask = _mm_load_si128   ((__m128i*)data_smask);

            split = _mm_shuffle_epi32(
                __ccaprice_stdlib_rseed,
                __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE
            );

            __ccaprice_stdlib_rseed = _mm_mul_epu32(__ccaprice_stdlib_rseed, multi);
            multi                   = _mm_shuffle_epi32(
                multi,
                __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE
            );
            split                   = _mm_mul_epu32(split, multi);
            __ccaprice_stdlib_rseed = _mm_and_si128(__ccaprice_stdlib_rseed, mmask);
            split                   = _mm_and_si128(split, mmask);
            split                   = _mm_shuffle_epi32(
                split,
                __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE
            );
            __ccaprice_stdlib_rseed = _mm_or_si128  (__ccaprice_stdlib_rseed, split);
            __ccaprice_stdlib_rseed = _mm_add_epi32 (__ccaprice_stdlib_rseed, adder);
            store                   = _mm_srai_epi32(__ccaprice_stdlib_rseed, 0x10);
            store                   = _mm_and_si128 (store, smask);

            return (unsigned int)_mm_cvtsi128_si32(store);

            #undef __CCAPRICE_STDLIB_RANDOM_SSE_SHUFFLE
            #undef __CCAPRICE_STDLIB_RANDOM_SSE_STAIRS2
            #undef __CCAPRICE_STDLIB_RANDOM_SSE_STAIRS1
        }
Example #19
INLINE static unsigned sum_block_avx2(__m128i *ver_row)
{
  __m128i sad = _mm_setzero_si128();
  haddwd_accumulate_avx2(&sad, ver_row + 0);
  haddwd_accumulate_avx2(&sad, ver_row + 1);
  haddwd_accumulate_avx2(&sad, ver_row + 2);
  haddwd_accumulate_avx2(&sad, ver_row + 3); 
  haddwd_accumulate_avx2(&sad, ver_row + 4);
  haddwd_accumulate_avx2(&sad, ver_row + 5);
  haddwd_accumulate_avx2(&sad, ver_row + 6);
  haddwd_accumulate_avx2(&sad, ver_row + 7);

  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(2, 3, 0, 1)));
  sad = _mm_add_epi32(sad, _mm_shuffle_epi32(sad, KVZ_PERMUTE(1, 0, 1, 0)));

  return _mm_cvtsi128_si32(sad);
}
Example #20
static WEBP_INLINE uint32_t ClampedAddSubtractHalf(uint32_t c0, uint32_t c1,
                                                   uint32_t c2) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i C0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c0), zero);
  const __m128i C1 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c1), zero);
  const __m128i B0 = _mm_unpacklo_epi8(_mm_cvtsi32_si128(c2), zero);
  const __m128i avg = _mm_add_epi16(C1, C0);
  const __m128i A0 = _mm_srli_epi16(avg, 1);
  const __m128i A1 = _mm_sub_epi16(A0, B0);
  const __m128i BgtA = _mm_cmpgt_epi16(B0, A0);
  const __m128i A2 = _mm_sub_epi16(A1, BgtA);
  const __m128i A3 = _mm_srai_epi16(A2, 1);
  const __m128i A4 = _mm_add_epi16(A0, A3);
  const __m128i A5 = _mm_packus_epi16(A4, A4);
  const uint32_t output = _mm_cvtsi128_si32(A5);
  return output;
}
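Per channel: a = (c0 + c1) >> 1, then clamp(a + (a - c2) / 2) to [0, 255]; subtracting BgtA (which is -1 where c2 > a) makes the arithmetic shift round toward zero like C integer division. A scalar sketch:

#include <stdint.h>

static uint32_t ClampedAddSubtractHalf_C(uint32_t c0, uint32_t c1, uint32_t c2) {
  uint32_t out = 0;
  for (int shift = 0; shift < 32; shift += 8) {
    const int a = (int)(((c0 >> shift) & 0xff) + ((c1 >> shift) & 0xff)) >> 1;
    const int b = (int)((c2 >> shift) & 0xff);
    int v = a + (a - b) / 2;  // C division truncates toward zero
    if (v < 0) v = 0;
    if (v > 255) v = 255;
    out |= (uint32_t)v << shift;
  }
  return out;
}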
Example #21
File: image.cpp  Project: Alcaro/Arlib
//lower - usually target
//upper - usually source; its alpha decides how much of the lower color is visible
//always does the full blending operation, does not optimize based on A=FF being true 90% of the time and A=00 90% of the remainder
static inline uint32_t blend_8888_on_8888(uint32_t argb_lower, uint32_t argb_upper)
{
#ifdef __SSE2__
	//no need to extend this above 128bit, it's complex enough without having to consider multiple pixels at once
	
	uint32_t spx = argb_upper;
	uint32_t tpx = argb_lower;
	
	//contains u16: spx.a, spx.b, spx.g, spx.r, tpx.{a,b,g,r}
	__m128i vals = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, spx, tpx), _mm_setzero_si128());
	
	//contains u16: {sa}*4, {255-sa}*4
	__m128i alphas = _mm_xor_si128(_mm_set1_epi16(spx>>24), _mm_set_epi16(0,0,0,0, 255,255,255,255));
	//contains u16: pixel contributions times 255
	__m128i newcols255 = _mm_mullo_epi16(vals, alphas);
	
	//ugly magic constants: (u16)*8081>>16>>7 = (u16)/255
	__m128i newcols = _mm_srli_epi16(_mm_mulhi_epu16(newcols255, _mm_set1_epi16(0x8081)), 7);
	
	//contains u8: {don't care}*8, sac (source alpha contribution), sbc, sgc, src, tac, tbc, tgc, trc
	__m128i newpack = _mm_packus_epi16(newcols, _mm_undefined_si128());
	//contains u8: {don't care}*12, sac+tac = result alpha, sbc+tbc, sgc+tgc, src+trc
	//the components are known to not overflow
	__m128i newpacksum = _mm_add_epi8(newpack, _mm_srli_si128(newpack, 32/8));
	
	return _mm_cvtsi128_si32(newpacksum);
#else
	uint8_t sr = argb_upper>>0;
	uint8_t sg = argb_upper>>8;
	uint8_t sb = argb_upper>>16;
	uint8_t sa = argb_upper>>24;
	
	uint8_t tr = argb_lower>>0;
	uint8_t tg = argb_lower>>8;
	uint8_t tb = argb_lower>>16;
	uint8_t ta = argb_lower>>24;
	
	tr = (sr*sa/255) + (tr*(255-sa)/255);
	tg = (sg*sa/255) + (tg*(255-sa)/255);
	tb = (sb*sa/255) + (tb*(255-sa)/255);
	ta = (sa*sa/255) + (ta*(255-sa)/255);
	
	return ta<<24 | tb<<16 | tg<<8 | tr<<0;
#endif
}
Example #22
File: main.cpp  Project: CCJY/coliru
int countZeroBytes_SSE(char* values, int length) {
    int zeroCount = 0;
    __m128i zero16 = _mm_set1_epi8(0);
    __m128i and16 = _mm_set1_epi8(1);
    for(int i=0; i<length; i+=16) {
        __m128i values16 = _mm_loadu_si128((__m128i*)&values[i]);
        __m128i cmp = _mm_cmpeq_epi8(values16, zero16);
        if(_mm_movemask_epi8(cmp)) {
            cmp = _mm_and_si128(and16, cmp); //change -1 values to 1
            //horizontal sum of 16 bytes
            __m128i sum1 = _mm_sad_epu8(cmp,zero16);
            __m128i sum2 = _mm_shuffle_epi32(sum1,2);
            __m128i sum3 = _mm_add_epi16(sum1,sum2);
            zeroCount += _mm_cvtsi128_si32(sum3);
        }
    }
    return zeroCount;
}
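A minimal driver sketch; note the loop above assumes length is a multiple of 16 and has no scalar tail, so shorter buffers would read out of bounds:

#include <stdio.h>

int main(void) {
    char buf[32] = { 0 };
    buf[3] = 'x';
    printf("%d\n", countZeroBytes_SSE(buf, 32));  // prints 31
    return 0;
}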
Example #23
/**
 * Adds 4 pixels (in a 2x2 grid) and multiplies them by 2, resulting in a more
 * precise version of a 4:2:0 box-filter pixel subsampling in Q3.
 *
 * The CfL prediction buffer is always of size CFL_BUF_SQUARE. However, the
 * active area is specified using width and height.
 *
 * Note: We don't need to worry about going over the active area, as long as we
 * stay inside the CfL prediction buffer.
 */
static INLINE void cfl_luma_subsampling_420_hbd_ssse3(const uint16_t *input,
                                                      int input_stride,
                                                      uint16_t *pred_buf_q3,
                                                      int width, int height) {
  const uint16_t *end = pred_buf_q3 + (height >> 1) * CFL_BUF_LINE;
  const int luma_stride = input_stride << 1;
  do {
    if (width == 4) {
      const __m128i top = _mm_loadl_epi64((__m128i *)input);
      const __m128i bot = _mm_loadl_epi64((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      sum = _mm_hadd_epi16(sum, sum);
      *((int *)pred_buf_q3) = _mm_cvtsi128_si32(_mm_add_epi16(sum, sum));
    } else {
      const __m128i top = _mm_loadu_si128((__m128i *)input);
      const __m128i bot = _mm_loadu_si128((__m128i *)(input + input_stride));
      __m128i sum = _mm_add_epi16(top, bot);
      if (width == 8) {
        sum = _mm_hadd_epi16(sum, sum);
        _mm_storel_epi64((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
      } else {
        const __m128i top_1 = _mm_loadu_si128(((__m128i *)input) + 1);
        const __m128i bot_1 =
            _mm_loadu_si128(((__m128i *)(input + input_stride)) + 1);
        sum = _mm_hadd_epi16(sum, _mm_add_epi16(top_1, bot_1));
        _mm_storeu_si128((__m128i *)pred_buf_q3, _mm_add_epi16(sum, sum));
        if (width == 32) {
          const __m128i top_2 = _mm_loadu_si128(((__m128i *)input) + 2);
          const __m128i bot_2 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 2);
          const __m128i top_3 = _mm_loadu_si128(((__m128i *)input) + 3);
          const __m128i bot_3 =
              _mm_loadu_si128(((__m128i *)(input + input_stride)) + 3);
          const __m128i sum_2 = _mm_add_epi16(top_2, bot_2);
          const __m128i sum_3 = _mm_add_epi16(top_3, bot_3);
          __m128i next_sum = _mm_hadd_epi16(sum_2, sum_3);
          _mm_storeu_si128(((__m128i *)pred_buf_q3) + 1,
                           _mm_add_epi16(next_sum, next_sum));
        }
      }
    }
    input += luma_stride;
  } while ((pred_buf_q3 += CFL_BUF_LINE) < end);
}
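Each output sample is the 2x2 sum times 2, i.e. the block average times 8 (Q3), keeping the two fractional bits a plain average would drop; a scalar sketch of one output sample:

#include <stdint.h>

static uint16_t subsample_420_q3(const uint16_t *input, int input_stride, int x) {
  const int sum = input[2 * x] + input[2 * x + 1] +
                  input[2 * x + input_stride] + input[2 * x + input_stride + 1];
  return (uint16_t)(sum * 2);  // == (sum / 4) * 8 without truncation loss
}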
Example #24
//FINL 
int16 __ext_v_sum_int16(int16* x, int len)
{
	__m128i msum = _mm_setzero_si128();

#ifdef SORA_PLATFORM
	__int16 ret;
#else
	int16_t ret;
#endif

	const int wlen = 8;

	for (int i = 0; i < len / wlen; i++)
	{
		__m128i mx = _mm_loadu_si128((__m128i *)(x + wlen*i));
		msum = _mm_add_epi16(msum, mx);
	}

	__m128i mout = msum;


	msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
	mout = _mm_add_epi16(mout, msum);

	msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
	mout = _mm_add_epi16(mout, msum);

	msum = _mm_shuffle_epi32(msum, _MM_SHUFFLE(2, 1, 0, 3));
	mout = _mm_add_epi16(mout, msum);


	unsigned int temp = _mm_cvtsi128_si32(mout);
	ret = temp;
	
	ret += (temp >> 16);

	for (int i = (len / wlen) * wlen; i < len; i++)
	{
		ret += x[i];
	}

	return ret;

}
Example #25
static void aom_filter_block1d4_h4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pixels_per_line, uint8_t *output_ptr,
    ptrdiff_t output_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32, filt1Reg, firstFilters, srcReg32b1, srcRegFilt32b1_1;
  unsigned int i;
  src_ptr -= 3;
  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  // convert the 16-bit (short) coefficients to 8-bit (byte) and replicate the
  // same data in both lanes of the 128-bit register.
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));
  filt1Reg = _mm_load_si128((__m128i const *)(filtd4));

  for (i = output_height; i > 0; i -= 1) {
    // load the 2 strides of source
    srcReg32b1 = _mm_loadu_si128((const __m128i *)src_ptr);

    // filter the source buffer
    srcRegFilt32b1_1 = _mm_shuffle_epi8(srcReg32b1, filt1Reg);

    // multiply 4 adjacent elements with the filter and add the result
    srcRegFilt32b1_1 = _mm_maddubs_epi16(srcRegFilt32b1_1, firstFilters);

    srcRegFilt32b1_1 = _mm_hadds_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    // round: add 32, then arithmetic-shift each 16-bit value right by 6
    srcRegFilt32b1_1 = _mm_adds_epi16(srcRegFilt32b1_1, addFilterReg32);
    srcRegFilt32b1_1 = _mm_srai_epi16(srcRegFilt32b1_1, 6);

    // shrink each 16-bit value to 8 bits; the first lane contains the first
    // convolve result and the second lane contains the second
    srcRegFilt32b1_1 = _mm_packus_epi16(srcRegFilt32b1_1, _mm_setzero_si128());

    src_ptr += src_pixels_per_line;

    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(srcRegFilt32b1_1);
    output_ptr += output_pitch;
  }
}
Example #26
void bitund132(unsigned *p, unsigned n, unsigned x) { 
    #ifdef __SSE2__
  __m128i sv = _mm_set1_epi32(x), cv = _mm_set_epi32(4,3,2,1);
  unsigned *ip;
  for(ip = p; ip != p+(n&~(4-1)); ) {
    __m128i v =  _mm_loadu_si128((__m128i *)ip); 
	SCANI128_32(v, sv, cv); 
	_mm_storeu_si128((__m128i *)ip, sv); 
	ip += 4;
  }
  x = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
  while(ip < p+n) { 
    *ip = (x += (*ip) + 1); 
	ip++; 
  }
    #else
  BITUNDELTA(p, n, x, 1); 
    #endif
}
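The scalar tail loop already spells out the semantics; the whole routine is equivalent to this plain prefix-sum undelta (each stored value is a gap, and the decoded value is the running sum of gap + 1):

void bitund132_scalar(unsigned *p, unsigned n, unsigned x) {
  for (unsigned i = 0; i < n; i++)
    p[i] = (x += p[i] + 1);
}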
Example #27
unsigned bitdelta32(unsigned *in, unsigned n, unsigned *out, unsigned start, unsigned inc) {
    #ifdef __SSE2__
  unsigned *ip,b,*op = out; 
  __m128i bv = _mm_setzero_si128(), sv = _mm_set1_epi32(start), cv = _mm_set1_epi32(inc), dv;
  for(ip = in; ip != in+(n&~(4-1)); ip += 4) { 
    __m128i iv = _mm_loadu_si128((__m128i *)ip); 
	bv = _mm_or_si128(bv, dv = _mm_sub_epi32(DELTA128_32(iv,sv),cv)); 
	sv = iv; 
	_mm_storeu_si128((__m128i *)op, dv); 
	op += 4; 
  }
  start = (unsigned)_mm_cvtsi128_si32(_mm_srli_si128(sv,12));
  HOR128_32(bv, b);
  while(ip < in+n) { unsigned x = *ip-start-inc; start = *ip++; b |= x; *op++ = x; }
    #else
  typeof(in[0]) b = 0,*op = out; BITDELTA(in, n, inc, start, b |= _x;*op++ = _x);
    #endif
  return bsr32(b);
}
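Likewise, a scalar equivalent of bitdelta32, mirroring its own tail loop; bsr32 (assumed: position of the highest set bit) turns the OR of all deltas into the bit width needed to store them:

unsigned bitdelta32_scalar(unsigned *in, unsigned n, unsigned *out,
                           unsigned start, unsigned inc) {
  unsigned b = 0;
  for (unsigned i = 0; i < n; i++) {
    const unsigned x = in[i] - start - inc;  // delta minus expected increment
    start = in[i];
    b |= x;
    out[i] = x;
  }
  return bsr32(b);
}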
Example #28
static INLINE unsigned int masked_sad4xh_ssse3(
    const uint8_t *src_ptr, int src_stride, const uint8_t *a_ptr, int a_stride,
    const uint8_t *b_ptr, int b_stride, const uint8_t *m_ptr, int m_stride,
    int height) {
  int y;
  __m128i res = _mm_setzero_si128();
  const __m128i mask_max = _mm_set1_epi8((1 << AOM_BLEND_A64_ROUND_BITS));

  for (y = 0; y < height; y += 2) {
    // Load two rows at a time, this seems to be a bit faster
    // than four rows at a time in this case.
    const __m128i src = _mm_unpacklo_epi32(
        _mm_cvtsi32_si128(*(uint32_t *)src_ptr),
        _mm_cvtsi32_si128(*(uint32_t *)&src_ptr[src_stride]));
    const __m128i a =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)a_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&a_ptr[a_stride]));
    const __m128i b =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)b_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&b_ptr[b_stride]));
    const __m128i m =
        _mm_unpacklo_epi32(_mm_cvtsi32_si128(*(uint32_t *)m_ptr),
                           _mm_cvtsi32_si128(*(uint32_t *)&m_ptr[m_stride]));
    const __m128i m_inv = _mm_sub_epi8(mask_max, m);

    const __m128i data = _mm_unpacklo_epi8(a, b);
    const __m128i mask = _mm_unpacklo_epi8(m, m_inv);
    __m128i pred_16bit = _mm_maddubs_epi16(data, mask);
    pred_16bit = xx_roundn_epu16(pred_16bit, AOM_BLEND_A64_ROUND_BITS);

    const __m128i pred = _mm_packus_epi16(pred_16bit, _mm_setzero_si128());
    res = _mm_add_epi32(res, _mm_sad_epu8(pred, src));

    src_ptr += src_stride * 2;
    a_ptr += a_stride * 2;
    b_ptr += b_stride * 2;
    m_ptr += m_stride * 2;
  }
  // At this point, the SAD is stored in lane 0 of 'res'
  int32_t sad = _mm_cvtsi128_si32(res);
  return (sad + 31) >> 6;
}
Example #29
void ie_FillLine(iePwL pDst, DWORD nXW, iewL clr)
{
#ifndef __X64__
	if (g_bSSE2 && (nXW >= 16)) {
#else
	if (nXW >= 16) {
#endif
		// Do fill using SSE2!

		while (nXW) {												// Fill until destination is aligned
			if (_mm_isAligned(pDst)) break;
			*pDst++ = clr;
			nXW--;
		}

		__m128i r0 = _mm_set1_epi16(clr);

		for (DWORD nXW_8 = nXW >> 3; nXW_8--;) {
			_mm_store_si128((__m128i *)pDst, r0);
			pDst += 8;
		}

		if (nXW & 4) {
			_mm_storel_epi64((__m128i *)pDst, r0);
			pDst += 4;
		}

		if (nXW & 2) {
			*PDWORD(pDst) = _mm_cvtsi128_si32(r0);
			pDst += 2;
		}

		if (nXW & 1) {
			*PWORD(pDst) = clr;
		}

		return;
	}

	while (nXW--)
		*pDst++ = clr;
}
Example #30
inline Pixel GetPixelSSE(const Image* img, float x, float y)
{
 const int stride = img->width;
 const Pixel* p0 = img->data + (int)x + (int)y * stride; // pointer to first pixel

 // Load the data (2 pixels in one load)
 __m128i p12 = _mm_loadl_epi64((const __m128i*)&p0[0 * stride]); 
 __m128i p34 = _mm_loadl_epi64((const __m128i*)&p0[1 * stride]); 

 __m128 weight = CalcWeights(x, y);

 // extend to 16bit
 p12 = _mm_unpacklo_epi8(p12, _mm_setzero_si128());
 p34 = _mm_unpacklo_epi8(p34, _mm_setzero_si128());

 // convert floating point weights to 16bit integer
 weight = _mm_mul_ps(weight, CONST_256); 
 __m128i weighti = _mm_cvtps_epi32(weight); // w4 w3 w2 w1
         weighti = _mm_packs_epi32(weighti, _mm_setzero_si128()); // 32->16bit

 // prepare the weights
 __m128i w12 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(1, 1, 0, 0));
 __m128i w34 = _mm_shufflelo_epi16(weighti, _MM_SHUFFLE(3, 3, 2, 2));
 w12 = _mm_unpacklo_epi16(w12, w12); // w2 w2 w2 w2 w1 w1 w1 w1
 w34 = _mm_unpacklo_epi16(w34, w34); // w4 w4 w4 w4 w3 w3 w3 w3
 
 // multiply each pixel with its weight (2 pixel per SSE mul)
 __m128i L12 = _mm_mullo_epi16(p12, w12);
 __m128i L34 = _mm_mullo_epi16(p34, w34);

 // sum the results
 __m128i L1234 = _mm_add_epi16(L12, L34); 
 __m128i Lhi = _mm_shuffle_epi32(L1234, _MM_SHUFFLE(3, 2, 3, 2));
 __m128i L = _mm_add_epi16(L1234, Lhi);
  
 // convert back to 8bit
 __m128i L8 = _mm_srli_epi16(L, 8); // divide by 256
 L8 = _mm_packus_epi16(L8, _mm_setzero_si128());
 
 // return
 return _mm_cvtsi128_si32(L8);
}
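CalcWeights and CONST_256 are not shown; below are hypothetical definitions consistent with the comments above (CONST_256 as 256.0f in all lanes, bilinear weights w1..w4 with w1 in lane 0), not the original helpers:

#include <xmmintrin.h>

#define CONST_256 _mm_set1_ps(256.0f)  // assumed: 256.0 broadcast to all lanes

// Assumed bilinear weights: w1=(1-fx)(1-fy), w2=fx(1-fy), w3=(1-fx)fy, w4=fx*fy,
// returned as (w4, w3, w2, w1) so that lane 0 holds w1.
static __m128 CalcWeights(float x, float y)
{
 const float fx = x - (int)x;  // fractional part of x
 const float fy = y - (int)y;  // fractional part of y
 return _mm_set_ps(fx * fy, (1.0f - fx) * fy,
                   fx * (1.0f - fy), (1.0f - fx) * (1.0f - fy));
}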