Exemple #1
/*
 * Routine optimized for unshuffling a buffer for a type size of 16 bytes.
 *
 * `orig` is read as 16 consecutive byte planes, each `16 * (size / 256)`
 * bytes long, where plane b holds byte b of every element.  `dest` receives
 * the re-assembled elements, 16 bytes per element.
 *
 * Work is done in groups of 16 elements (256 bytes); bytes beyond the last
 * full group are neither read nor written here.  Unaligned loads and stores
 * are used, so neither buffer has to be 16-byte aligned.
 */
static void
unshuffle16(uint8_t* dest, uint8_t* orig, size_t size)
{
  /* The unpack network below leaves element e of the group in
     xmm1[bit_reverse_4(e)]; this table lists the xmm1 index to store for
     each consecutive output element. */
  static const unsigned char store_order[16] = {
    0, 8, 4, 12, 2, 10, 6, 14, 1, 9, 5, 13, 3, 11, 7, 15
  };
  size_t i, j, k;
  size_t neblock, numof16belem;
  __m128i xmm1[16], xmm2[16];

  neblock = size / 16;            /* number of 16-byte elements */
  numof16belem = neblock / 16;    /* number of full 16-element groups */
  for (i = 0, k = 0; i < numof16belem; i++, k += 16) {
    /* Load one 16-byte slice from each of the 16 byte planes */
    for (j = 0; j < 16; j++) {
      xmm1[j] = _mm_loadu_si128((const __m128i *)orig + (j*numof16belem+i));
    }
    /* Interleave bytes */
    for (j = 0; j < 8; j++) {
      /* Low halves of each register pair */
      xmm2[j] = _mm_unpacklo_epi8(xmm1[j*2], xmm1[j*2+1]);
      /* High halves of each register pair */
      xmm2[8+j] = _mm_unpackhi_epi8(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Interleave 2-byte words */
    for (j = 0; j < 8; j++) {
      xmm1[j] = _mm_unpacklo_epi16(xmm2[j*2], xmm2[j*2+1]);
      xmm1[8+j] = _mm_unpackhi_epi16(xmm2[j*2], xmm2[j*2+1]);
    }
    /* Interleave 4-byte dwords */
    for (j = 0; j < 8; j++) {
      xmm2[j] = _mm_unpacklo_epi32(xmm1[j*2], xmm1[j*2+1]);
      xmm2[8+j] = _mm_unpackhi_epi32(xmm1[j*2], xmm1[j*2+1]);
    }
    /* Interleave 8-byte qwords */
    for (j = 0; j < 8; j++) {
      xmm1[j] = _mm_unpacklo_epi64(xmm2[j*2], xmm2[j*2+1]);
      xmm1[8+j] = _mm_unpackhi_epi64(xmm2[j*2], xmm2[j*2+1]);
    }
    /* Store the result vectors in proper order */
    for (j = 0; j < 16; j++) {
      _mm_storeu_si128((__m128i *)dest + (k+j), xmm1[store_order[j]]);
    }
  }
}
Exemple #2
void EmitColorIndices_Intrinsics( const byte *colorBlock, const byte *minColor, const byte *maxColor, byte *&outData )
	ALIGN16( byte color0[16] );
	ALIGN16( byte color1[16] );
	ALIGN16( byte color2[16] );
	ALIGN16( byte color3[16] );
	ALIGN16( byte result[16] );
	// mov esi, maxColor
	// mov edi, minColor

	__m128i t0, t1, t2, t3, t4, t5, t6, t7;

	t7 = _mm_setzero_si128();
	//t7 = _mm_xor_si128(t7, t7);
	_mm_store_si128 ( (__m128i*) &result, t7 );

	//t0 = _mm_load_si128 ( (__m128i*)  maxColor );
	t0 = _mm_cvtsi32_si128( *(int*)maxColor);

	// Bitwise AND
	__m128i tt = _mm_load_si128 ( (__m128i*) SIMD_SSE2_byte_colorMask );
	t0 = _mm_and_si128(t0, tt);

	t0 = _mm_unpacklo_epi8(t0, t7);

	t4 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t0, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);
	t5 = _mm_srli_epi16(t5, 6);

	// Bitwise Logical OR
	t0 = _mm_or_si128(t0, t4);
	t0 = _mm_or_si128(t0, t5);   // t0 contains color0 in 565

	//t1 = _mm_load_si128 ( (__m128i*)  minColor );
	t1 = _mm_cvtsi32_si128( *(int*)minColor);

	t1 = _mm_and_si128(t1, tt);

	t1 = _mm_unpacklo_epi8(t1, t7);

	t4 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 0, 3, 2, 3 ));
	t5 = _mm_shufflelo_epi16( t1, R_SHUFFLE_D( 3, 1, 3, 3 ));

	t4 = _mm_srli_epi16(t4, 5);
	t5 = _mm_srli_epi16(t5, 6);

	t1 = _mm_or_si128(t1, t4);
	t1 = _mm_or_si128(t1, t5);  // t1 contains color1 in 565

	t2 = t0;

	t2 = _mm_packus_epi16(t2, t7);

	t2 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color0, t2 );

	t6 = t0;
	t6 = _mm_add_epi16(t6, t0);
	t6 = _mm_add_epi16(t6, t1);

	// Multiply Packed Signed Integers and Store High Result
	__m128i tw3 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_div_by_3 );
	t6 = _mm_mulhi_epi16(t6, tw3);
	t6 = _mm_packus_epi16(t6, t7);

	t6 = _mm_shuffle_epi32( t6, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color2, t6 );

	t3 = t1;
	t3 = _mm_packus_epi16(t3, t7);
	t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 1, 0, 1 ));

	_mm_store_si128 ( (__m128i*) &color1, t3 );

	t1 = _mm_add_epi16(t1, t1);
	t0 = _mm_add_epi16(t0, t1);

	t0 = _mm_mulhi_epi16(t0, tw3);
	t0 = _mm_packus_epi16(t0, t7);

	t0 = _mm_shuffle_epi32( t0, R_SHUFFLE_D( 0, 1, 0, 1 ));
	_mm_store_si128 ( (__m128i*) &color3, t0 );

	__m128i w0 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_0);
	__m128i w1 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_1);
	__m128i w2 = _mm_load_si128 ( (__m128i*) SIMD_SSE2_word_2);

	    // mov eax, 32
	    // mov esi, colorBlock
	int x = 32;
	//const byte *c = colorBlock;
	while (x >= 0)
	    t3 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+0));
	    t3 = _mm_shuffle_epi32( t3, R_SHUFFLE_D( 0, 2, 1, 3 ));
	    t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+8));
	    t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t0 = t3;
	    t6 = t5;
	    // Compute Sum of Absolute Difference
	    __m128i c0 = _mm_load_si128 ( (__m128i*)  color0 );
	    t0 = _mm_sad_epu8(t0, c0);
	    t6 = _mm_sad_epu8(t6, c0);
	    // Pack with Signed Saturation 
	    t0 = _mm_packs_epi32 (t0, t6);

	    t1 = t3;
	    t6 = t5;
	    __m128i c1 = _mm_load_si128 ( (__m128i*)  color1 );
	    t1 = _mm_sad_epu8(t1, c1);
	    t6 = _mm_sad_epu8(t6, c1);
	    t1 = _mm_packs_epi32 (t1, t6);

	    t2 = t3;
	    t6 = t5;
	    __m128i c2 = _mm_load_si128 ( (__m128i*)  color2 );
	    t2 = _mm_sad_epu8(t2, c2);
	    t6 = _mm_sad_epu8(t6, c2);
	    t2 = _mm_packs_epi32 (t2, t6);

	    __m128i c3 = _mm_load_si128 ( (__m128i*)  color3 );
	    t3 = _mm_sad_epu8(t3, c3);
	    t5 = _mm_sad_epu8(t5, c3);
	    t3 = _mm_packs_epi32 (t3, t5);

	    t4 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+16));
	    t4 = _mm_shuffle_epi32( t4, R_SHUFFLE_D( 0, 2, 1, 3 ));
	    t5 = _mm_loadl_epi64( (__m128i*) (colorBlock+x+24));
	    t5 = _mm_shuffle_epi32( t5, R_SHUFFLE_D( 0, 2, 1, 3 ));

	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c0);
	    t7 = _mm_sad_epu8(t7, c0);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t0 = _mm_packs_epi32 (t0, t6);  // d0

	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c1);
	    t7 = _mm_sad_epu8(t7, c1);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t1 = _mm_packs_epi32 (t1, t6);  // d1

	    t6 = t4;
	    t7 = t5;
	    t6 = _mm_sad_epu8(t6, c2);
	    t7 = _mm_sad_epu8(t7, c2);
	    t6 = _mm_packs_epi32 (t6, t7);
	    t2 = _mm_packs_epi32 (t2, t6);  // d2

	    t4 = _mm_sad_epu8(t4, c3);
	    t5 = _mm_sad_epu8(t5, c3);
	    t4 = _mm_packs_epi32 (t4, t5);
	    t3 = _mm_packs_epi32 (t3, t4);  // d3

	    t7 = _mm_load_si128 ( (__m128i*) result );

	    t7 = _mm_slli_epi32( t7, 16);

	    t4 = t0;
	    t5 = t1;
	    // Compare Packed Signed Integers for Greater Than
	    t0 = _mm_cmpgt_epi16(t0, t3); // b0
	    t1 = _mm_cmpgt_epi16(t1, t2); // b1
	    t4 = _mm_cmpgt_epi16(t4, t2); // b2
	    t5 = _mm_cmpgt_epi16(t5, t3); // b3
	    t2 = _mm_cmpgt_epi16(t2, t3); // b4
	    t4 = _mm_and_si128(t4, t1); // x0
	    t5 = _mm_and_si128(t5, t0); // x1
	    t2 = _mm_and_si128(t2, t0); // x2

	    t4 = _mm_or_si128(t4, t5);
	    t2 = _mm_and_si128(t2, w1);
	    t4 = _mm_and_si128(t4, w2);
	    t2 = _mm_or_si128(t2, t4);

	    t5 = _mm_shuffle_epi32( t2, R_SHUFFLE_D( 2, 3, 0, 1 ));

	    // Unpack Low Data
	    t2 = _mm_unpacklo_epi16 ( t2, w0);
	    t5 = _mm_unpacklo_epi16 ( t5, w0);

	    //t5 = _mm_slli_si128 ( t5, 8);
	    t5 = _mm_slli_epi32( t5, 8);

	    t7 = _mm_or_si128(t7, t5);
	    t7 = _mm_or_si128(t7, t2);

	    _mm_store_si128 ( (__m128i*) &result, t7 );

	    x -=32;

	t4 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 1, 2, 3, 0 ));
	t5 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 2, 3, 0, 1 ));
	t6 = _mm_shuffle_epi32( t7, R_SHUFFLE_D( 3, 0, 1, 2 ));

	t4 = _mm_slli_epi32 ( t4, 2);
	t5 = _mm_slli_epi32 ( t5, 4);
	t6 = _mm_slli_epi32 ( t6, 6);

	t7 = _mm_or_si128(t7, t4);
	t7 = _mm_or_si128(t7, t5);
	t7 = _mm_or_si128(t7, t6);

	//_mm_store_si128 ( (__m128i*) outData, t7 );

	int r = _mm_cvtsi128_si32 (t7);
	memcpy(outData, &r, 4);   // Anything better ?

	outData += 4;
Exemple #3
// Does one or two inverse transforms.
//
// 'in' holds 16 coefficients per transform (32 when 'do_two' is non-zero).
// The reconstructed residual is added to the pixels read from 'ref' and the
// result, saturated to 8 bits, is written to 'dst'. Both 'ref' and 'dst' are
// addressed as four rows spaced BPS bytes apart; each row is 4 pixels wide
// for one transform and 8 pixels wide for two.
static void ITransform(const uint8_t* ref, const int16_t* in, uint8_t* dst,
                       int do_two) {
  // This implementation makes use of 16-bit fixed point versions of two
  // multiply constants:
  //    K1 = sqrt(2) * cos (pi/8) ~= 85627 / 2^16
  //    K2 = sqrt(2) * sin (pi/8) ~= 35468 / 2^16
  // To be able to use signed 16-bit integers, we use the following trick to
  // have constants within range:
  // - Associated constants are obtained by subtracting the 16-bit fixed point
  //   version of one:
  //      k = K - (1 << 16)  =>  K = k + (1 << 16)
  //      K1 = 85627  =>  k1 =  20091
  //      K2 = 35468  =>  k2 = -30068
  // - The multiplication of a variable by a constant become the sum of the
  //   variable and the multiplication of that variable by the associated
  //   constant:
  //      (x * K) >> 16 = (x * (k + (1 << 16))) >> 16 = ((x * k ) >> 16) + x
  const __m128i k1 = _mm_set1_epi16(20091);
  const __m128i k2 = _mm_set1_epi16(-30068);
  __m128i T0, T1, T2, T3;

  // Load and concatenate the transform coefficients (we'll do two inverse
  // transforms in parallel). In the case of only one inverse transform, the
  // second half of the vectors holds values we'll never use nor store.
  __m128i in0, in1, in2, in3;
  {
    in0 = _mm_loadl_epi64((const __m128i*)&in[0]);
    in1 = _mm_loadl_epi64((const __m128i*)&in[4]);
    in2 = _mm_loadl_epi64((const __m128i*)&in[8]);
    in3 = _mm_loadl_epi64((const __m128i*)&in[12]);
    // a00 a10 a20 a30   x x x x
    // a01 a11 a21 a31   x x x x
    // a02 a12 a22 a32   x x x x
    // a03 a13 a23 a33   x x x x
    if (do_two) {
      const __m128i inB0 = _mm_loadl_epi64((const __m128i*)&in[16]);
      const __m128i inB1 = _mm_loadl_epi64((const __m128i*)&in[20]);
      const __m128i inB2 = _mm_loadl_epi64((const __m128i*)&in[24]);
      const __m128i inB3 = _mm_loadl_epi64((const __m128i*)&in[28]);
      in0 = _mm_unpacklo_epi64(in0, inB0);
      in1 = _mm_unpacklo_epi64(in1, inB1);
      in2 = _mm_unpacklo_epi64(in2, inB2);
      in3 = _mm_unpacklo_epi64(in3, inB3);
      // a00 a10 a20 a30   b00 b10 b20 b30
      // a01 a11 a21 a31   b01 b11 b21 b31
      // a02 a12 a22 a32   b02 b12 b22 b32
      // a03 a13 a23 a33   b03 b13 b23 b33
    }
  }

  // Vertical pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    const __m128i a = _mm_add_epi16(in0, in2);
    const __m128i b = _mm_sub_epi16(in0, in2);
    // c = MUL(in1, K2) - MUL(in3, K1) = MUL(in1, k2) - MUL(in3, k1) + in1 - in3
    const __m128i c1 = _mm_mulhi_epi16(in1, k2);
    const __m128i c2 = _mm_mulhi_epi16(in3, k1);
    const __m128i c3 = _mm_sub_epi16(in1, in3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(in1, K1) + MUL(in3, K2) = MUL(in1, k1) + MUL(in3, k2) + in1 + in3
    const __m128i d1 = _mm_mulhi_epi16(in1, k1);
    const __m128i d2 = _mm_mulhi_epi16(in3, k2);
    const __m128i d3 = _mm_add_epi16(in1, in3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(tmp0, tmp1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(tmp2, tmp3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(tmp0, tmp1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(tmp2, tmp3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // First pass, c and d calculations are longer because of the "trick"
    // multiplications.
    // The rounding constant 4 is folded into the DC term ahead of the
    // final '>> 3'.
    const __m128i four = _mm_set1_epi16(4);
    const __m128i dc = _mm_add_epi16(T0, four);
    const __m128i a =  _mm_add_epi16(dc, T2);
    const __m128i b =  _mm_sub_epi16(dc, T2);
    // c = MUL(T1, K2) - MUL(T3, K1) = MUL(T1, k2) - MUL(T3, k1) + T1 - T3
    const __m128i c1 = _mm_mulhi_epi16(T1, k2);
    const __m128i c2 = _mm_mulhi_epi16(T3, k1);
    const __m128i c3 = _mm_sub_epi16(T1, T3);
    const __m128i c4 = _mm_sub_epi16(c1, c2);
    const __m128i c = _mm_add_epi16(c3, c4);
    // d = MUL(T1, K1) + MUL(T3, K2) = MUL(T1, k1) + MUL(T3, k2) + T1 + T3
    const __m128i d1 = _mm_mulhi_epi16(T1, k1);
    const __m128i d2 = _mm_mulhi_epi16(T3, k2);
    const __m128i d3 = _mm_add_epi16(T1, T3);
    const __m128i d4 = _mm_add_epi16(d1, d2);
    const __m128i d = _mm_add_epi16(d3, d4);

    // Second pass.
    const __m128i tmp0 = _mm_add_epi16(a, d);
    const __m128i tmp1 = _mm_add_epi16(b, c);
    const __m128i tmp2 = _mm_sub_epi16(b, c);
    const __m128i tmp3 = _mm_sub_epi16(a, d);
    const __m128i shifted0 = _mm_srai_epi16(tmp0, 3);
    const __m128i shifted1 = _mm_srai_epi16(tmp1, 3);
    const __m128i shifted2 = _mm_srai_epi16(tmp2, 3);
    const __m128i shifted3 = _mm_srai_epi16(tmp3, 3);

    // Transpose the two 4x4.
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(shifted0, shifted1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(shifted2, shifted3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(shifted0, shifted1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(shifted2, shifted3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    T0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    T1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    T2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    T3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Add inverse transform to 'ref' and store.
  {
    const __m128i zero = _mm_setzero_si128();
    // Load the reference(s).
    __m128i ref0, ref1, ref2, ref3;
    if (do_two) {
      // Load eight bytes/pixels per line.
      ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
      ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
      ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
      ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
    } else {
      // Load four bytes/pixels per line.
      ref0 = _mm_cvtsi32_si128(*(const int*)&ref[0 * BPS]);
      ref1 = _mm_cvtsi32_si128(*(const int*)&ref[1 * BPS]);
      ref2 = _mm_cvtsi32_si128(*(const int*)&ref[2 * BPS]);
      ref3 = _mm_cvtsi32_si128(*(const int*)&ref[3 * BPS]);
    }
    // Convert to 16b.
    ref0 = _mm_unpacklo_epi8(ref0, zero);
    ref1 = _mm_unpacklo_epi8(ref1, zero);
    ref2 = _mm_unpacklo_epi8(ref2, zero);
    ref3 = _mm_unpacklo_epi8(ref3, zero);
    // Add the inverse transform(s).
    ref0 = _mm_add_epi16(ref0, T0);
    ref1 = _mm_add_epi16(ref1, T1);
    ref2 = _mm_add_epi16(ref2, T2);
    ref3 = _mm_add_epi16(ref3, T3);
    // Unsigned saturate to 8b.
    ref0 = _mm_packus_epi16(ref0, ref0);
    ref1 = _mm_packus_epi16(ref1, ref1);
    ref2 = _mm_packus_epi16(ref2, ref2);
    ref3 = _mm_packus_epi16(ref3, ref3);
    // Store the results.
    if (do_two) {
      // Store eight bytes/pixels per line.
      _mm_storel_epi64((__m128i*)&dst[0 * BPS], ref0);
      _mm_storel_epi64((__m128i*)&dst[1 * BPS], ref1);
      _mm_storel_epi64((__m128i*)&dst[2 * BPS], ref2);
      _mm_storel_epi64((__m128i*)&dst[3 * BPS], ref3);
    } else {
      // Store four bytes/pixels per line.
      *((int32_t *)&dst[0 * BPS]) = _mm_cvtsi128_si32(ref0);
      *((int32_t *)&dst[1 * BPS]) = _mm_cvtsi128_si32(ref1);
      *((int32_t *)&dst[2 * BPS]) = _mm_cvtsi128_si32(ref2);
      *((int32_t *)&dst[3 * BPS]) = _mm_cvtsi128_si32(ref3);
    }
  }
}
	/*
	 * NOTE(review): the function name, return type and all brace-only
	 * lines of this routine are missing from this excerpt; only the
	 * parameter list and body statements are visible.
	 *
	 * What the visible code does: copies n signed 16-bit values from x
	 * into z, widening each one to a signed 32-bit value. Returns
	 * MLIB_FAILURE when n < 1 and MLIB_SUCCESS otherwise.
	 */
	mlib_s32 *z,
	const mlib_s16 *x,
	mlib_s32 n)
	/* Reject empty or negative lengths. */
	if (n < 1)
		return (MLIB_FAILURE);

	mlib_s32 i, ax, az, nstep, n1, n2, n3;
	mlib_s16 *px = (mlib_s16 *)x;
	mlib_s32 *pz = (mlib_s32 *)z;
	__m128i xbuf, zlo, zhi, zero;
	zero = _mm_setzero_si128();

	/* Byte offsets of the source and destination within a 16-byte line. */
	ax = (mlib_addr)x & 15;
	az = (mlib_addr)z & 15;

	/* nstep: number of 16-bit source elements in one 128-bit vector. */
	nstep = 16 / sizeof (mlib_s16);
	/*
	 * n1: leading elements handled one by one until px is 16-byte aligned
	 * (presumably x is at least 2-byte aligned - TODO confirm),
	 * n2: number of whole vectors, n3: trailing elements.
	 */
	n1 = ((16 - ax) & 15) / sizeof (mlib_s16);
	n2 = (n - n1) / nstep;
	n3 = n - n1 - n2 * nstep;

	/* Too short for even one full vector: plain scalar copy. */
	if (n2 < 1) {
		for (i = 0; i < n; i++) {
			*pz++ = *px++;
	} else {
		/* Scalar head: brings px up to the aligned position. */
		for (i = 0; i < n1; i++) {
			*pz++ = *px++;

		/*
		 * The head advanced pz by twice as many bytes as px, so pz is
		 * 16-byte aligned here exactly when az == (2 * ax) mod 16.
		 * In that case aligned stores are used; otherwise the same
		 * loop runs with unaligned stores.
		 */
		if ((ax * 2 & 15) == az) {
			for (i = 0; i < n2; i++) {
				xbuf = _mm_load_si128((__m128i *)px);
				/*
				 * Interleaving with zero puts each 16-bit value in
				 * the upper half of a 32-bit lane; the arithmetic
				 * shift right by 16 then sign-extends it.
				 */
				zlo = _mm_unpacklo_epi16(zero, xbuf);
				zhi = _mm_unpackhi_epi16(zero, xbuf);
				zlo = _mm_srai_epi32(zlo, 16);
				zhi = _mm_srai_epi32(zhi, 16);
				_mm_store_si128((__m128i *)pz, zlo);
				_mm_store_si128((__m128i *)pz + 1, zhi);
				/* nstep 16-bit inputs produce nstep 32-bit outputs. */
				px += nstep;
				pz += nstep;
		} else {
			for (i = 0; i < n2; i++) {
				xbuf = _mm_load_si128((__m128i *)px);
				/* Same sign-extending widening as the aligned loop. */
				zlo = _mm_unpacklo_epi16(zero, xbuf);
				zhi = _mm_unpackhi_epi16(zero, xbuf);
				zlo = _mm_srai_epi32(zlo, 16);
				zhi = _mm_srai_epi32(zhi, 16);
				_mm_storeu_si128((__m128i *)pz, zlo);
				_mm_storeu_si128((__m128i *)pz + 1, zhi);
				px += nstep;
				pz += nstep;

		/* Scalar tail: elements left after the last whole vector. */
		for (i = 0; i < n3; i++) {
			*pz++ = *px++;

	return (MLIB_SUCCESS);
Exemple #5
 // Loads four consecutive 16-bit values from p and zero-extends each one into
 // a 32-bit lane of the member `vec`.
 // NOTE(review): the enclosing class and the closing brace are not part of
 // this excerpt.
 FourColorVals(const uint16_t *p) {
     assert(((uintptr_t)p & 7) == 0);// p must be 8-byte aligned
     vec = _mm_unpacklo_epi16(_mm_loadl_epi64((__m128i*)p),_mm_setzero_si128());
/*
 * Applies the 25 coefficients in ch->m to a 5x5 neighbourhood of 8-bit
 * samples, 16 output samples per inner iteration.
 *
 * For every output sample the weighted sum is converted to float, multiplied
 * by ch->rdiv, offset by ch->bias, replaced by its absolute value when
 * ch->saturate is zero, truncated to an integer and saturated to 0..255.
 *
 * buff is used as a ring of five line buffers spaced bstride bytes apart,
 * starting 16 bytes into buff; they are filled through line_copy8().
 * NOTE(review): line_copy8() is defined elsewhere - presumably it copies one
 * source line and pads 2 samples on each side; confirm. The first two lines
 * are seeded from rows 2 and 1 of the source and, near the bottom, srcp
 * steps backwards, so rows are mirrored at the top and bottom edges.
 *
 * dstp is written with aligned 16-byte stores, so dstp and stride must keep
 * every row 16-byte aligned, and each row must have room for width rounded
 * up to a multiple of 16.
 */
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
               int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *orig = p0, *end = p4;

    line_copy8(p0, srcp + 2 * stride , width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i matrix[25];
    for (int i = 0; i < 25; i++) {
        /* Each 32-bit lane holds (coefficient, 0), so _mm_madd_epi16 against
           a zero-extended sample yields sample * coefficient per lane. */
        matrix[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[i]), zero);
    }

    for (int y = 0; y < height; y++) {
        /* Step forward through the source until two rows from the bottom,
           then step backwards. */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);
        uint8_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };

        for (int x = 0; x < width; x += 16) {
            __m128i sum[4] = { zero, zero, zero, zero };

            for (int i = 0; i < 25; i++) {
                __m128i xmm0, xmm1, xmm2;

                /* Widen 16 bytes to four vectors of four 32-bit values. */
                xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                xmm2 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i]));
                sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i]));

                xmm1 = _mm_unpackhi_epi16(xmm2, zero);
                xmm0 = _mm_unpacklo_epi16(xmm2, zero);
                sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i]));
                sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i]));
            }

            for (int i = 0; i < 4; i++) {
                __m128 sumfp = _mm_cvtepi32_ps(sum[i]);
                sumfp = _mm_mul_ps(sumfp, rdiv);
                sumfp = _mm_add_ps(sumfp, bias);
                if (!ch->saturate) {
                    sumfp = mm_abs_ps(sumfp);
                }
                sum[i] = _mm_cvttps_epi32(sumfp);
            }

            /* 32-bit -> 16-bit (signed saturation) -> 8-bit (unsigned
               saturation). */
            sum[0] = _mm_packs_epi32(sum[0], sum[1]);
            sum[1] = _mm_packs_epi32(sum[2], sum[3]);
            sum[0] = _mm_packus_epi16(sum[0], sum[1]);

            _mm_store_si128((__m128i *)(dstp + x), sum[0]);
        }
        dstp += stride;
        /* Rotate the ring of line buffers by one row. */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
global_sse2_word(int                  queryLength,
                 unsigned short      *profile,
                 const unsigned char *dbSeq,
                 int                  dbLength,
                 unsigned short       gapOpen,
                 unsigned short       gapExtend,
                 unsigned short       ceiling,
                 struct f_struct     *f_str)
  int     i, j;

  int     score;
  int     scale;
  int     temp;
  int     distance;

  int     offset;
  int     position;

  int     cmp;
  int     iter;
  __m128i *pvH;
  __m128i *pvE;

  __m128i vE, vF, vH;
  __m128i vHNext;
  __m128i vFPrev;

  __m128i vGapOpen;
  __m128i vGapExtend;
  __m128i vCeiling;

  __m128i vScale;
  __m128i vScaleAmt;
  __m128i vScaleTmp;

  __m128i vTemp;
  __m128i vNull;

  __m128i *pvScore;

  scale = 0;
  iter = (queryLength + 7) / 8;
  offset = (queryLength - 1) % iter;
  position = 7 - (queryLength - 1) / iter;

  pvH = (__m128i *)f_str->workspace;
  pvE = pvH + iter;

  /* Load gap opening penalty to all elements of a constant */
  vGapOpen = _mm_setzero_si128();	/* transfered from Apple Devel smith_waterman_sse2.c fix */
  vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);
  vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
  vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);

  /* Load gap extension penalty to all elements of a constant */
  vGapExtend = _mm_setzero_si128();	/* transfered from Apple Devel smith_waterman_sse2.c fix */
  vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);
  vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
  vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);

  /* Generate the ceiling before scaling */
  vTemp = _mm_setzero_si128();	/* transfered from Apple Devel smith_waterman_sse2.c fix */
  vTemp = _mm_insert_epi16 (vTemp, ceiling, 0);
  vTemp = _mm_shufflelo_epi16 (vTemp, 0);
  vTemp = _mm_shuffle_epi32 (vTemp, 0);
  vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp);
  vCeiling = _mm_srli_epi16 (vCeiling, 1);
  vCeiling = _mm_subs_epi16 (vCeiling, vTemp);
  vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen);

  vNull = _mm_cmpeq_epi16 (vTemp, vTemp);
  vNull = _mm_slli_epi16 (vNull, 15);
  vScaleAmt = _mm_xor_si128 (vNull, vNull);

  /* Zero out the storage vector */
  vTemp = _mm_adds_epi16 (vNull, vGapOpen);
  for (i = 0; i < iter; i++) {
    _mm_store_si128 (pvH + i, vTemp);
    _mm_store_si128 (pvE + i, vNull);

  /* initialize F */
  vF = vNull;
  vFPrev = vNull;

  /* load and scale H for the next round */
  vTemp = _mm_srli_si128 (vGapOpen, 14);
  vH = _mm_load_si128 (pvH + iter - 1);
  vH = _mm_adds_epi16 (vH, vTemp);

  for (i = 0; i < dbLength; ++i) {
    /* fetch first data asap. */
    pvScore = (__m128i *) profile + dbSeq[i] * iter;

    vF = vNull;

    vH = _mm_max_epi16 (vH, vFPrev);
    for (j = 0; j < iter; j++) {
      /* correct H from the previous columns F */
      vHNext = _mm_load_si128 (pvH + j);
      vHNext = _mm_max_epi16 (vHNext, vFPrev);

      /* load and correct E value */
      vE = _mm_load_si128 (pvE + j);
      vTemp = _mm_subs_epi16 (vHNext, vGapOpen);
      vE = _mm_max_epi16 (vE, vTemp);
      _mm_store_si128 (pvE + j, vE);

      /* add score to vH */
      vH = _mm_adds_epi16 (vH, *pvScore++);

      /* get max from vH, vE and vF */
      vH = _mm_max_epi16 (vH, vE);
      vH = _mm_max_epi16 (vH, vF);
      _mm_store_si128 (pvH + j, vH);

      /* update vF value */
      vH = _mm_subs_epi16 (vH, vGapOpen);
      vF = _mm_max_epi16 (vF, vH);

      /* load the next h values */
      vH = vHNext;

    /* check if we need to scale before the next round */
    vTemp = _mm_cmpgt_epi16 (vF, vCeiling);
    cmp  = _mm_movemask_epi8 (vTemp);

    /* broadcast F values */
    vF = _mm_xor_si128 (vF, vNull);

    vTemp  = _mm_slli_si128 (vF, 2);
    vTemp = _mm_subs_epu16 (vTemp, vScaleAmt);
    vF = max_epu16 (vF, vTemp);

    vTemp  = _mm_slli_si128 (vF, 4);
    vScaleTmp = _mm_slli_si128 (vScaleAmt, 2);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);

    vTemp = _mm_slli_si128 (vScaleTmp, 4);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp);
    vTemp  = _mm_slli_si128 (vF, 8);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);

    /* scale if necessary */
    if (cmp != 0x0000) {
      __m128i vScale1;
      __m128i vScale2;

      vScale = _mm_slli_si128 (vF, 2);
      vScale = _mm_subs_epu16 (vScale, vGapOpen);
      vScale = _mm_subs_epu16 (vScale, vScaleAmt);

      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vScale, vTemp);
      vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp);
      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vTemp, vScale);
      vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp);

      /* rescale the previous F */
      vF = _mm_subs_epu16 (vF, vScale);

      /* check if we can continue in signed 16-bits */
      vTemp = _mm_xor_si128 (vF, vNull);
      vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling);
      cmp  = _mm_movemask_epi8 (vTemp);
      if (cmp != 0x0000) {
        return OVERFLOW_SCORE;

      vTemp   = _mm_adds_epi16 (vCeiling, vCeiling);
      vScale1 = _mm_subs_epu16 (vScale, vTemp);
      vScale2 = _mm_subs_epu16 (vScale, vScale1);

      /* scale all the vectors */
      for (j = 0; j < iter; j++) {
        /* load H and E */
        vH = _mm_load_si128 (pvH + j);
        vE = _mm_load_si128 (pvE + j);

        /* get max from vH, vE and vF */
        vH = _mm_subs_epi16 (vH, vScale1);
        vH = _mm_subs_epi16 (vH, vScale2);
        vE = _mm_subs_epi16 (vE, vScale1);
        vE = _mm_subs_epi16 (vE, vScale2);

        /* save the H and E */
        _mm_store_si128 (pvH + j, vH);
        _mm_store_si128 (pvE + j, vE);

      vScale = vScaleAmt;
      for (j = 0; j < position; ++j) {
        vScale = _mm_slli_si128 (vScale, 2);

      /* calculate the final scaling amount */
      vTemp   = _mm_xor_si128 (vTemp, vTemp);
      vScale1 = _mm_unpacklo_epi16 (vScale, vTemp);
      vScale2 = _mm_unpackhi_epi16 (vScale, vTemp);
      vScale  = _mm_add_epi32 (vScale1, vScale2);
      vTemp = _mm_srli_si128 (vScale, 8);
      vScale = _mm_add_epi32 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 4);
      vScale = _mm_add_epi32 (vScale, vTemp);
      scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0);
      temp  = (int) (unsigned short) _mm_extract_epi16 (vScale, 1);
      scale = scale + (temp << 16);

    /* scale the F value for the next round */
    vFPrev = _mm_slli_si128 (vF, 2);
    vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt);
    vFPrev = _mm_xor_si128 (vFPrev, vNull);

    /* load and scale H for the next round */
    vH = _mm_load_si128 (pvH + iter - 1);
    vH = _mm_xor_si128 (vH, vNull);
    vH = _mm_slli_si128 (vH, 2);
    vH = _mm_subs_epu16 (vH, vScaleAmt);
    vH = _mm_insert_epi16 (vH, gapOpen, 0);
    vH = _mm_xor_si128 (vH, vNull);

  vH = _mm_load_si128 (pvH + offset);
  vH = _mm_max_epi16 (vH, vFPrev);
  for (j = 0; j < position; ++j) {
    vH = _mm_slli_si128 (vH, 2);
  score = (int) (signed short) _mm_extract_epi16 (vH, 7);
  score = score + SHORT_BIAS;

  /* return largest score */
  distance = (queryLength + dbLength) * gapExtend;
  score = score - (gapOpen * 2) - distance + scale;

  return score;
Exemple #8
/*
 * Convert eight packed pixels of three 16-bit words each (48 bytes) into
 * three planes of eight signed 16-bit B, G and R values.
 *
 * lw48_ptr   : source; 48 bytes are read with unaligned loads. Within each
 *              pixel the words are treated as Y, Cb, Cr in that order.
 * rgb_buffer : destination; 48 bytes are written with aligned stores
 *              (_mm_store_si128), so it has to be 16-byte aligned.
 *              Layout: bytes 0..15 = B, 16..31 = G, 32..47 = R, eight
 *              16-bit values per plane in pixel order.
 *
 * All arithmetic is fixed point: products are accumulated in 32 bits,
 * rounded by adding 1<<20 and shifted right by 21. The results are packed
 * with signed 16-bit saturation only; no clamp to a narrower range is done
 * here.
 *
 * NOTE(review): the brace-only lines of this function (and the closing "};"
 * of LW48_SHUFFLE) are not present in this copy of the source.
 */
static LW_FORCEINLINE void fill_rgb_buffer_sse41( BYTE *rgb_buffer, BYTE *lw48_ptr )
    /* Per-lane constants; the tables are loaded below with aligned loads
     * (LW_ALIGN(16) presumably provides the alignment - defined elsewhere). */
    static const USHORT LW_ALIGN(16) PW_32768[8]       = { 32768, 32768, 32768, 32768, 32768, 32768, 32768, 32768 };
    static const short  LW_ALIGN(16) PW_28672[8]       = { 28672, 28672, 28672, 28672, 28672, 28672, 28672, 28672 };
    static const short  LW_ALIGN(16) PW_9539[8]        = {  9539,  9539,  9539,  9539,  9539,  9539,  9539,  9539 };
    static const short  LW_ALIGN(16) PW_13074[8]       = { 13074, 13074, 13074, 13074, 13074, 13074, 13074, 13074 };
    static const short  LW_ALIGN(16) PW_16531[8]       = { 16531, 16531, 16531, 16531, 16531, 16531, 16531, 16531 };
    static const short  LW_ALIGN(16) PW_M3203_M6808[8] = { -3203, -6808, -3203, -6808, -3203, -6808, -3203, -6808 };
    static const int    LW_ALIGN(16) PD_1_20[4]        = { (1<<20), (1<<20), (1<<20), (1<<20) };
    /* Byte-shuffle masks that put the eight words gathered by the blends
     * below back into pixel order (one row per component). */
    static const char   LW_ALIGN(16) LW48_SHUFFLE[3][16] = {
        { 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15, 4, 5, 10, 11 },
        { 2, 3, 8, 9, 14, 15, 4, 5, 10, 11, 0, 1, 6, 7, 12, 13 },
        { 4, 5, 10, 11, 0, 1, 6, 7, 12, 13, 2, 3, 8, 9, 14, 15 }
    __m128i x0, x1, x2, x3, x4, x5, x6, x7;
    /* Load the 24 interleaved 16-bit words (8 pixels x 3 components). */
    x5 = _mm_loadu_si128((__m128i *)(lw48_ptr +  0));
    x6 = _mm_loadu_si128((__m128i *)(lw48_ptr + 16));
    x7 = _mm_loadu_si128((__m128i *)(lw48_ptr + 32));

    /* De-interleave: each blend mask selects every third word, so x0, x1 and
     * x2 end up holding the first, second and third word of every pixel
     * (still in scrambled lane order). */
    x0 = _mm_blend_epi16(x5, x6, 0x80+0x10+0x02);
    x0 = _mm_blend_epi16(x0, x7, 0x20+0x04);

    x1 = _mm_blend_epi16(x5, x6, 0x20+0x04);
    x1 = _mm_blend_epi16(x1, x7, 0x40+0x08+0x01);

    x2 = _mm_blend_epi16(x5, x6, 0x40+0x08+0x01);
    x2 = _mm_blend_epi16(x2, x7, 0x80+0x10+0x02);

    /* Restore pixel order within each component register. */
    x0 = _mm_shuffle_epi8(x0, _mm_load_si128((__m128i*)LW48_SHUFFLE[0])); /* Y  */
    x1 = _mm_shuffle_epi8(x1, _mm_load_si128((__m128i*)LW48_SHUFFLE[1])); /* Cb */
    x2 = _mm_shuffle_epi8(x2, _mm_load_si128((__m128i*)LW48_SHUFFLE[2])); /* Cr */

    /* Re-bias the unsigned 16-bit samples to signed (value - 32768) because
     * the multiplies used below are signed. */
    x0 = _mm_sub_epi16(x0, _mm_load_si128((__m128i*)PW_32768));
    x1 = _mm_sub_epi16(x1, _mm_load_si128((__m128i*)PW_32768));
    x2 = _mm_sub_epi16(x2, _mm_load_si128((__m128i*)PW_32768));

    /* y_tmp = (y - 4096) * 9539 */
    /*       = ((y - 32768) + (32768 - 4096)) * 9539 */
    /*       = (y - 32768) * 9539 + 28672 * 9539 */
    /* Interleaving with 28672 lets a single madd produce both terms as one
     * 32-bit sum: x3 holds y_tmp for pixels 0..3, x4 for pixels 4..7. */
    x3 = _mm_unpacklo_epi16(x0, _mm_load_si128((__m128i*)PW_28672));
    x4 = _mm_unpackhi_epi16(x0, _mm_load_si128((__m128i*)PW_28672));
    x3 = _mm_madd_epi16(x3, _mm_load_si128((__m128i*)PW_9539));
    x4 = _mm_madd_epi16(x4, _mm_load_si128((__m128i*)PW_9539));

    /* G = ((y_tmp + ((cb-32768) * -3203) + ((cr-32768) * -6808)) + (1<<20)) >> 21 */
    /* (cb, cr) pairs are interleaved so one madd yields both chroma terms. */
    x5 = _mm_unpacklo_epi16(x1, x2);
    x6 = _mm_unpackhi_epi16(x1, x2);
    x5 = _mm_madd_epi16(x5, _mm_load_si128((__m128i*)PW_M3203_M6808));
    x6 = _mm_madd_epi16(x6, _mm_load_si128((__m128i*)PW_M3203_M6808));
    x5 = _mm_add_epi32(x5, x3);
    x6 = _mm_add_epi32(x6, x4);
    x5 = _mm_add_epi32(x5, _mm_load_si128((__m128i*)PD_1_20));
    x6 = _mm_add_epi32(x6, _mm_load_si128((__m128i*)PD_1_20));
    x5 = _mm_srai_epi32(x5, 21);
    x6 = _mm_srai_epi32(x6, 21);
    x5 = _mm_packs_epi32(x5, x6);
    /* G plane goes to the middle 16 bytes of the output. */
    _mm_store_si128((__m128i*)(rgb_buffer + 16), x5);

    /* R = ((y_tmp + ((cr-32768) * 13074) + (1<<20)) >> 21 */
    /* mullo/mulhi give the low and high halves of the signed 16x16 product;
     * interleaving them rebuilds the full 32-bit products. */
    x0 = _mm_mullo_epi16(x2, _mm_load_si128((__m128i*)PW_13074));
    x7 = _mm_mulhi_epi16(x2, _mm_load_si128((__m128i*)PW_13074));
    x6 = _mm_unpacklo_epi16(x0, x7);
    x7 = _mm_unpackhi_epi16(x0, x7);
    x6 = _mm_add_epi32(x6, x3);
    x7 = _mm_add_epi32(x7, x4);
    x6 = _mm_add_epi32(x6, _mm_load_si128((__m128i*)PD_1_20));
    x7 = _mm_add_epi32(x7, _mm_load_si128((__m128i*)PD_1_20));
    x6 = _mm_srai_epi32(x6, 21);
    x7 = _mm_srai_epi32(x7, 21);
    x6 = _mm_packs_epi32(x6, x7);
    /* R plane goes to the last 16 bytes of the output. */
    _mm_store_si128((__m128i*)(rgb_buffer + 32), x6);

    /* B = ((y_tmp + ((cb-32768) * 16531) + (1<<20)) >> 21 */
    /* Same 32-bit product reconstruction as for R, using Cb. */
    x2 = _mm_mullo_epi16(x1, _mm_load_si128((__m128i*)PW_16531));
    x7 = _mm_mulhi_epi16(x1, _mm_load_si128((__m128i*)PW_16531));
    x0 = _mm_unpacklo_epi16(x2, x7);
    x7 = _mm_unpackhi_epi16(x2, x7);
    x0 = _mm_add_epi32(x0, x3);
    x7 = _mm_add_epi32(x7, x4);
    x0 = _mm_add_epi32(x0, _mm_load_si128((__m128i*)PD_1_20));
    x7 = _mm_add_epi32(x7, _mm_load_si128((__m128i*)PD_1_20));
    x0 = _mm_srai_epi32(x0, 21);
    x7 = _mm_srai_epi32(x7, 21);
    x7 = _mm_packs_epi32(x0, x7);
    /* B plane goes to the first 16 bytes of the output. */
    _mm_store_si128((__m128i*)(rgb_buffer +  0), x7);
// Convert planar Y/U/V (one U and one V byte per Y byte) into packed RGB,
// writing rows from pDstBegin up to pDstEnd.
//
//   pDstBegin, pDstEnd : first destination row and the row position at which
//                        the loop stops; the row loop ends on `p != pDstEnd`,
//                        so their distance must be an exact multiple of
//                        scbStride.
//   pYBegin/pUBegin/pVBegin : source planes; they are read sequentially and
//                        never rewound per row.
//   cbWidth            : bytes of destination written per row.
//   scbStride          : signed byte step between destination rows.
//
// C supplies the conversion coefficients (C::Y2RGB, C::V2R, ...) and T the
// destination pixel layout (T::BYPP, T::R/G/B); both are presumably template
// parameters declared on a line that is not part of this copy.
// _mm_set2_epi16_shift and _mm_setone_si128 are helpers defined elsewhere;
// from their use here they presumably build a repeated pair of 16-bit
// fixed-point coefficients scaled by 2^shift, and an all-ones register.
//
// NOTE(review): brace-only lines and the #endif lines matching the two
// `#ifdef __SSSE3__` are missing from this copy of the source.
void tuned_ConvertULY4ToRGB(uint8_t *pDstBegin, uint8_t *pDstEnd, const uint8_t *pYBegin, const uint8_t *pUBegin, const uint8_t *pVBegin, size_t cbWidth, ssize_t scbStride)
	// Number of fractional bits of the fixed-point coefficients.
	const int shift = 13;

	// Coefficient pairs, laid out to match the (Y, 0xff) and (U, V) word
	// pairs fed to _mm_madd_epi16 below. The first xy2rgb term presumably
	// folds the -16 luma offset and the +0.5 rounding into the 0xff lane.
	__m128i xy2rgb = _mm_set2_epi16_shift((-16 * C::Y2RGB + 0.5) / 0xff, C::Y2RGB, shift);
	__m128i vu2r = _mm_set2_epi16_shift(C::V2R, 0, shift);
	__m128i vu2g = _mm_set2_epi16_shift(C::V2G, C::U2G, shift);
	__m128i vu2b = _mm_set2_epi16_shift(0, C::U2B, shift);

	auto y = pYBegin;
	auto u = pUBegin;
	auto v = pVBegin;

	for (auto p = pDstBegin; p != pDstEnd; p += scbStride)
		auto pp = p;

		// Vector path: four pixels per iteration. The bound keeps 16 bytes
		// available in the row because every branch below stores a full
		// 16-byte register, even for 3-byte pixel formats.
		// NOTE(review): for cbWidth < 16, `p + cbWidth - 16` points before
		// the start of the row - confirm callers never pass such widths.
		for (; pp <= p + cbWidth - 16; pp += T::BYPP * 4)
			// Load four consecutive samples from each plane.
			__m128i yy = _mm_cvtsi32_si128(*(const int *)y);
			__m128i uu = _mm_cvtsi32_si128(*(const int *)u);
			__m128i vv = _mm_cvtsi32_si128(*(const int *)v);

			__m128i xy = _mm_unpacklo_epi8(_mm_unpacklo_epi8(yy, _mm_setone_si128()), _mm_setzero_si128()); // 00 ff 00 Y3 00 ff 00 Y2 00 ff 00 Y1 00 ff 00 Y0
			__m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0
			// Centre the chroma samples around zero.
			vu = _mm_sub_epi16(vu, _mm_set1_epi16(128));

			// Luma contribution, shared by all three channels.
			__m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb);

			// Adds the chroma contribution for one channel, drops the
			// fractional bits and saturates down to unsigned bytes.
			auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i {
				__m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb));
				rgb = _mm_srai_epi32(rgb, shift);
				rgb = _mm_packs_epi32(rgb, rgb);
				rgb = _mm_packus_epi16(rgb, rgb);
				return rgb;
			__m128i rr = xyuv2rgb(vu2r);
			__m128i gg = xyuv2rgb(vu2g);
			__m128i bb = xyuv2rgb(vu2b);

			// Interleave the channel bytes in the order required by T.
			if (std::is_same<T, CBGRAColorOrder>::value)
				// Fourth byte of each pixel is set to 0xff.
				__m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128()));
				_mm_storeu_si128((__m128i *)pp, bgrx);
#ifdef __SSSE3__
			else if (std::is_same<T, CBGRColorOrder>::value)
				// Build 4-byte pixels, then drop every fourth byte with a
				// byte shuffle; the upper four bytes of the store are zero.
				__m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, rr));
				__m128i bgr = _mm_shuffle_epi8(bgrx, _mm_set_epi8(-1, -1, -1, -1, 14, 13, 12, 10, 9, 8, 6, 5, 4, 2, 1, 0));
				_mm_storeu_si128((__m128i *)pp, bgr);
			else if (std::is_same<T, CARGBColorOrder>::value)
				// NOTE(review): the leading byte of each pixel receives R
				// here, whereas the BGRA branch writes 0xff in the
				// corresponding position - confirm this is intended.
				__m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), _mm_unpacklo_epi8(gg, bb));
				_mm_storeu_si128((__m128i *)pp, xrgb);
#ifdef __SSSE3__
			else if (std::is_same<T, CRGBColorOrder>::value)
				// Build 4-byte pixels, then drop the first byte of each
				// with a byte shuffle; the upper four bytes of the store
				// are zero.
				__m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(_mm_setone_si128(), rr), _mm_unpacklo_epi8(gg, bb));
				__m128i rgb = _mm_shuffle_epi8(xrgb, _mm_set_epi8(-1, -1, -1, -1, 15, 14, 13, 11, 10, 9, 7, 6, 5, 3, 2, 1));
				_mm_storeu_si128((__m128i *)pp, rgb);

			y += 4;
			u += 4;
			v += 4;

		// Scalar tail: finish the row one pixel at a time, using the same
		// arithmetic on lane 0 only.
		for (; pp < p + cbWidth; pp += T::BYPP)
			// Low word = Y, next word = 0x00ff, matching the vector layout.
			__m128i xy = _mm_cvtsi32_si128(*y | 0x00ff0000);
			__m128i uu = _mm_cvtsi32_si128(*u);
			__m128i vv = _mm_cvtsi32_si128(*v);

			__m128i vu = _mm_unpacklo_epi8(_mm_unpacklo_epi8(uu, vv), _mm_setzero_si128()); // 00 V3 00 U3 00 V2 00 U2 00 V1 00 U1 00 V0 00 U0
			vu = _mm_sub_epi16(vu, _mm_set1_epi16(128));

			__m128i rgbtmp = _mm_madd_epi16(xy, xy2rgb);

			// Same per-channel helper as in the vector path.
			auto xyuv2rgb = [rgbtmp, vu, shift](__m128i vu2rgb) -> __m128i {
				__m128i rgb = _mm_add_epi32(rgbtmp, _mm_madd_epi16(vu, vu2rgb));
				rgb = _mm_srai_epi32(rgb, shift);
				rgb = _mm_packs_epi32(rgb, rgb);
				rgb = _mm_packus_epi16(rgb, rgb);
				return rgb;
			__m128i rr = xyuv2rgb(vu2r);
			__m128i gg = xyuv2rgb(vu2g);
			__m128i bb = xyuv2rgb(vu2b);

			if (std::is_same<T, CBGRAColorOrder>::value)
				__m128i bgrx = _mm_unpacklo_epi16(_mm_unpacklo_epi8(bb, gg), _mm_unpacklo_epi8(rr, _mm_setone_si128()));
				*(uint32_t *)pp = _mm_cvtsi128_si32(bgrx);
			else if (std::is_same<T, CARGBColorOrder>::value)
				// NOTE(review): as in the vector path, the leading byte is
				// R rather than 0xff - confirm this is intended.
				__m128i xrgb = _mm_unpacklo_epi16(_mm_unpacklo_epi8(rr, rr), _mm_unpacklo_epi8(gg, bb));
				*(uint32_t *)pp = _mm_cvtsi128_si32(xrgb);
			else if (std::is_same<T, CBGRColorOrder>::value || std::is_same<T, CRGBColorOrder>::value)
				// 3-byte formats: write each channel at the offset T gives.
				*(pp + T::B) = (uint8_t)_mm_cvtsi128_si32(bb);
				*(pp + T::G) = (uint8_t)_mm_cvtsi128_si32(gg);
				*(pp + T::R) = (uint8_t)_mm_cvtsi128_si32(rr);

			y += 1;
			u += 1;
			v += 1;
// Forward 4x4 DCT using SSE2.
//
//   input  : four rows of four int16_t samples; consecutive rows are
//            `pitch >> 1` elements apart (so pitch is presumably in bytes).
//   output : receives the 16 resulting coefficients as two unaligned
//            16-byte stores (output[0..7] and output[8..15]).
//   pitch  : row pitch of input, halved to obtain the element stride.
//
// cospi_*_64, DCT_CONST_ROUNDING, DCT_CONST_BITS and pair_set_epi16 are
// defined elsewhere.
//
// NOTE(review): the lines holding only braces for the nested scopes and the
// closing braces of the loop and function are missing from this copy.
void vp9_short_fdct4x4_sse2(int16_t *input, int16_t *output, int pitch) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we transpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  const int stride = pitch >> 1;
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Helper vectors for the "bump the first sample if non-zero" step below.
  const __m128i k__nonzero_bias_a = _mm_setr_epi16(0, 1, 1, 1, 1, 1, 1, 1);
  const __m128i k__nonzero_bias_b = _mm_setr_epi16(1, 0, 0, 0, 0, 0, 0, 0);
  const __m128i kOne = _mm_set1_epi16(1);
  __m128i in0, in1, in2, in3;
  // Load inputs.
    // Each load fetches 64 bits, i.e. one row of four int16_t samples, into
    // the low half of the register.
    in0  = _mm_loadl_epi64((const __m128i *)(input +  0 * stride));
    in1  = _mm_loadl_epi64((const __m128i *)(input +  1 * stride));
    in2  = _mm_loadl_epi64((const __m128i *)(input +  2 * stride));
    in3  = _mm_loadl_epi64((const __m128i *)(input +  3 * stride));
    // x = x << 4
    in0 = _mm_slli_epi16(in0, 4);
    in1 = _mm_slli_epi16(in1, 4);
    in2 = _mm_slli_epi16(in2, 4);
    in3 = _mm_slli_epi16(in3, 4);
    // if (i == 0 && input[0]) input[0] += 1;
      // The mask will only contain whether the first value is zero, all
      // other comparisons will fail as something shifted by 4 (above << 4)
      // can never be equal to one. To increment in the non-zero case, we
      // add the mask and one for the first element:
      //   - if zero, mask = -1, v = v - 1 + 1 = v
      //   - if non-zero, mask = 0, v = v + 0 + 1 = v + 1
      __m128i mask = _mm_cmpeq_epi16(in0, k__nonzero_bias_a);
      in0 = _mm_add_epi16(in0, mask);
      in0 = _mm_add_epi16(in0, k__nonzero_bias_b);
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // Transform 1/2: Add/subtract
    const __m128i r0 = _mm_add_epi16(in0, in3);
    const __m128i r1 = _mm_add_epi16(in1, in2);
    const __m128i r2 = _mm_sub_epi16(in1, in2);
    const __m128i r3 = _mm_sub_epi16(in0, in3);
    // Transform 1/2: Interleave to do the multiply by constants which gets us
    //                into 32 bits.
    // Only the low four lanes of r0..r3 are consumed (unpacklo).
    const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
    const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
    const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
    const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
    const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
    const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
    // Round and scale the 32-bit products back down:
    // (u + DCT_CONST_ROUNDING) >> DCT_CONST_BITS.
    const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
    const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
    const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
    const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
    const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
    const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
    const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
    const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
    // Combine and transpose
    // (packs narrows to 16 bits with signed saturation)
    const __m128i res0 = _mm_packs_epi32(w0, w2);
    const __m128i res1 = _mm_packs_epi32(w4, w6);
    // 00 01 02 03 20 21 22 23
    // 10 11 12 13 30 31 32 33
    const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
    const __m128i tr0_1 = _mm_unpackhi_epi16(res0, res1);
    // 00 10 01 11 02 12 03 13
    // 20 30 21 31 22 32 23 33
    in0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
    in2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
    // 00 10 20 30 01 11 21 31      in0 contains 0 followed by 1
    // 02 12 22 32 03 13 23 33      in2 contains 2 followed by 3
    if (0 == pass) {
      // Extract values in the high part for second pass as transform code
      // only uses the first four values.
      in1 = _mm_unpackhi_epi64(in0, in0);
      in3 = _mm_unpackhi_epi64(in2, in2);
    } else {
      // Post-condition output and store it (v + 1) >> 2, taking advantage
      // of the fact 1/3 are stored just after 0/2.
      __m128i out01 = _mm_add_epi16(in0, kOne);
      __m128i out23 = _mm_add_epi16(in2, kOne);
      out01 = _mm_srai_epi16(out01, 2);
      out23 = _mm_srai_epi16(out23, 2);
      // Rows 0-1 go to output[0..7], rows 2-3 to output[8..15].
      _mm_storeu_si128((__m128i *)(output + 0 * 4), out01);
      _mm_storeu_si128((__m128i *)(output + 2 * 4), out23);
void vp9_short_fdct16x16_sse2(int16_t *input, int16_t *output, int pitch) {
  // The 2D transform is done with two passes which are actually pretty
  // similar. In the first one, we transform the columns and transpose
  // the results. In the second one, we transform the rows. To achieve that,
  // as the first pass results are transposed, we tranpose the columns (that
  // is the transposed rows) and transpose the results (so that it goes back
  // in normal/row positions).
  const int stride = pitch >> 1;
  int pass;
  // We need an intermediate buffer between passes.
  int16_t intermediate[256];
  int16_t *in = input;
  int16_t *out = intermediate;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m24_m08 = pair_set_epi16(-cospi_24_64, -cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__cospi_p30_p02 = pair_set_epi16(cospi_30_64, cospi_2_64);
  const __m128i k__cospi_p14_p18 = pair_set_epi16(cospi_14_64, cospi_18_64);
  const __m128i k__cospi_m02_p30 = pair_set_epi16(-cospi_2_64, cospi_30_64);
  const __m128i k__cospi_m18_p14 = pair_set_epi16(-cospi_18_64, cospi_14_64);
  const __m128i k__cospi_p22_p10 = pair_set_epi16(cospi_22_64, cospi_10_64);
  const __m128i k__cospi_p06_p26 = pair_set_epi16(cospi_6_64, cospi_26_64);
  const __m128i k__cospi_m10_p22 = pair_set_epi16(-cospi_10_64, cospi_22_64);
  const __m128i k__cospi_m26_p06 = pair_set_epi16(-cospi_26_64, cospi_6_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  const __m128i kOne = _mm_set1_epi16(1);
  // Do the two transform/transpose passes
  for (pass = 0; pass < 2; ++pass) {
    // We process eight columns (transposed rows in second pass) at a time.
    int column_start;
    for (column_start = 0; column_start < 16; column_start += 8) {
      __m128i in00, in01, in02, in03, in04, in05, in06, in07;
      __m128i in08, in09, in10, in11, in12, in13, in14, in15;
      __m128i input0, input1, input2, input3, input4, input5, input6, input7;
      __m128i step1_0, step1_1, step1_2, step1_3;
      __m128i step1_4, step1_5, step1_6, step1_7;
      __m128i step2_1, step2_2, step2_3, step2_4, step2_5, step2_6;
      __m128i step3_0, step3_1, step3_2, step3_3;
      __m128i step3_4, step3_5, step3_6, step3_7;
      __m128i res00, res01, res02, res03, res04, res05, res06, res07;
      __m128i res08, res09, res10, res11, res12, res13, res14, res15;
      // Load and pre-condition input.
      if (0 == pass) {
        in00  = _mm_loadu_si128((const __m128i *)(in +  0 * stride));
        in01  = _mm_loadu_si128((const __m128i *)(in +  1 * stride));
        in02  = _mm_loadu_si128((const __m128i *)(in +  2 * stride));
        in03  = _mm_loadu_si128((const __m128i *)(in +  3 * stride));
        in04  = _mm_loadu_si128((const __m128i *)(in +  4 * stride));
        in05  = _mm_loadu_si128((const __m128i *)(in +  5 * stride));
        in06  = _mm_loadu_si128((const __m128i *)(in +  6 * stride));
        in07  = _mm_loadu_si128((const __m128i *)(in +  7 * stride));
        in08  = _mm_loadu_si128((const __m128i *)(in +  8 * stride));
        in09  = _mm_loadu_si128((const __m128i *)(in +  9 * stride));
        in10  = _mm_loadu_si128((const __m128i *)(in + 10 * stride));
        in11  = _mm_loadu_si128((const __m128i *)(in + 11 * stride));
        in12  = _mm_loadu_si128((const __m128i *)(in + 12 * stride));
        in13  = _mm_loadu_si128((const __m128i *)(in + 13 * stride));
        in14  = _mm_loadu_si128((const __m128i *)(in + 14 * stride));
        in15  = _mm_loadu_si128((const __m128i *)(in + 15 * stride));
        // x = x << 2
        in00 = _mm_slli_epi16(in00, 2);
        in01 = _mm_slli_epi16(in01, 2);
        in02 = _mm_slli_epi16(in02, 2);
        in03 = _mm_slli_epi16(in03, 2);
        in04 = _mm_slli_epi16(in04, 2);
        in05 = _mm_slli_epi16(in05, 2);
        in06 = _mm_slli_epi16(in06, 2);
        in07 = _mm_slli_epi16(in07, 2);
        in08 = _mm_slli_epi16(in08, 2);
        in09 = _mm_slli_epi16(in09, 2);
        in10 = _mm_slli_epi16(in10, 2);
        in11 = _mm_slli_epi16(in11, 2);
        in12 = _mm_slli_epi16(in12, 2);
        in13 = _mm_slli_epi16(in13, 2);
        in14 = _mm_slli_epi16(in14, 2);
        in15 = _mm_slli_epi16(in15, 2);
      } else {
        in00  = _mm_loadu_si128((const __m128i *)(in +  0 * 16));
        in01  = _mm_loadu_si128((const __m128i *)(in +  1 * 16));
        in02  = _mm_loadu_si128((const __m128i *)(in +  2 * 16));
        in03  = _mm_loadu_si128((const __m128i *)(in +  3 * 16));
        in04  = _mm_loadu_si128((const __m128i *)(in +  4 * 16));
        in05  = _mm_loadu_si128((const __m128i *)(in +  5 * 16));
        in06  = _mm_loadu_si128((const __m128i *)(in +  6 * 16));
        in07  = _mm_loadu_si128((const __m128i *)(in +  7 * 16));
        in08  = _mm_loadu_si128((const __m128i *)(in +  8 * 16));
        in09  = _mm_loadu_si128((const __m128i *)(in +  9 * 16));
        in10  = _mm_loadu_si128((const __m128i *)(in + 10 * 16));
        in11  = _mm_loadu_si128((const __m128i *)(in + 11 * 16));
        in12  = _mm_loadu_si128((const __m128i *)(in + 12 * 16));
        in13  = _mm_loadu_si128((const __m128i *)(in + 13 * 16));
        in14  = _mm_loadu_si128((const __m128i *)(in + 14 * 16));
        in15  = _mm_loadu_si128((const __m128i *)(in + 15 * 16));
        // x = (x + 1) >> 2
        in00 = _mm_add_epi16(in00, kOne);
        in01 = _mm_add_epi16(in01, kOne);
        in02 = _mm_add_epi16(in02, kOne);
        in03 = _mm_add_epi16(in03, kOne);
        in04 = _mm_add_epi16(in04, kOne);
        in05 = _mm_add_epi16(in05, kOne);
        in06 = _mm_add_epi16(in06, kOne);
        in07 = _mm_add_epi16(in07, kOne);
        in08 = _mm_add_epi16(in08, kOne);
        in09 = _mm_add_epi16(in09, kOne);
        in10 = _mm_add_epi16(in10, kOne);
        in11 = _mm_add_epi16(in11, kOne);
        in12 = _mm_add_epi16(in12, kOne);
        in13 = _mm_add_epi16(in13, kOne);
        in14 = _mm_add_epi16(in14, kOne);
        in15 = _mm_add_epi16(in15, kOne);
        in00 = _mm_srai_epi16(in00, 2);
        in01 = _mm_srai_epi16(in01, 2);
        in02 = _mm_srai_epi16(in02, 2);
        in03 = _mm_srai_epi16(in03, 2);
        in04 = _mm_srai_epi16(in04, 2);
        in05 = _mm_srai_epi16(in05, 2);
        in06 = _mm_srai_epi16(in06, 2);
        in07 = _mm_srai_epi16(in07, 2);
        in08 = _mm_srai_epi16(in08, 2);
        in09 = _mm_srai_epi16(in09, 2);
        in10 = _mm_srai_epi16(in10, 2);
        in11 = _mm_srai_epi16(in11, 2);
        in12 = _mm_srai_epi16(in12, 2);
        in13 = _mm_srai_epi16(in13, 2);
        in14 = _mm_srai_epi16(in14, 2);
        in15 = _mm_srai_epi16(in15, 2);
      in += 8;
      // Calculate input for the first 8 results.
        input0 = _mm_add_epi16(in00, in15);
        input1 = _mm_add_epi16(in01, in14);
        input2 = _mm_add_epi16(in02, in13);
        input3 = _mm_add_epi16(in03, in12);
        input4 = _mm_add_epi16(in04, in11);
        input5 = _mm_add_epi16(in05, in10);
        input6 = _mm_add_epi16(in06, in09);
        input7 = _mm_add_epi16(in07, in08);
      // Calculate input for the next 8 results.
        step1_0 = _mm_sub_epi16(in07, in08);
        step1_1 = _mm_sub_epi16(in06, in09);
        step1_2 = _mm_sub_epi16(in05, in10);
        step1_3 = _mm_sub_epi16(in04, in11);
        step1_4 = _mm_sub_epi16(in03, in12);
        step1_5 = _mm_sub_epi16(in02, in13);
        step1_6 = _mm_sub_epi16(in01, in14);
        step1_7 = _mm_sub_epi16(in00, in15);
      // Work on the first eight values; fdct8_1d(input, even_results);
        // Add/substract
        const __m128i q0 = _mm_add_epi16(input0, input7);
        const __m128i q1 = _mm_add_epi16(input1, input6);
        const __m128i q2 = _mm_add_epi16(input2, input5);
        const __m128i q3 = _mm_add_epi16(input3, input4);
        const __m128i q4 = _mm_sub_epi16(input3, input4);
        const __m128i q5 = _mm_sub_epi16(input2, input5);
        const __m128i q6 = _mm_sub_epi16(input1, input6);
        const __m128i q7 = _mm_sub_epi16(input0, input7);
        // Work on first four results
          // Add/substract
          const __m128i r0 = _mm_add_epi16(q0, q3);
          const __m128i r1 = _mm_add_epi16(q1, q2);
          const __m128i r2 = _mm_sub_epi16(q1, q2);
          const __m128i r3 = _mm_sub_epi16(q0, q3);
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
          const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
          const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
          const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
          const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
          // Combine
          res00 = _mm_packs_epi32(w0, w1);
          res08 = _mm_packs_epi32(w2, w3);
          res04 = _mm_packs_epi32(w4, w5);
          res12 = _mm_packs_epi32(w6, w7);
        // Work on next four results
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
          const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
          const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
          const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
          const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
          const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
          const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
          // dct_const_round_shift
          const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
          const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
          const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
          const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
          const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
          const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
          const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
          const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
          // Combine
          const __m128i r0 = _mm_packs_epi32(s0, s1);
          const __m128i r1 = _mm_packs_epi32(s2, s3);
          // Add/substract
          const __m128i x0 = _mm_add_epi16(q4, r0);
          const __m128i x1 = _mm_sub_epi16(q4, r0);
          const __m128i x2 = _mm_sub_epi16(q7, r1);
          const __m128i x3 = _mm_add_epi16(q7, r1);
          // Interleave to do the multiply by constants which gets us
          // into 32 bits.
          const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
          const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
          const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
          const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
          const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
          const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
          const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
          const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
          const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
          const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
          const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
          const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
          const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
          const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
          const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
          const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
          // Combine
          res02 = _mm_packs_epi32(w0, w1);
          res14 = _mm_packs_epi32(w2, w3);
          res10 = _mm_packs_epi32(w4, w5);
          res06 = _mm_packs_epi32(w6, w7);
      // Work on the next eight values; step1 -> odd_results
        // step 2
          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_m16);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_m16);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_m16);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_m16);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_2 = _mm_packs_epi32(w0, w1);
          step2_3 = _mm_packs_epi32(w2, w3);
          const __m128i t0 = _mm_unpacklo_epi16(step1_5, step1_2);
          const __m128i t1 = _mm_unpackhi_epi16(step1_5, step1_2);
          const __m128i t2 = _mm_unpacklo_epi16(step1_4, step1_3);
          const __m128i t3 = _mm_unpackhi_epi16(step1_4, step1_3);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p16_p16);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p16_p16);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_5 = _mm_packs_epi32(w0, w1);
          step2_4 = _mm_packs_epi32(w2, w3);
        // step 3
          step3_0 = _mm_add_epi16(step1_0, step2_3);
          step3_1 = _mm_add_epi16(step1_1, step2_2);
          step3_2 = _mm_sub_epi16(step1_1, step2_2);
          step3_3 = _mm_sub_epi16(step1_0, step2_3);
          step3_4 = _mm_sub_epi16(step1_7, step2_4);
          step3_5 = _mm_sub_epi16(step1_6, step2_5);
          step3_6 = _mm_add_epi16(step1_6, step2_5);
          step3_7 = _mm_add_epi16(step1_7, step2_4);
        // step 4
          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m08_p24);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m08_p24);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m24_m08);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m24_m08);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_1 = _mm_packs_epi32(w0, w1);
          step2_2 = _mm_packs_epi32(w2, w3);
          const __m128i t0 = _mm_unpacklo_epi16(step3_1, step3_6);
          const __m128i t1 = _mm_unpackhi_epi16(step3_1, step3_6);
          const __m128i t2 = _mm_unpacklo_epi16(step3_2, step3_5);
          const __m128i t3 = _mm_unpackhi_epi16(step3_2, step3_5);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p24_p08);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p24_p08);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m08_p24);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m08_p24);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          step2_6 = _mm_packs_epi32(w0, w1);
          step2_5 = _mm_packs_epi32(w2, w3);
        // step 5
          step1_0 = _mm_add_epi16(step3_0, step2_1);
          step1_1 = _mm_sub_epi16(step3_0, step2_1);
          step1_2 = _mm_sub_epi16(step3_3, step2_2);
          step1_3 = _mm_add_epi16(step3_3, step2_2);
          step1_4 = _mm_add_epi16(step3_4, step2_5);
          step1_5 = _mm_sub_epi16(step3_4, step2_5);
          step1_6 = _mm_sub_epi16(step3_7, step2_6);
          step1_7 = _mm_add_epi16(step3_7, step2_6);
        // step 6
          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p30_p02);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p30_p02);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p14_p18);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p14_p18);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res01 = _mm_packs_epi32(w0, w1);
          res09 = _mm_packs_epi32(w2, w3);
          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p22_p10);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p22_p10);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_p06_p26);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_p06_p26);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res05 = _mm_packs_epi32(w0, w1);
          res13 = _mm_packs_epi32(w2, w3);
          const __m128i t0 = _mm_unpacklo_epi16(step1_2, step1_5);
          const __m128i t1 = _mm_unpackhi_epi16(step1_2, step1_5);
          const __m128i t2 = _mm_unpacklo_epi16(step1_3, step1_4);
          const __m128i t3 = _mm_unpackhi_epi16(step1_3, step1_4);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m10_p22);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m10_p22);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m26_p06);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m26_p06);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res11 = _mm_packs_epi32(w0, w1);
          res03 = _mm_packs_epi32(w2, w3);
          const __m128i t0 = _mm_unpacklo_epi16(step1_0, step1_7);
          const __m128i t1 = _mm_unpackhi_epi16(step1_0, step1_7);
          const __m128i t2 = _mm_unpacklo_epi16(step1_1, step1_6);
          const __m128i t3 = _mm_unpackhi_epi16(step1_1, step1_6);
          const __m128i u0 = _mm_madd_epi16(t0, k__cospi_m02_p30);
          const __m128i u1 = _mm_madd_epi16(t1, k__cospi_m02_p30);
          const __m128i u2 = _mm_madd_epi16(t2, k__cospi_m18_p14);
          const __m128i u3 = _mm_madd_epi16(t3, k__cospi_m18_p14);
          // dct_const_round_shift
          const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
          const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
          const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
          const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
          const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
          const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
          const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
          const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
          // Combine
          res15 = _mm_packs_epi32(w0, w1);
          res07 = _mm_packs_epi32(w2, w3);
      // Transpose the results, do it as two 8x8 transposes.
        // 00 01 02 03 04 05 06 07
        // 10 11 12 13 14 15 16 17
        // 20 21 22 23 24 25 26 27
        // 30 31 32 33 34 35 36 37
        // 40 41 42 43 44 45 46 47
        // 50 51 52 53 54 55 56 57
        // 60 61 62 63 64 65 66 67
        // 70 71 72 73 74 75 76 77
        const __m128i tr0_0 = _mm_unpacklo_epi16(res00, res01);
        const __m128i tr0_1 = _mm_unpacklo_epi16(res02, res03);
        const __m128i tr0_2 = _mm_unpackhi_epi16(res00, res01);
        const __m128i tr0_3 = _mm_unpackhi_epi16(res02, res03);
        const __m128i tr0_4 = _mm_unpacklo_epi16(res04, res05);
        const __m128i tr0_5 = _mm_unpacklo_epi16(res06, res07);
        const __m128i tr0_6 = _mm_unpackhi_epi16(res04, res05);
        const __m128i tr0_7 = _mm_unpackhi_epi16(res06, res07);
        // 00 10 01 11 02 12 03 13
        // 20 30 21 31 22 32 23 33
        // 04 14 05 15 06 16 07 17
        // 24 34 25 35 26 36 27 37
        // 40 50 41 51 42 52 43 53
        // 60 70 61 71 62 72 63 73
        // 44 54 45 55 46 56 47 57
        // 64 74 65 75 66 76 67 77
        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
        // 00 10 20 30 01 11 21 31
        // 40 50 60 70 41 51 61 71
        // 02 12 22 32 03 13 23 33
        // 42 52 62 72 43 53 63 73
        // 04 14 24 34 05 15 25 35
        // 44 54 64 74 45 55 65 75
        // 06 16 26 36 07 17 27 37
        // 46 56 66 76 47 57 67 77
        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
        // 00 10 20 30 40 50 60 70
        // 01 11 21 31 41 51 61 71
        // 02 12 22 32 42 52 62 72
        // 03 13 23 33 43 53 63 73
        // 04 14 24 34 44 54 64 74
        // 05 15 25 35 45 55 65 75
        // 06 16 26 36 46 56 66 76
        // 07 17 27 37 47 57 67 77
        _mm_storeu_si128((__m128i *)(out + 0 * 16), tr2_0);
        _mm_storeu_si128((__m128i *)(out + 1 * 16), tr2_1);
        _mm_storeu_si128((__m128i *)(out + 2 * 16), tr2_2);
        _mm_storeu_si128((__m128i *)(out + 3 * 16), tr2_3);
        _mm_storeu_si128((__m128i *)(out + 4 * 16), tr2_4);
        _mm_storeu_si128((__m128i *)(out + 5 * 16), tr2_5);
        _mm_storeu_si128((__m128i *)(out + 6 * 16), tr2_6);
        _mm_storeu_si128((__m128i *)(out + 7 * 16), tr2_7);
        // 00 01 02 03 04 05 06 07
        // 10 11 12 13 14 15 16 17
        // 20 21 22 23 24 25 26 27
        // 30 31 32 33 34 35 36 37
        // 40 41 42 43 44 45 46 47
        // 50 51 52 53 54 55 56 57
        // 60 61 62 63 64 65 66 67
        // 70 71 72 73 74 75 76 77
        const __m128i tr0_0 = _mm_unpacklo_epi16(res08, res09);
        const __m128i tr0_1 = _mm_unpacklo_epi16(res10, res11);
        const __m128i tr0_2 = _mm_unpackhi_epi16(res08, res09);
        const __m128i tr0_3 = _mm_unpackhi_epi16(res10, res11);
        const __m128i tr0_4 = _mm_unpacklo_epi16(res12, res13);
        const __m128i tr0_5 = _mm_unpacklo_epi16(res14, res15);
        const __m128i tr0_6 = _mm_unpackhi_epi16(res12, res13);
        const __m128i tr0_7 = _mm_unpackhi_epi16(res14, res15);
        // 00 10 01 11 02 12 03 13
        // 20 30 21 31 22 32 23 33
        // 04 14 05 15 06 16 07 17
        // 24 34 25 35 26 36 27 37
        // 40 50 41 51 42 52 43 53
        // 60 70 61 71 62 72 63 73
        // 44 54 45 55 46 56 47 57
        // 64 74 65 75 66 76 67 77
        const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
        const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
        const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
        const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
        const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
        const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
        const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
        const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
        // 00 10 20 30 01 11 21 31
        // 40 50 60 70 41 51 61 71
        // 02 12 22 32 03 13 23 33
        // 42 52 62 72 43 53 63 73
        // 04 14 24 34 05 15 25 35
        // 44 54 64 74 45 55 65 75
        // 06 16 26 36 07 17 27 37
        // 46 56 66 76 47 57 67 77
        const __m128i tr2_0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
        const __m128i tr2_1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
        const __m128i tr2_2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
        const __m128i tr2_3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
        const __m128i tr2_4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
        const __m128i tr2_5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
        const __m128i tr2_6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
        const __m128i tr2_7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
        // 00 10 20 30 40 50 60 70
        // 01 11 21 31 41 51 61 71
        // 02 12 22 32 42 52 62 72
        // 03 13 23 33 43 53 63 73
        // 04 14 24 34 44 54 64 74
        // 05 15 25 35 45 55 65 75
        // 06 16 26 36 46 56 66 76
        // 07 17 27 37 47 57 67 77
        // Store results
        _mm_storeu_si128((__m128i *)(out + 8 + 0 * 16), tr2_0);
        _mm_storeu_si128((__m128i *)(out + 8 + 1 * 16), tr2_1);
        _mm_storeu_si128((__m128i *)(out + 8 + 2 * 16), tr2_2);
        _mm_storeu_si128((__m128i *)(out + 8 + 3 * 16), tr2_3);
        _mm_storeu_si128((__m128i *)(out + 8 + 4 * 16), tr2_4);
        _mm_storeu_si128((__m128i *)(out + 8 + 5 * 16), tr2_5);
        _mm_storeu_si128((__m128i *)(out + 8 + 6 * 16), tr2_6);
        _mm_storeu_si128((__m128i *)(out + 8 + 7 * 16), tr2_7);
      out += 8*16;
    // Setup in/out for next pass.
    in = intermediate;
    out = output;
// Forward 8x8 DCT on 16-bit samples using SSE2.
//
//   input   source block; eight rows of eight int16_t are loaded, consecutive
//           rows being (pitch >> 1) int16_t elements apart (so pitch looks like
//           a byte pitch -- confirm against callers).
//   output  destination; 64 int16_t coefficients are stored as eight
//           contiguous rows of eight values.
//   pitch   row pitch of input, halved to get the element stride.
//
// The input is first scaled up by 4 (left shift by two). Two identical 1-D
// passes follow; each pass ends with an 8x8 transpose so that the second pass
// can reuse the column code and the final result ends up in row order.
// Finally every coefficient is divided by two, rounding toward zero.
// Unaligned loads/stores are used, so neither buffer needs 16-byte alignment.
void vp9_short_fdct8x8_sse2(int16_t *input, int16_t *output, int pitch) {
  const int stride = pitch >> 1;
  int pass;
  // Constants
  //    When we use them, in one case, they are all the same. In all others
  //    it's a pair of them that we need to repeat four times. This is done
  //    by constructing the 32 bit constant corresponding to that pair.
  const __m128i k__cospi_p16_p16 = _mm_set1_epi16(cospi_16_64);
  const __m128i k__cospi_p16_m16 = pair_set_epi16(cospi_16_64, -cospi_16_64);
  const __m128i k__cospi_p24_p08 = pair_set_epi16(cospi_24_64, cospi_8_64);
  const __m128i k__cospi_m08_p24 = pair_set_epi16(-cospi_8_64, cospi_24_64);
  const __m128i k__cospi_p28_p04 = pair_set_epi16(cospi_28_64, cospi_4_64);
  const __m128i k__cospi_m04_p28 = pair_set_epi16(-cospi_4_64, cospi_28_64);
  const __m128i k__cospi_p12_p20 = pair_set_epi16(cospi_12_64, cospi_20_64);
  const __m128i k__cospi_m20_p12 = pair_set_epi16(-cospi_20_64, cospi_12_64);
  const __m128i k__DCT_CONST_ROUNDING = _mm_set1_epi32(DCT_CONST_ROUNDING);
  // Load input
  __m128i in0  = _mm_loadu_si128((const __m128i *)(input + 0 * stride));
  __m128i in1  = _mm_loadu_si128((const __m128i *)(input + 1 * stride));
  __m128i in2  = _mm_loadu_si128((const __m128i *)(input + 2 * stride));
  __m128i in3  = _mm_loadu_si128((const __m128i *)(input + 3 * stride));
  __m128i in4  = _mm_loadu_si128((const __m128i *)(input + 4 * stride));
  __m128i in5  = _mm_loadu_si128((const __m128i *)(input + 5 * stride));
  __m128i in6  = _mm_loadu_si128((const __m128i *)(input + 6 * stride));
  __m128i in7  = _mm_loadu_si128((const __m128i *)(input + 7 * stride));
  // Pre-condition input (shift by two)
  in0 = _mm_slli_epi16(in0, 2);
  in1 = _mm_slli_epi16(in1, 2);
  in2 = _mm_slli_epi16(in2, 2);
  in3 = _mm_slli_epi16(in3, 2);
  in4 = _mm_slli_epi16(in4, 2);
  in5 = _mm_slli_epi16(in5, 2);
  in6 = _mm_slli_epi16(in6, 2);
  in7 = _mm_slli_epi16(in7, 2);

  // We do two passes, first the columns, then the rows. The results of the
  // first pass are transposed so that the same column code can be reused. The
  // results of the second pass are also transposed so that the rows (processed
  // as columns) are put back in row positions.
  for (pass = 0; pass < 2; pass++) {
    // To store results of each pass before the transpose.
    __m128i res0, res1, res2, res3, res4, res5, res6, res7;
    // Add/subtract
    const __m128i q0 = _mm_add_epi16(in0, in7);
    const __m128i q1 = _mm_add_epi16(in1, in6);
    const __m128i q2 = _mm_add_epi16(in2, in5);
    const __m128i q3 = _mm_add_epi16(in3, in4);
    const __m128i q4 = _mm_sub_epi16(in3, in4);
    const __m128i q5 = _mm_sub_epi16(in2, in5);
    const __m128i q6 = _mm_sub_epi16(in1, in6);
    const __m128i q7 = _mm_sub_epi16(in0, in7);
    // Work on first four results (even outputs: res0, res2, res4, res6)
    {
      // Add/subtract
      const __m128i r0 = _mm_add_epi16(q0, q3);
      const __m128i r1 = _mm_add_epi16(q1, q2);
      const __m128i r2 = _mm_sub_epi16(q1, q2);
      const __m128i r3 = _mm_sub_epi16(q0, q3);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(r0, r1);
      const __m128i t1 = _mm_unpackhi_epi16(r0, r1);
      const __m128i t2 = _mm_unpacklo_epi16(r2, r3);
      const __m128i t3 = _mm_unpackhi_epi16(r2, r3);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p16_p16);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p16_p16);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_p16_m16);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_p16_m16);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p24_p08);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p24_p08);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m08_p24);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m08_p24);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine (pack back to 16 bits with signed saturation)
      res0 = _mm_packs_epi32(w0, w1);
      res4 = _mm_packs_epi32(w2, w3);
      res2 = _mm_packs_epi32(w4, w5);
      res6 = _mm_packs_epi32(w6, w7);
    }
    // Work on next four results (odd outputs: res1, res3, res5, res7)
    {
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i d0 = _mm_unpacklo_epi16(q6, q5);
      const __m128i d1 = _mm_unpackhi_epi16(q6, q5);
      const __m128i e0 = _mm_madd_epi16(d0, k__cospi_p16_m16);
      const __m128i e1 = _mm_madd_epi16(d1, k__cospi_p16_m16);
      const __m128i e2 = _mm_madd_epi16(d0, k__cospi_p16_p16);
      const __m128i e3 = _mm_madd_epi16(d1, k__cospi_p16_p16);
      // dct_const_round_shift
      const __m128i f0 = _mm_add_epi32(e0, k__DCT_CONST_ROUNDING);
      const __m128i f1 = _mm_add_epi32(e1, k__DCT_CONST_ROUNDING);
      const __m128i f2 = _mm_add_epi32(e2, k__DCT_CONST_ROUNDING);
      const __m128i f3 = _mm_add_epi32(e3, k__DCT_CONST_ROUNDING);
      const __m128i s0 = _mm_srai_epi32(f0, DCT_CONST_BITS);
      const __m128i s1 = _mm_srai_epi32(f1, DCT_CONST_BITS);
      const __m128i s2 = _mm_srai_epi32(f2, DCT_CONST_BITS);
      const __m128i s3 = _mm_srai_epi32(f3, DCT_CONST_BITS);
      // Combine
      const __m128i r0 = _mm_packs_epi32(s0, s1);
      const __m128i r1 = _mm_packs_epi32(s2, s3);
      // Add/subtract
      const __m128i x0 = _mm_add_epi16(q4, r0);
      const __m128i x1 = _mm_sub_epi16(q4, r0);
      const __m128i x2 = _mm_sub_epi16(q7, r1);
      const __m128i x3 = _mm_add_epi16(q7, r1);
      // Interleave to do the multiply by constants which gets us into 32bits
      const __m128i t0 = _mm_unpacklo_epi16(x0, x3);
      const __m128i t1 = _mm_unpackhi_epi16(x0, x3);
      const __m128i t2 = _mm_unpacklo_epi16(x1, x2);
      const __m128i t3 = _mm_unpackhi_epi16(x1, x2);
      const __m128i u0 = _mm_madd_epi16(t0, k__cospi_p28_p04);
      const __m128i u1 = _mm_madd_epi16(t1, k__cospi_p28_p04);
      const __m128i u2 = _mm_madd_epi16(t0, k__cospi_m04_p28);
      const __m128i u3 = _mm_madd_epi16(t1, k__cospi_m04_p28);
      const __m128i u4 = _mm_madd_epi16(t2, k__cospi_p12_p20);
      const __m128i u5 = _mm_madd_epi16(t3, k__cospi_p12_p20);
      const __m128i u6 = _mm_madd_epi16(t2, k__cospi_m20_p12);
      const __m128i u7 = _mm_madd_epi16(t3, k__cospi_m20_p12);
      // dct_const_round_shift
      const __m128i v0 = _mm_add_epi32(u0, k__DCT_CONST_ROUNDING);
      const __m128i v1 = _mm_add_epi32(u1, k__DCT_CONST_ROUNDING);
      const __m128i v2 = _mm_add_epi32(u2, k__DCT_CONST_ROUNDING);
      const __m128i v3 = _mm_add_epi32(u3, k__DCT_CONST_ROUNDING);
      const __m128i v4 = _mm_add_epi32(u4, k__DCT_CONST_ROUNDING);
      const __m128i v5 = _mm_add_epi32(u5, k__DCT_CONST_ROUNDING);
      const __m128i v6 = _mm_add_epi32(u6, k__DCT_CONST_ROUNDING);
      const __m128i v7 = _mm_add_epi32(u7, k__DCT_CONST_ROUNDING);
      const __m128i w0 = _mm_srai_epi32(v0, DCT_CONST_BITS);
      const __m128i w1 = _mm_srai_epi32(v1, DCT_CONST_BITS);
      const __m128i w2 = _mm_srai_epi32(v2, DCT_CONST_BITS);
      const __m128i w3 = _mm_srai_epi32(v3, DCT_CONST_BITS);
      const __m128i w4 = _mm_srai_epi32(v4, DCT_CONST_BITS);
      const __m128i w5 = _mm_srai_epi32(v5, DCT_CONST_BITS);
      const __m128i w6 = _mm_srai_epi32(v6, DCT_CONST_BITS);
      const __m128i w7 = _mm_srai_epi32(v7, DCT_CONST_BITS);
      // Combine
      res1 = _mm_packs_epi32(w0, w1);
      res7 = _mm_packs_epi32(w2, w3);
      res5 = _mm_packs_epi32(w4, w5);
      res3 = _mm_packs_epi32(w6, w7);
    }
    // Transpose the 8x8.
    {
      // 00 01 02 03 04 05 06 07
      // 10 11 12 13 14 15 16 17
      // 20 21 22 23 24 25 26 27
      // 30 31 32 33 34 35 36 37
      // 40 41 42 43 44 45 46 47
      // 50 51 52 53 54 55 56 57
      // 60 61 62 63 64 65 66 67
      // 70 71 72 73 74 75 76 77
      const __m128i tr0_0 = _mm_unpacklo_epi16(res0, res1);
      const __m128i tr0_1 = _mm_unpacklo_epi16(res2, res3);
      const __m128i tr0_2 = _mm_unpackhi_epi16(res0, res1);
      const __m128i tr0_3 = _mm_unpackhi_epi16(res2, res3);
      const __m128i tr0_4 = _mm_unpacklo_epi16(res4, res5);
      const __m128i tr0_5 = _mm_unpacklo_epi16(res6, res7);
      const __m128i tr0_6 = _mm_unpackhi_epi16(res4, res5);
      const __m128i tr0_7 = _mm_unpackhi_epi16(res6, res7);
      // 00 10 01 11 02 12 03 13
      // 20 30 21 31 22 32 23 33
      // 04 14 05 15 06 16 07 17
      // 24 34 25 35 26 36 27 37
      // 40 50 41 51 42 52 43 53
      // 60 70 61 71 62 72 63 73
      // 44 54 45 55 46 56 47 57
      // 64 74 65 75 66 76 67 77
      const __m128i tr1_0 = _mm_unpacklo_epi32(tr0_0, tr0_1);
      const __m128i tr1_1 = _mm_unpacklo_epi32(tr0_2, tr0_3);
      const __m128i tr1_2 = _mm_unpackhi_epi32(tr0_0, tr0_1);
      const __m128i tr1_3 = _mm_unpackhi_epi32(tr0_2, tr0_3);
      const __m128i tr1_4 = _mm_unpacklo_epi32(tr0_4, tr0_5);
      const __m128i tr1_5 = _mm_unpacklo_epi32(tr0_6, tr0_7);
      const __m128i tr1_6 = _mm_unpackhi_epi32(tr0_4, tr0_5);
      const __m128i tr1_7 = _mm_unpackhi_epi32(tr0_6, tr0_7);
      // 00 10 20 30 01 11 21 31
      // 40 50 60 70 41 51 61 71
      // 02 12 22 32 03 13 23 33
      // 42 52 62 72 43 53 63 73
      // 04 14 24 34 05 15 25 35
      // 44 54 64 74 45 55 65 75
      // 06 16 26 36 07 17 27 37
      // 46 56 66 76 47 57 67 77
      in0 = _mm_unpacklo_epi64(tr1_0, tr1_4);
      in1 = _mm_unpackhi_epi64(tr1_0, tr1_4);
      in2 = _mm_unpacklo_epi64(tr1_2, tr1_6);
      in3 = _mm_unpackhi_epi64(tr1_2, tr1_6);
      in4 = _mm_unpacklo_epi64(tr1_1, tr1_5);
      in5 = _mm_unpackhi_epi64(tr1_1, tr1_5);
      in6 = _mm_unpacklo_epi64(tr1_3, tr1_7);
      in7 = _mm_unpackhi_epi64(tr1_3, tr1_7);
      // 00 10 20 30 40 50 60 70
      // 01 11 21 31 41 51 61 71
      // 02 12 22 32 42 52 62 72
      // 03 13 23 33 43 53 63 73
      // 04 14 24 34 44 54 64 74
      // 05 15 25 35 45 55 65 75
      // 06 16 26 36 46 56 66 76
      // 07 17 27 37 47 57 67 77
    }
  }
  // Post-condition output and store it
  {
    // Post-condition (division by two)
    //    division of two 16 bits signed numbers using shifts
    //    n / 2 = (n - (n >> 15)) >> 1
    //    (n >> 15 is -1 for negative n, so negatives are bumped by one before
    //    the arithmetic shift and the quotient rounds toward zero)
    const __m128i sign_in0 = _mm_srai_epi16(in0, 15);
    const __m128i sign_in1 = _mm_srai_epi16(in1, 15);
    const __m128i sign_in2 = _mm_srai_epi16(in2, 15);
    const __m128i sign_in3 = _mm_srai_epi16(in3, 15);
    const __m128i sign_in4 = _mm_srai_epi16(in4, 15);
    const __m128i sign_in5 = _mm_srai_epi16(in5, 15);
    const __m128i sign_in6 = _mm_srai_epi16(in6, 15);
    const __m128i sign_in7 = _mm_srai_epi16(in7, 15);
    in0 = _mm_sub_epi16(in0, sign_in0);
    in1 = _mm_sub_epi16(in1, sign_in1);
    in2 = _mm_sub_epi16(in2, sign_in2);
    in3 = _mm_sub_epi16(in3, sign_in3);
    in4 = _mm_sub_epi16(in4, sign_in4);
    in5 = _mm_sub_epi16(in5, sign_in5);
    in6 = _mm_sub_epi16(in6, sign_in6);
    in7 = _mm_sub_epi16(in7, sign_in7);
    in0 = _mm_srai_epi16(in0, 1);
    in1 = _mm_srai_epi16(in1, 1);
    in2 = _mm_srai_epi16(in2, 1);
    in3 = _mm_srai_epi16(in3, 1);
    in4 = _mm_srai_epi16(in4, 1);
    in5 = _mm_srai_epi16(in5, 1);
    in6 = _mm_srai_epi16(in6, 1);
    in7 = _mm_srai_epi16(in7, 1);
    // store results
    _mm_storeu_si128((__m128i *)(output + 0 * 8), in0);
    _mm_storeu_si128((__m128i *)(output + 1 * 8), in1);
    _mm_storeu_si128((__m128i *)(output + 2 * 8), in2);
    _mm_storeu_si128((__m128i *)(output + 3 * 8), in3);
    _mm_storeu_si128((__m128i *)(output + 4 * 8), in4);
    _mm_storeu_si128((__m128i *)(output + 5 * 8), in5);
    _mm_storeu_si128((__m128i *)(output + 6 * 8), in6);
    _mm_storeu_si128((__m128i *)(output + 7 * 8), in7);
  }
}
 * @brief This function performs a 4x4 inverse hadamard transform on the 4x4 DC coefficients
 * of a 16x16 intra prediction macroblock, and then performs scaling.
 * prediction buffer
 * @par Description:
 *  The DC coefficients pass through a 2-stage inverse hadamard transform.
 *  This inverse transformed content is scaled to based on Qp value.
 * @param[in] pi2_src
 *  input 4x4 block of DC coefficients
 * @param[out] pi2_out
 *  output 4x4 block
 * @param[in] pu2_iscal_mat
 *  pointer to scaling list
 * @param[in] pu2_weigh_mat
 *  pointer to weight matrix
 * @param[in] u4_qp_div_6
 *  Floor (qp/6)
 * @param[in] pi4_tmp
 * temporary buffer of size 1*16
 * @returns none
 * @remarks none
void ih264_ihadamard_scaling_4x4_ssse3(WORD16* pi2_src,
                                       WORD16* pi2_out,
                                       const UWORD16 *pu2_iscal_mat,
                                       const UWORD16 *pu2_weigh_mat,
                                       UWORD32 u4_qp_div_6,
                                       WORD32* pi4_tmp)
    int val = 0xFFFF;
    __m128i src_r0_r1, src_r2_r3, sign_reg, zero_8x16b = _mm_setzero_si128();
    __m128i src_r0, src_r1, src_r2, src_r3;
    __m128i temp0, temp1, temp2, temp3;
    __m128i add_rshift = _mm_set1_epi32((1 << (5 - u4_qp_div_6)));
    __m128i mult_val = _mm_set1_epi32(pu2_iscal_mat[0] * pu2_weigh_mat[0]);

    __m128i mask = _mm_set1_epi32(val);
    UNUSED (pi4_tmp);

    mult_val = _mm_and_si128(mult_val, mask);

    src_r0_r1 = _mm_loadu_si128((__m128i *) (pi2_src)); //a00 a01 a02 a03 a10 a11 a12 a13 -- the source matrix 0th,1st row
    src_r2_r3 = _mm_loadu_si128((__m128i *) (pi2_src + 8)); //a20 a21 a22 a23 a30 a31 a32 a33 -- the source matrix 2nd,3rd row
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r0_r1);
    src_r0 = _mm_unpacklo_epi16(src_r0_r1, sign_reg);
    src_r1 = _mm_unpackhi_epi16(src_r0_r1, sign_reg);
    sign_reg = _mm_cmpgt_epi16(zero_8x16b, src_r2_r3);
    src_r2 = _mm_unpacklo_epi16(src_r2_r3, sign_reg);
    src_r3 = _mm_unpackhi_epi16(src_r2_r3, sign_reg);

    /* Perform Inverse transform */
    /* IDCT [ Horizontal transformation ]                          */
    // Matrix transpose
     *  a0 a1 a2 a3
     *  b0 b1 b2 b3
     *  c0 c1 c2 c3
     *  d0 d1 d2 d3
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 b0 a1 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //c0 d0 c1 d1
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //a2 b2 a3 b3
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 d2 c3 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                    //a0 b0 c0 d0
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                    //a1 b1 c1 d1
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                    //a2 b2 c2 d2
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                    //a3 b3 c3 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

    /* IDCT [ Vertical transformation ]                          */
    // Matrix transpose
     *  a0 b0 c0 d0
     *  a1 b1 c1 d1
     *  a2 b2 c2 d2
     *  a3 b3 c3 d3
    temp0 = _mm_unpacklo_epi32(src_r0, src_r1);                  //a0 a1 b0 b1
    temp2 = _mm_unpacklo_epi32(src_r2, src_r3);                  //a2 a3 b2 b3
    temp1 = _mm_unpackhi_epi32(src_r0, src_r1);                  //c0 c1 d0 d1
    temp3 = _mm_unpackhi_epi32(src_r2, src_r3);                  //c2 c3 d2 d3
    src_r0 = _mm_unpacklo_epi64(temp0, temp2);                   //a0 a1 a2 a3
    src_r1 = _mm_unpackhi_epi64(temp0, temp2);                   //b0 b1 b2 b3
    src_r2 = _mm_unpacklo_epi64(temp1, temp3);                   //c0 c1 c2 c3
    src_r3 = _mm_unpackhi_epi64(temp1, temp3);                   //d0 d1 d2 d3

    temp0 = _mm_add_epi32(src_r0, src_r3);
    temp1 = _mm_add_epi32(src_r1, src_r2);
    temp2 = _mm_sub_epi32(src_r1, src_r2);
    temp3 = _mm_sub_epi32(src_r0, src_r3);

    src_r0 = _mm_add_epi32(temp0, temp1);
    src_r1 = _mm_add_epi32(temp2, temp3);
    src_r2 = _mm_sub_epi32(temp0, temp1);
    src_r3 = _mm_sub_epi32(temp3, temp2);

    src_r0 = _mm_and_si128(src_r0, mask);
    src_r1 = _mm_and_si128(src_r1, mask);
    src_r2 = _mm_and_si128(src_r2, mask);
    src_r3 = _mm_and_si128(src_r3, mask);

    src_r0 = _mm_madd_epi16(src_r0, mult_val);
    src_r1 = _mm_madd_epi16(src_r1, mult_val);
    src_r2 = _mm_madd_epi16(src_r2, mult_val);
    src_r3 = _mm_madd_epi16(src_r3, mult_val);

    if(u4_qp_div_6 >= 6)
        src_r0 = _mm_slli_epi32(src_r0, u4_qp_div_6 - 6);
        src_r1 = _mm_slli_epi32(src_r1, u4_qp_div_6 - 6);
        src_r2 = _mm_slli_epi32(src_r2, u4_qp_div_6 - 6);
        src_r3 = _mm_slli_epi32(src_r3, u4_qp_div_6 - 6);
        temp0 = _mm_add_epi32(src_r0, add_rshift);
        temp1 = _mm_add_epi32(src_r1, add_rshift);
        temp2 = _mm_add_epi32(src_r2, add_rshift);
        temp3 = _mm_add_epi32(src_r3, add_rshift);
        src_r0 = _mm_srai_epi32(temp0, 6 - u4_qp_div_6);
        src_r1 = _mm_srai_epi32(temp1, 6 - u4_qp_div_6);
        src_r2 = _mm_srai_epi32(temp2, 6 - u4_qp_div_6);
        src_r3 = _mm_srai_epi32(temp3, 6 - u4_qp_div_6);
    src_r0_r1 = _mm_packs_epi32(src_r0, src_r1);
    src_r2_r3 = _mm_packs_epi32(src_r2, src_r3);

    _mm_storeu_si128((__m128i *) (&pi2_out[0]), src_r0_r1);
    _mm_storeu_si128((__m128i *) (&pi2_out[8]), src_r2_r3);
Exemple #14
/* motion templates */
/*
 * Updates a motion history image (MHI) from a silhouette mask.
 *
 * For every pixel:
 *   - if the silhouette value is non-zero, the MHI value becomes `timestamp`;
 *   - otherwise, if the MHI value is older than (timestamp - mhi_duration),
 *     it is reset to 0;
 *   - otherwise it is left unchanged.
 *
 * silhouette    mask array (must satisfy CV_IS_MASK_ARR)
 * mhimg         motion history image, updated in place; must be CV_32FC1 and
 *               the same size as the silhouette
 * timestamp     current time; converted to float before being stored
 * mhi_duration  maximum age of a motion track, in the same units as timestamp
 *
 * Raises CV_StsBadMask, CV_StsUnsupportedFormat or CV_StsUnmatchedSizes via
 * CV_Error when the arguments do not satisfy the checks above.
 */
CV_IMPL void
cvUpdateMotionHistory( const void* silhouette, void* mhimg,
                       double timestamp, double mhi_duration )
{
    CvMat  silhstub, *silh = cvGetMat(silhouette, &silhstub);
    CvMat  mhistub, *mhi = cvGetMat(mhimg, &mhistub);

    if( !CV_IS_MASK_ARR( silh ))
        CV_Error( CV_StsBadMask, "" );

    if( CV_MAT_TYPE( mhi->type ) != CV_32FC1 )
        CV_Error( CV_StsUnsupportedFormat, "" );

    if( !CV_ARE_SIZES_EQ( mhi, silh ))
        CV_Error( CV_StsUnmatchedSizes, "" );

    CvSize size = cvGetMatSize( mhi );

    // Only when BOTH matrices are continuous may the image be treated as a
    // single long row; otherwise each row must be addressed through its step.
    if( CV_IS_MAT_CONT( mhi->type & silh->type ))
    {
        size.width *= size.height;
        size.height = 1;
    }

    float ts = (float)timestamp;
    float delbound = (float)(timestamp - mhi_duration);
    int x, y;
#if CV_SSE2
    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);
#endif

    for( y = 0; y < size.height; y++ )
    {
        const uchar* silhData = silh->data.ptr + silh->step*y;
        float* mhiData = (float*)(mhi->data.ptr + mhi->step*y);
        x = 0;

#if CV_SSE2
        if( useSIMD )
        {
            __m128 ts4 = _mm_set1_ps(ts), db4 = _mm_set1_ps(delbound);
            // Eight pixels per iteration
            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i z = _mm_setzero_si128();
                // Widen 8 mask bytes to two vectors of 4 floats
                __m128i s = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(silhData + x)), z);
                __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(s, z)), s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s, z));
                __m128 v0 = _mm_loadu_ps(mhiData + x), v1 = _mm_loadu_ps(mhiData + x + 4);
                __m128 fz = _mm_setzero_ps();

                // Zero out entries older than delbound
                v0 = _mm_and_ps(v0, _mm_cmpge_ps(v0, db4));
                v1 = _mm_and_ps(v1, _mm_cmpge_ps(v1, db4));

                // m = (v ^ ts) where the silhouette is non-zero, else 0 ...
                __m128 m0 = _mm_and_ps(_mm_xor_ps(v0, ts4), _mm_cmpneq_ps(s0, fz));
                __m128 m1 = _mm_and_ps(_mm_xor_ps(v1, ts4), _mm_cmpneq_ps(s1, fz));

                // ... so v ^ m selects ts under the silhouette and v elsewhere
                v0 = _mm_xor_ps(v0, m0);
                v1 = _mm_xor_ps(v1, m1);

                _mm_storeu_ps(mhiData + x, v0);
                _mm_storeu_ps(mhiData + x + 4, v1);
            }
        }
#endif

        // Scalar path: whole row without SSE2, or the tail left by the SIMD loop
        for( ; x < size.width; x++ )
        {
            float val = mhiData[x];
            val = silhData[x] ? ts : val < delbound ? 0 : val;
            mhiData[x] = val;
        }
    }
}
Exemple #15
/// REPLACE HERE WITH SSE intrinsics
/*
 * SSE implementation of the 16-point inverse partial butterfly.
 *
 * src   : input coefficients, read as __m128i vectors (through tranpose8x8).
 * dst   : output; two __m128i (16 shorts) are written per iteration with
 *         aligned stores, so dst must be 16-byte aligned and hold 256 shorts.
 * shift : right shift applied after adding the rounding offset
 *         1 << (shift - 1); must be >= 1.
 *
 * NOTE(review): `gt` is never written in the code visible here, and
 * gt_vec[8]..gt_vec[11] index past an 8x8 array of short (which only spans
 * eight __m128i). `g_aiT16_vec` is also never used. Presumably the
 * coefficient table was meant to be copied/transposed from g_aiT16 into a
 * larger buffer -- confirm against the original source before relying on
 * this routine.
 */
static void partialButterflyInverse16_simd(short *src, short *dst, int shift)
{
  int add = 1<<(shift-1);

  // View the 16x16 coefficient matrix as SIMD vectors.
  __m128i *g_aiT16_vec  = (__m128i *)g_aiT16;

  // View the input and the output as SIMD vectors.
  __m128i *in_vec = (__m128i *) src;
  __m128i *out_vec = (__m128i *) dst;

  // 8x8 scratch array viewed as SIMD vectors (see NOTE(review) above).
  short gt[8][8] __attribute__ ((aligned (16)));
  __m128i *gt_vec = (__m128i *)gt;

  // 16x16 scratch array receiving the rearranged input.
  short random[16][16] __attribute__ ((aligned (16)));
  __m128i *random_vec = (__m128i *)random;

  // Rounding offset broadcast to all four 32-bit lanes (loop invariant).
  const __m128i add_reg = _mm_set1_epi32(add);

  tranpose8x8(in_vec,2, random_vec,0);
  tranpose8x8(in_vec,3, random_vec,8);
  tranpose8x8(in_vec,0, random_vec,16);
  tranpose8x8(in_vec,1, random_vec,24);

  for (int j=0; j<16; j++)
  {
    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    __m128i I0 = _mm_load_si128 (&random_vec[j]);
    __m128i II0 = _mm_load_si128 (&random_vec[j+16]);

    // Multiply eight coefficient rows with the input; _mm_madd_epi16 sums
    // adjacent 16-bit products into 32-bit lanes.
    __m128i I1 = _mm_load_si128 (&gt_vec[0]);
    __m128i I2 = _mm_madd_epi16 (I1, I0);
    __m128i I3 = _mm_load_si128 (&gt_vec[1]);
    __m128i I4 = _mm_madd_epi16 (I3, I0);
    __m128i I5 = _mm_load_si128 (&gt_vec[2]);
    __m128i I6 = _mm_madd_epi16 (I5, I0);

    __m128i I7 = _mm_load_si128 (&gt_vec[3]);
    __m128i I8 = _mm_madd_epi16 (I7, I0);

    __m128i I9 = _mm_load_si128 (&gt_vec[4]);
    __m128i I10 = _mm_madd_epi16 (I9, I0);

    __m128i I11 = _mm_load_si128 (&gt_vec[5]);
    __m128i I12 = _mm_madd_epi16 (I11, I0);

    __m128i I13 = _mm_load_si128 (&gt_vec[6]);
    __m128i I14 = _mm_madd_epi16 (I13, I0);

    __m128i I15 = _mm_load_si128 (&gt_vec[7]);
    __m128i I16 = _mm_madd_epi16 (I15, I0);

    // Horizontally add the partial results: each lane of R1/R2 ends up
    // holding the full dot product of one coefficient row with I0.
    __m128i A1 =_mm_hadd_epi32 (I2, I4);
    __m128i A2 =_mm_hadd_epi32 (I6, I8);
    __m128i R1 =_mm_hadd_epi32 (A1, A2);

    __m128i A3 =_mm_hadd_epi32 (I10, I12);
    __m128i A4 =_mm_hadd_epi32 (I14, I16);
    __m128i R2 =_mm_hadd_epi32 (A3, A4);

    // Full 32-bit products of four more rows with II0: mullo/mulhi give the
    // low and high halves, unpacklo/unpackhi recombine them per element.
    // The element-wise sum feeds R3, the pairwise differences feed R4.
    I1 = _mm_load_si128 (&gt_vec[8]);
    I2 = _mm_mullo_epi16 (I1, II0);
    I3 = _mm_mulhi_epi16 (I1, II0);

    __m128i lowI23 = _mm_unpacklo_epi16(I2,I3);
    __m128i hiI23 = _mm_unpackhi_epi16(I2,I3);
    __m128i temp1 = _mm_add_epi32(lowI23,hiI23);
    __m128i temp5 = _mm_hsub_epi32 (lowI23, hiI23);

    I4 = _mm_load_si128 (&gt_vec[9]);
    I5 = _mm_mullo_epi16 (I4, II0);
    I6 = _mm_mulhi_epi16 (I4, II0);
    __m128i lowI56 = _mm_unpacklo_epi16(I5,I6);
    __m128i hiI56 = _mm_unpackhi_epi16(I5,I6);
    __m128i temp2 = _mm_add_epi32(lowI56,hiI56);
    __m128i temp6 = _mm_hsub_epi32 (lowI56, hiI56);

    I7 = _mm_load_si128 (&gt_vec[10]);
    I8 = _mm_mullo_epi16 (I7, II0);
    I9 = _mm_mulhi_epi16 (I7, II0);
    __m128i lowI89 = _mm_unpacklo_epi16(I8,I9);
    __m128i hiI89 = _mm_unpackhi_epi16(I8,I9);
    __m128i temp3 = _mm_add_epi32(lowI89,hiI89);
    __m128i temp7 = _mm_hsub_epi32 (lowI89, hiI89);

    I10 = _mm_load_si128 (&gt_vec[11]);
    I11 = _mm_mullo_epi16 (I10, II0);
    I12 = _mm_mulhi_epi16 (I10, II0);
    __m128i lowI1112 = _mm_unpacklo_epi16(I11,I12);
    __m128i hiI1112 = _mm_unpackhi_epi16(I11,I12);
    __m128i temp4 = _mm_add_epi32(lowI1112,hiI1112);
    __m128i temp8 = _mm_hsub_epi32 (lowI1112, hiI1112);

    __m128i A5 =_mm_hadd_epi32 (temp1, temp2);
    __m128i A6 =_mm_hadd_epi32 (temp3, temp4);
    __m128i R3 =_mm_hadd_epi32 (A5, A6);

    // Note the reversed operand order (8,7 then 6,5).
    __m128i A7 =_mm_hadd_epi32 (temp8, temp7);
    __m128i A8 =_mm_hadd_epi32 (temp6, temp5);
    __m128i R4 =_mm_hadd_epi32 (A7, A8);

    // First eight outputs: sums, rounded and shifted.
    __m128i sum_vec0 = _mm_add_epi32(R3,R1);
    sum_vec0 = _mm_add_epi32(sum_vec0,add_reg);
    sum_vec0 = _mm_srai_epi32(sum_vec0, shift); // shift right
    __m128i sum_vec1 = _mm_add_epi32(R4,R2);
    sum_vec1 = _mm_add_epi32(sum_vec1,add_reg);
    sum_vec1 = _mm_srai_epi32(sum_vec1, shift); // shift right

    // Narrow packed 32-bit to packed 16-bit with signed saturation.
    __m128i finalres0 = _mm_packs_epi32(sum_vec0, sum_vec1);
    _mm_store_si128 (&out_vec[2*j], finalres0);

    // Last eight outputs: differences, rounded and shifted.
    __m128i  sum_vec2 = _mm_sub_epi32(R4, R2);
    sum_vec2 = _mm_add_epi32(sum_vec2,add_reg);
    sum_vec2 = _mm_srai_epi32(sum_vec2, shift); // shift right

    __m128i sum_vec3 = _mm_sub_epi32(R3, R1);
    sum_vec3 = _mm_add_epi32(sum_vec3,add_reg);
    sum_vec3 = _mm_srai_epi32(sum_vec3, shift); // shift right

    // Reorder the 32-bit lanes of the difference vectors with a chain of
    // unpacks before narrowing.
    I5 = _mm_unpackhi_epi32(sum_vec2, sum_vec3);
    I6 = _mm_unpacklo_epi32(sum_vec2, sum_vec3);
    I7 = _mm_unpackhi_epi32(I5, I6);
    I8 = _mm_unpacklo_epi32(I5, I6);
    I9 = _mm_unpacklo_epi32(I7, I8);
    I10 = _mm_unpackhi_epi32(I7, I8);
    sum_vec3 = _mm_packs_epi32(I9, I10); // narrow to 16 bit with saturation
    _mm_store_si128 (&out_vec[2*j+1], sum_vec3);
  }
}
Exemple #16
/*
 * Compares a saved 16-bit CLUT against the live CLUT buffer.
 *
 * The live buffer is addressed as 32-bit entries: the first part of the
 * comparison (while csa < 16) looks at the low 16 bits of each entry, the
 * remainder restarts at entry 0 and looks at the high 16 bits.
 *
 * saved_clut : saved palette, 16 entries consumed per step (loaded with
 *              aligned 128-bit loads in the SSE2 build).
 * csa        : starting CLUT offset.
 * clutsize   : amount to compare; must be a multiple of 32 (asserted).
 *
 * Returns true as soon as a difference is found, false if everything matches.
 *
 * NOTE(review): this is an explicit specialization for u16; the
 * `template <>` introducer is not visible in this excerpt -- confirm it is
 * present in the real file.
 */
/*__forceinline*/ bool Cmp_ClutBuffer_SavedClut<u16>(u16* saved_clut, u32 csa, u32 clutsize)
{
    assert((clutsize&31) == 0);

#ifdef ZEROGS_SSE2
    __m128i zero_128 = _mm_setzero_si128();
#endif
    u16* clut = (u16*)GetClutBufferAddress<u32>(csa); // Keep aligned version for sse2

    // which side to cmp
    u32 clutsize_right;
    u32 clutsize_left;
    if (csa < 16) {
        clutsize_right = min(clutsize, (16-csa)*32);
        clutsize_left = clutsize - clutsize_right;
    } else {
        clutsize_right = 0;
        clutsize_left = clutsize;
    }

    while (clutsize_right > 0)
    {
#ifdef ZEROGS_SSE2
        // only lower 16 bits of dword are valid
        __m128i clut_0 = _mm_load_si128((__m128i*)clut);
        __m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
        __m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
        __m128i clut_3 = _mm_load_si128((__m128i*)clut+3);

        // value must converted to 32 bits
        __m128i saved_clut_0 = _mm_load_si128((__m128i*)saved_clut);
        __m128i saved_clut_1 = _mm_load_si128((__m128i*)saved_clut+1);

        __m128i result = _mm_cmpeq_epi16(_mm_unpacklo_epi16(saved_clut_0, zero_128), clut_0);
        __m128i result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(saved_clut_0, zero_128), clut_1);
        result = _mm_and_si128(result, result_tmp);

        result_tmp = _mm_cmpeq_epi16(_mm_unpacklo_epi16(saved_clut_1, zero_128), clut_2);
        result = _mm_and_si128(result, result_tmp);

        result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(saved_clut_1, zero_128), clut_3);
        result = _mm_and_si128(result, result_tmp);

        u32 result_int = _mm_movemask_epi8(result);
        // only lower 16bits must be checked: movemask yields two bits per
        // 16-bit lane, so 0x3333 selects the low half of every dword
        if ((result_int&0x3333) != 0x3333)
            return true;
#else
        for (int i = 0; i < 16; ++i)
            if (saved_clut[i] != clut[2*i]) return true;
#endif

        saved_clut += 16;
        clut += 32;
        clutsize_right -= 32;
    }

    if(csa < 16) {
        // go back to the base before processing left clut column
        clut = (u16*)GetClutBufferAddress<u32>(0); // Keep aligned version for sse2
    }

    while (clutsize_left > 0)
    {
#ifdef ZEROGS_SSE2
        // only higher 16 bits of dword are valid
        __m128i clut_0 = _mm_load_si128((__m128i*)clut);
        __m128i clut_1 = _mm_load_si128((__m128i*)clut+1);
        __m128i clut_2 = _mm_load_si128((__m128i*)clut+2);
        __m128i clut_3 = _mm_load_si128((__m128i*)clut+3);

        // value must converted to 32 bits (with 0 in lower 16 bits)
        __m128i saved_clut_0 = _mm_load_si128((__m128i*)saved_clut);
        __m128i saved_clut_1 = _mm_load_si128((__m128i*)saved_clut+1);

        __m128i result = _mm_cmpeq_epi16(_mm_unpacklo_epi16(zero_128, saved_clut_0), clut_0);
        __m128i result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(zero_128, saved_clut_0), clut_1);
        result = _mm_and_si128(result, result_tmp);

        result_tmp = _mm_cmpeq_epi16(_mm_unpacklo_epi16(zero_128, saved_clut_1), clut_2);
        result = _mm_and_si128(result, result_tmp);

        result_tmp = _mm_cmpeq_epi16(_mm_unpackhi_epi16(zero_128, saved_clut_1), clut_3);
        result = _mm_and_si128(result, result_tmp);

        u32 result_int = _mm_movemask_epi8(result);
        // only higher 16bits must be checked: 0xCCCC selects the high half
        // of every dword
        if ((result_int&0xCCCC) != 0xCCCC)
            return true;
#else
        // Note +1 because we change higher 16 bits
        for (int i = 0; i < 16; ++i)
            if (saved_clut[i] != clut[2*i+1]) return true;
#endif

        saved_clut += 16;
        clut += 32;
        clutsize_left -= 32;
    }

    return false;
}
/*
 * 5x5 convolution of a 16-bit plane using SSE2.
 *
 * ch      : filter parameters; reads m[0..24] (kernel), rdiv, bias and
 *           saturate.
 * buff    : scratch memory holding five line buffers of bstride bytes each,
 *           used as a ring (p0..p4).
 * bstride : byte stride between line buffers.
 * width   : pixels per row; processed eight at a time.
 * height  : number of rows.
 * stride  : byte stride of the source and destination planes.
 * d, s    : destination and source planes.
 *
 * Rows above the top are seeded from rows 2 and 1, and near the bottom the
 * source pointer steps backwards, so the vertical borders are mirrored.
 *
 * NOTE(review): results are written with an aligned 128-bit store and the
 * x loop advances by 8 regardless of width, so this assumes dstp rows are
 * 16-byte aligned and padded to a multiple of 8 pixels -- confirm with the
 * caller. mm_abs_ps / mm_min_epi32 / mm_cast_epi32 are project helpers;
 * presumably absolute value, signed 32-bit minimum and 32->16 bit packing.
 */
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (const uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    /* Strides arrive in bytes; convert to uint16_t elements. */
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    /* Prime the ring: rows 2, 1, 0, 1 (mirrored top border). */
    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    __m128i max = _mm_set1_epi32(0xFFFF);
    __m128 matrix[25];
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_set1_ps((float)ch->m[i]);
    }

    for (int y = 0; y < height; y++) {
        /* Advance downwards until two rows from the bottom, then walk back
           up so the bottom border is mirrored. */
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);
        /* The 25 taps: five rows, horizontal offsets -2..+2. */
        uint16_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };
        for (int x = 0; x < width; x += 8) {
            /* Two float accumulators: pixels x..x+3 and x+4..x+7. */
            __m128 sum[2] = {_mm_setzero_ps(), _mm_setzero_ps()};

            for (int i = 0; i < 25; i++) {
                __m128i xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                /* Zero-extend 16 -> 32 bit, then convert to float. */
                __m128 xmm1 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(xmm0, zero));
                __m128 xmm2 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(xmm0, zero));
                xmm1 = _mm_mul_ps(xmm1, matrix[i]);
                xmm2 = _mm_mul_ps(xmm2, matrix[i]);
                sum[0] = _mm_add_ps(sum[0], xmm1);
                sum[1] = _mm_add_ps(sum[1], xmm2);
            }

            __m128i sumi[2];
            for (int i = 0; i < 2; i++) {
                sum[i]  = _mm_mul_ps(sum[i], rdiv);
                sum[i]  = _mm_add_ps(sum[i], bias);
                if (!ch->saturate) {
                    sum[i] = mm_abs_ps(sum[i]);
                }
                sumi[i] = _mm_cvtps_epi32(sum[i]);
                /* Clamp to [0, 0xFFFF]: cap at max, then zero every lane
                   that is not strictly positive. */
                sumi[i] = mm_min_epi32(sumi[i], max);
                __m128i mask = _mm_cmpgt_epi32(sumi[i], zero);
                sumi[i] = _mm_and_si128(sumi[i], mask);
            }

            sumi[0] = mm_cast_epi32(sumi[0], sumi[1]);

            _mm_store_si128((__m128i *)(dstp + x), sumi[0]);
        }
        dstp += stride;
        /* Rotate the ring of line buffers. */
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
Exemple #18
// Forward 4x4 transform of the difference between two blocks.
//
// src, ref : top-left of two 4x4 pixel blocks whose rows are BPS bytes
//            apart. Each row is fetched with a 64-bit load, so 8 bytes per
//            row must be readable even though only 4 are used.
// out      : receives the 16 transformed coefficients of (src - ref),
//            written as four groups of four int16_t.
//
// Each stage lives in its own scope so the intermediate names can be reused.
static void FTransformSSE2(const uint8_t* src, const uint8_t* ref,
                           int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k7500 = _mm_set1_epi32(7500);
  const __m128i k14500 = _mm_set1_epi32(14500);
  const __m128i k51000 = _mm_set1_epi32(51000);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);

  __m128i v01, v32;

  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
    const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
    const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
    const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
    const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
    // Load ref and convert to 16b.
    const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
    const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
    const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
    const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
    // Compute difference.
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

    // Transpose (only the low four 16-bit lanes of each row are used).
    // 00 01 02 03   x x x x
    // 10 11 12 13   x x x x
    // 20 21 22 23   x x x x
    // 30 31 32 33   x x x x
    const __m128i transpose0_0 = _mm_unpacklo_epi16(diff0, diff1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(diff2, diff3);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // a02 a12 a22 a32   a03 a13 a23 a33
    // a00 a10 a20 a30   a01 a11 a21 a31
    // a03 a13 a23 a33   a02 a12 a22 a32
  }

  // First pass and subsequent transpose.
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // b0 = (a0 + a3) << 3
    // b1 = (a1 + a2) << 3
    // b3 = (a0 - a3) << 3
    // b2 = (a1 - a2) << 3
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i b01 = _mm_slli_epi16(a01, 3);
    const __m128i b32 = _mm_slli_epi16(a32, 3);
    const __m128i b11 = _mm_unpackhi_epi64(b01, b01);
    const __m128i b22 = _mm_unpackhi_epi64(b32, b32);

    // e0 = b0 + b1
    // e2 = b0 - b1
    const __m128i e0 = _mm_add_epi16(b01, b11);
    const __m128i e2 = _mm_sub_epi16(b01, b11);
    const __m128i e02 = _mm_unpacklo_epi64(e0, e2);

    // e1 = (b3 * 5352 + b2 * 2217 + 14500) >> 12
    // e3 = (b3 * 2217 - b2 * 5352 +  7500) >> 12
    const __m128i b23 = _mm_unpacklo_epi16(b22, b32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k14500);
    const __m128i d3 = _mm_add_epi32(c3, k7500);
    const __m128i e1 = _mm_srai_epi32(d1, 12);
    const __m128i e3 = _mm_srai_epi32(d3, 12);
    const __m128i e13 = _mm_packs_epi32(e1, e3);

    // Transpose.
    // 00 01 02 03  20 21 22 23
    // 10 11 12 13  30 31 32 33
    const __m128i transpose0_0 = _mm_unpacklo_epi16(e02, e13);
    const __m128i transpose0_1 = _mm_unpackhi_epi16(e02, e13);
    // 00 10 01 11   02 12 03 13
    // 20 30 21 31   22 32 23 33
    const __m128i v23 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    v01 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));
    // 02 12 22 32   03 13 23 33
    // 00 10 20 30   01 11 21 31
    // 03 13 23 33   02 12 22 32
  }

  // Second pass
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // a0 = v0 + v3
    // a1 = v1 + v2
    // a3 = v0 - v3
    // a2 = v1 - v2
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
    const __m128i b0 = _mm_add_epi16(a01, a11);
    const __m128i b2 = _mm_sub_epi16(a01, a11);
    const __m128i c0 = _mm_add_epi16(b0, seven);
    const __m128i c2 = _mm_add_epi16(b2, seven);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
    const __m128i d3 = _mm_add_epi32(c3, k51000);
    const __m128i e1 = _mm_srai_epi32(d1, 16);
    const __m128i e3 = _mm_srai_epi32(d3, 16);
    const __m128i f1 = _mm_packs_epi32(e1, e1);
    const __m128i f3 = _mm_packs_epi32(e3, e3);
    // f1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
    // desired (0, 1), we add one earlier through k12000_plus_one.
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    _mm_storel_epi64((__m128i*)&out[ 0], d0);
    _mm_storel_epi64((__m128i*)&out[ 4], g1);
    _mm_storel_epi64((__m128i*)&out[ 8], d2);
    _mm_storel_epi64((__m128i*)&out[12], f3);
  }
}
// Vertical 4-tap convolution of a 4-pixel-wide column using SSSE3.
//
// src_ptr       : source pixels; rows src_ptr + 2*src_pitch onwards are read.
//                 Each row is fetched with a 64-bit load, so 8 bytes per row
//                 must be readable although only 4 are used.
// src_pitch     : byte distance between source rows.
// output_ptr    : destination; 4 bytes are written per output row.
// out_pitch     : byte distance between destination rows.
// output_height : number of rows to produce. Rows are produced in pairs, so
//                 a trailing odd row is not written.
// filter        : 8 taps; only taps 2..5 are used. Each tap is halved and
//                 narrowed to 8 bits, which is compensated for by rounding
//                 with 32 and shifting by 6.
//
// Output row k is
//   clip_u8((sum_{t=2..5} src[k + t] * (filter[t] >> 1) + 32) >> 6).
static void aom_filter_block1d4_v4_ssse3(
    const uint8_t *src_ptr, ptrdiff_t src_pitch, uint8_t *output_ptr,
    ptrdiff_t out_pitch, uint32_t output_height, const int16_t *filter) {
  __m128i filtersReg;
  __m128i addFilterReg32;
  __m128i srcReg2, srcReg3, srcReg23, srcReg4, srcReg34, srcReg5, srcReg45,
      srcReg6, srcReg56;
  __m128i srcReg23_34_lo, srcReg45_56_lo;
  __m128i srcReg2345_3456_lo, srcReg2345_3456_hi;
  __m128i resReglo, resReghi;
  __m128i firstFilters;
  unsigned int i;
  ptrdiff_t src_stride, dst_stride;

  addFilterReg32 = _mm_set1_epi16(32);
  filtersReg = _mm_loadu_si128((const __m128i *)filter);
  // Halve the 16-bit taps so they fit in signed 8 bits, then narrow; both
  // halves of the register end up holding the same eight taps.
  filtersReg = _mm_srai_epi16(filtersReg, 1);
  filtersReg = _mm_packs_epi16(filtersReg, filtersReg);

  // Broadcast taps 2,3,4,5 to every 32-bit lane.
  firstFilters = _mm_shuffle_epi8(filtersReg, _mm_set1_epi32(0x5040302u));

  // Two rows are produced per iteration, so step by twice the pitch.
  src_stride = src_pitch << 1;
  dst_stride = out_pitch << 1;

  srcReg2 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 2));
  srcReg3 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 3));
  srcReg23 = _mm_unpacklo_epi32(srcReg2, srcReg3);

  srcReg4 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 4));

  // Place consecutive rows side by side in one register.
  srcReg34 = _mm_unpacklo_epi32(srcReg3, srcReg4);

  // Low half: rows 2/3 interleaved per pixel; high half: rows 3/4.
  srcReg23_34_lo = _mm_unpacklo_epi8(srcReg23, srcReg34);

  for (i = output_height; i > 1; i -= 2) {
    srcReg5 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 5));
    srcReg45 = _mm_unpacklo_epi32(srcReg4, srcReg5);

    srcReg6 = _mm_loadl_epi64((const __m128i *)(src_ptr + src_pitch * 6));
    srcReg56 = _mm_unpacklo_epi32(srcReg5, srcReg6);

    // merge every two consecutive registers
    srcReg45_56_lo = _mm_unpacklo_epi8(srcReg45, srcReg56);

    // lo: rows 2,3,4,5 per pixel (first output row);
    // hi: rows 3,4,5,6 per pixel (second output row).
    srcReg2345_3456_lo = _mm_unpacklo_epi16(srcReg23_34_lo, srcReg45_56_lo);
    srcReg2345_3456_hi = _mm_unpackhi_epi16(srcReg23_34_lo, srcReg45_56_lo);

    // multiply 2 adjacent elements with the filter and add the result
    resReglo = _mm_maddubs_epi16(srcReg2345_3456_lo, firstFilters);
    resReghi = _mm_maddubs_epi16(srcReg2345_3456_hi, firstFilters);

    // Add the two partial sums of every pixel; results land in the low
    // four 16-bit lanes.
    resReglo = _mm_hadds_epi16(resReglo, _mm_setzero_si128());
    resReghi = _mm_hadds_epi16(resReghi, _mm_setzero_si128());

    // round and shift by 6 bit each 16 bit
    resReglo = _mm_adds_epi16(resReglo, addFilterReg32);
    resReghi = _mm_adds_epi16(resReghi, addFilterReg32);
    resReglo = _mm_srai_epi16(resReglo, 6);
    resReghi = _mm_srai_epi16(resReghi, 6);

    // Narrow each 16-bit result to 8 bits with unsigned saturation; the low
    // 32 bits of each register then hold the four pixels of one output row.
    resReglo = _mm_packus_epi16(resReglo, resReglo);
    resReghi = _mm_packus_epi16(resReghi, resReghi);

    src_ptr += src_stride;

    // Four pixels per row are written through a 32-bit store.
    *((uint32_t *)(output_ptr)) = _mm_cvtsi128_si32(resReglo);
    *((uint32_t *)(output_ptr + out_pitch)) = _mm_cvtsi128_si32(resReghi);

    output_ptr += dst_stride;

    // Carry the rows that overlap with the next pair of outputs.
    srcReg23_34_lo = srcReg45_56_lo;
    srcReg4 = srcReg6;
  }
}
Exemple #20
// Hadamard transform
// Returns the difference between the weighted sum of the absolute value of
// transformed coefficients.
//
// inA, inB : top-left of two 4x4 pixel blocks whose rows are BPS bytes apart.
//            Each row is fetched with a 64-bit load, so 8 bytes per row must
//            be readable even though only 4 are used.
// w        : 16 weights, one per transformed coefficient.
//
// Both blocks are transformed in parallel (A in the low 64 bits, B in the
// high 64 bits of each register). Each stage lives in its own scope so the
// intermediate names can be reused.
static int TTransformSSE2(const uint8_t* inA, const uint8_t* inB,
                          const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;
  const __m128i zero = _mm_setzero_si128();
  const __m128i one = _mm_set1_epi16(1);
  const __m128i three = _mm_set1_epi16(3);

  // Load, combine and transpose inputs.
  {
    const __m128i inA_0 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 2]);
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi8(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi8(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi8(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi8(inA_3, inB_3);
    // (the upper eight bytes are never used below)
    // a00 b00 a01 b01 a02 b02 a03 b03   x x x x x x x x
    // a10 b10 a11 b11 a12 b12 a13 b13   x x x x x x x x
    // a20 b20 a21 b21 a22 b22 a23 b23   x x x x x x x x
    // a30 b30 a31 b31 a32 b32 a33 b33   x x x x x x x x

    // Transpose the two 4x4, discarding the unused upper bytes.
    const __m128i transpose0_0 = _mm_unpacklo_epi8(inAB_0, inAB_2);
    const __m128i transpose0_1 = _mm_unpacklo_epi8(inAB_1, inAB_3);
    // a00 a20  b00 b20  a01 a21  b01 b21  a02 a22  b02 b22  a03 a23  b03 b23
    // a10 a30  b10 b30  a11 a31  b11 b31  a12 a32  b12 b32  a13 a33  b13 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi8(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpackhi_epi8(transpose0_0, transpose0_1);
    // a00 a10 a20 a30  b00 b10 b20 b30  a01 a11 a21 a31  b01 b11 b21 b31
    // a02 a12 a22 a32  b02 b12 b22 b32  a03 a13 a23 a33  b03 b13 b23 b33

    // Convert to 16b.
    tmp_0 = _mm_unpacklo_epi8(transpose1_0, zero);
    tmp_1 = _mm_unpackhi_epi8(transpose1_0, zero);
    tmp_2 = _mm_unpacklo_epi8(transpose1_1, zero);
    tmp_3 = _mm_unpackhi_epi8(transpose1_1, zero);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Horizontal pass and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_slli_epi16(_mm_add_epi16(tmp_0, tmp_2), 2);
    const __m128i a1 = _mm_slli_epi16(_mm_add_epi16(tmp_1, tmp_3), 2);
    const __m128i a2 = _mm_slli_epi16(_mm_sub_epi16(tmp_1, tmp_3), 2);
    const __m128i a3 = _mm_slli_epi16(_mm_sub_epi16(tmp_0, tmp_2), 2);
    // b0_extra = (a0 != 0);
    const __m128i b0_extra = _mm_andnot_si128(_mm_cmpeq_epi16 (a0, zero), one);
    const __m128i b0_base = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    const __m128i b0 = _mm_add_epi16(b0_base, b0_extra);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    const __m128i transpose0_0 = _mm_unpacklo_epi16(b0, b1);
    const __m128i transpose0_1 = _mm_unpacklo_epi16(b2, b3);
    const __m128i transpose0_2 = _mm_unpackhi_epi16(b0, b1);
    const __m128i transpose0_3 = _mm_unpackhi_epi16(b2, b3);
    // a00 a10 a01 a11   a02 a12 a03 a13
    // a20 a30 a21 a31   a22 a32 a23 a33
    // b00 b10 b01 b11   b02 b12 b03 b13
    // b20 b30 b21 b31   b22 b32 b23 b33
    const __m128i transpose1_0 = _mm_unpacklo_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_1 = _mm_unpacklo_epi32(transpose0_2, transpose0_3);
    const __m128i transpose1_2 = _mm_unpackhi_epi32(transpose0_0, transpose0_1);
    const __m128i transpose1_3 = _mm_unpackhi_epi32(transpose0_2, transpose0_3);
    // a00 a10 a20 a30 a01 a11 a21 a31
    // b00 b10 b20 b30 b01 b11 b21 b31
    // a02 a12 a22 a32 a03 a13 a23 a33
    // b02 b12 b22 b32 b03 b13 b23 b33
    tmp_0 = _mm_unpacklo_epi64(transpose1_0, transpose1_1);
    tmp_1 = _mm_unpackhi_epi64(transpose1_0, transpose1_1);
    tmp_2 = _mm_unpacklo_epi64(transpose1_2, transpose1_3);
    tmp_3 = _mm_unpackhi_epi64(transpose1_2, transpose1_3);
    // a00 a10 a20 a30   b00 b10 b20 b30
    // a01 a11 a21 a31   b01 b11 b21 b31
    // a02 a12 a22 a32   b02 b12 b22 b32
    // a03 a13 a23 a33   b03 b13 b23 b33
  }

  // Vertical pass and difference of weighted sums.
  {
    // Load all inputs.
    // TODO(cduvivier): Make variable declarations and allocations aligned so
    //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    {
      // sign(b) = b >> 15  (0x0000 if positive, 0xffff if negative)
      const __m128i sign_A_b0 = _mm_srai_epi16(A_b0, 15);
      const __m128i sign_A_b2 = _mm_srai_epi16(A_b2, 15);
      const __m128i sign_B_b0 = _mm_srai_epi16(B_b0, 15);
      const __m128i sign_B_b2 = _mm_srai_epi16(B_b2, 15);

      // b = abs(b) = (b ^ sign) - sign
      A_b0 = _mm_xor_si128(A_b0, sign_A_b0);
      A_b2 = _mm_xor_si128(A_b2, sign_A_b2);
      B_b0 = _mm_xor_si128(B_b0, sign_B_b0);
      B_b2 = _mm_xor_si128(B_b2, sign_B_b2);
      A_b0 = _mm_sub_epi16(A_b0, sign_A_b0);
      A_b2 = _mm_sub_epi16(A_b2, sign_A_b2);
      B_b0 = _mm_sub_epi16(B_b0, sign_B_b0);
      B_b2 = _mm_sub_epi16(B_b2, sign_B_b2);
    }

    // b = abs(b) + 3
    A_b0 = _mm_add_epi16(A_b0, three);
    A_b2 = _mm_add_epi16(A_b2, three);
    B_b0 = _mm_add_epi16(B_b0, three);
    B_b2 = _mm_add_epi16(B_b2, three);

    // abs((b + (b<0) + 3) >> 3) = (abs(b) + 3) >> 3
    // b = (abs(b) + 3) >> 3
    A_b0 = _mm_srai_epi16(A_b0, 3);
    A_b2 = _mm_srai_epi16(A_b2, 3);
    B_b0 = _mm_srai_epi16(B_b0, 3);
    B_b2 = _mm_srai_epi16(B_b2, 3);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b0 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b0);
  }
  // Reduce the four 32-bit partial differences to a single value.
  return sum[0] + sum[1] + sum[2] + sum[3];
}
Exemple #21
		// Applies MulDiv32 to eight 16-bit lanes: each half of the inputs is
		// zero-extended to 32 bits, processed, and the two 32-bit results are
		// packed back to 16 bits with signed saturation.
		SIMD_INLINE __m128i MulDiv16(__m128i dividend, __m128i divisor, const __m128 & KF_255_DIV_6)
		{
			const __m128i quotientLo = MulDiv32(_mm_unpacklo_epi16(dividend, K_ZERO), _mm_unpacklo_epi16(divisor, K_ZERO), KF_255_DIV_6);
			const __m128i quotientHi = MulDiv32(_mm_unpackhi_epi16(dividend, K_ZERO), _mm_unpackhi_epi16(divisor, K_ZERO), KF_255_DIV_6);
			return _mm_packs_epi32(quotientLo, quotientHi);
		}
Exemple #22
// Simple quantization of one block of 16 coefficients (SSE2).
//
//   in   the 16 input coefficients. Overwritten with 'level * mtx->q_', or
//        with 0 where the sharpened magnitude does not exceed mtx->zthresh_.
//   out  receives the 16 signed quantized levels, stored in zigzag order.
//   n    when non-zero, the first output value is ignored by the returned
//        flag (presumably the index of the first coefficient to code --
//        confirm against the caller). All 16 values are processed regardless.
//   mtx  quantization tables. sharpen_, iq_, bias_, q_ and zthresh_ are each
//        read as two vectors of eight 16-bit lanes.
//
// Returns non-zero if at least one considered output level is non-zero.
static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16],
                             int n, const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(2047);
  const __m128i zero = _mm_set1_epi16(0);
  __m128i sign0, sign8;
  __m128i coeff0, coeff8;
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  // TODO(cduvivier): Make variable declarations and allocations aligned so that
  //                  we can use _mm_load_si128 instead of _mm_loadu_si128.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&mtx->sharpen_[0]);
  const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&mtx->sharpen_[8]);
  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
  const __m128i bias0 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
  const __m128i bias8 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);
  const __m128i zthresh0 = _mm_loadu_si128((const __m128i*)&mtx->zthresh_[0]);
  const __m128i zthresh8 = _mm_loadu_si128((const __m128i*)&mtx->zthresh_[8]);

  // sign(in) = in >> 15  (0x0000 if positive, 0xffff if negative)
  sign0 = _mm_srai_epi16(in0, 15);
  sign8 = _mm_srai_epi16(in8, 15);

  // coeff = abs(in) = (in ^ sign) - sign
  coeff0 = _mm_xor_si128(in0, sign0);
  coeff8 = _mm_xor_si128(in8, sign8);
  coeff0 = _mm_sub_epi16(coeff0, sign0);
  coeff8 = _mm_sub_epi16(coeff8, sign8);

  // coeff = abs(in) + sharpen
  coeff0 = _mm_add_epi16(coeff0, sharpen0);
  coeff8 = _mm_add_epi16(coeff8, sharpen8);

  // if (coeff > 2047) coeff = 2047
  coeff0 = _mm_min_epi16(coeff0, max_coeff_2047);
  coeff8 = _mm_min_epi16(coeff8, max_coeff_2047);

  // out = (coeff * iQ + B) >> QFIX;
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    // The high and low 16 bits of each unsigned product are computed
    // separately and interleaved to form the full 32-bit products.
    __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // expand bias from 16b to 32b
    __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero);
    __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero);
    __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero);
    __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero);
    // out = (coeff * iQ + B)
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = (coeff * iQ + B) >> QFIX;
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);
    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);
  }

  // get sign back (if (sign[j]) out_n = -out_n)
  out0 = _mm_xor_si128(out0, sign0);
  out8 = _mm_xor_si128(out8, sign8);
  out0 = _mm_sub_epi16(out0, sign0);
  out8 = _mm_sub_epi16(out8, sign8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  // if (coeff <= mtx->zthresh_) {in=0; out=0;}
  {
    // cmp is all-ones in the lanes that are kept, all-zeros elsewhere.
    __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0);
    __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8);
    in0 = _mm_and_si128(in0, cmp0);
    in8 = _mm_and_si128(in8, cmp8);
    _mm_storeu_si128((__m128i*)&in[0], in0);
    _mm_storeu_si128((__m128i*)&in[8], in8);
    out0 = _mm_and_si128(out0, cmp0);
    out8 = _mm_and_si128(out8, cmp8);
  }

  // zigzag the output before storing it.
  //
  // The zigzag pattern can almost be reproduced with a small sequence of
  // shuffles. After it, we only need to swap the 7th (ending up in third
  // position instead of twelfth) and 8th values.
  {
    __m128i outZ0, outZ8;
    outZ0 = _mm_shufflehi_epi16(out0,  _MM_SHUFFLE(2, 1, 3, 0));
    outZ0 = _mm_shuffle_epi32  (outZ0, _MM_SHUFFLE(3, 1, 2, 0));
    outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2));
    outZ8 = _mm_shufflelo_epi16(out8,  _MM_SHUFFLE(3, 0, 2, 1));
    outZ8 = _mm_shuffle_epi32  (outZ8, _MM_SHUFFLE(3, 1, 2, 0));
    outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0));
    _mm_storeu_si128((__m128i*)&out[0], outZ0);
    _mm_storeu_si128((__m128i*)&out[8], outZ8);
    // Packed before the scalar swap below: the swap only exchanges two
    // positions, neither of them position 0, so the zero test is unaffected.
    packed_out = _mm_packs_epi16(outZ0, outZ8);
  }
  {
    // Fix up the two values the shuffles left in each other's place.
    const int16_t outZ_12 = out[12];
    const int16_t outZ_3 = out[3];
    out[3] = outZ_12;
    out[12] = outZ_3;
  }

  // detect if all 'out' values are zeroes or not
  {
    int32_t tmp[4];
    _mm_storeu_si128((__m128i*)tmp, packed_out);
    if (n) {
      // Byte 0 of the packed vector is output position 0: drop it.
      tmp[0] &= ~0xff;
    }
    return (tmp[3] || tmp[2] || tmp[1] || tmp[0]);
  }
}
Exemple #23
// Averages a high-bitdepth prediction with a reference that is sampled at
// every 8th position, writing (pred + ref + 1) >> 1 per sample.
//
//   comp_pred   output; width x height 16-bit samples written contiguously
//               (no row padding).
//   pred8       prediction, converted with CONVERT_TO_SHORTPTR; read
//               contiguously, width samples per row.
//   width       samples per row. The >= 8 path handles 8 samples per step,
//               the other path 4 per step, so the row is processed in whole
//               steps of that size.
//   height      number of rows.
//   ref8        reference, converted with CONVERT_TO_SHORTPTR; consecutive
//               output samples read reference samples 8 apart.
//   ref_stride  consecutive rows read the reference (ref_stride << 3)
//               samples apart.
//
// The additions use unsigned saturation (_mm_adds_epu16).
void aom_highbd_comp_avg_upsampled_pred_sse2(uint16_t *comp_pred,
                                             const uint8_t *pred8, int width,
                                             int height, const uint8_t *ref8,
                                             const int ref_stride) {
  const __m128i one = _mm_set1_epi16(1);
  const int stride = ref_stride << 3;
  int i, j;
  const uint16_t *pred = CONVERT_TO_SHORTPTR(pred8);
  const uint16_t *ref = CONVERT_TO_SHORTPTR(ref8);

  if (width >= 8) {
    // read 8 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 8) {
        // Only the lowest 16-bit lane of each s* survives the unpack
        // sequence below, so a single sample is loaded per vector. This
        // avoids a type-punned 32-bit read that also fetched ref[k + 1].
        const __m128i s0 = _mm_cvtsi32_si128(ref[0]);
        const __m128i s1 = _mm_cvtsi32_si128(ref[8]);
        const __m128i s2 = _mm_cvtsi32_si128(ref[16]);
        const __m128i s3 = _mm_cvtsi32_si128(ref[24]);
        const __m128i s4 = _mm_cvtsi32_si128(ref[32]);
        const __m128i s5 = _mm_cvtsi32_si128(ref[40]);
        const __m128i s6 = _mm_cvtsi32_si128(ref[48]);
        const __m128i s7 = _mm_cvtsi32_si128(ref[56]);
        __m128i p0 = _mm_loadu_si128((const __m128i *)pred);
        __m128i t0, t1, t2, t3;

        // Gather the eight samples into one vector: s0 s1 s2 ... s7.
        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t2 = _mm_unpacklo_epi16(s4, s5);
        t3 = _mm_unpacklo_epi16(s6, s7);
        t0 = _mm_unpacklo_epi32(t0, t1);
        t2 = _mm_unpacklo_epi32(t2, t3);
        t0 = _mm_unpacklo_epi64(t0, t2);

        // (ref + pred + 1) >> 1
        p0 = _mm_adds_epu16(t0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);

        _mm_storeu_si128((__m128i *)(comp_pred), p0);
        comp_pred += 8;
        pred += 8;
        ref += 8 * 8;
      }
      // The inner loop advanced ref by width * 8; complete the row step.
      ref += stride - (width << 3);
    }
  } else {
    // read 4 points at one time
    for (i = 0; i < height; i++) {
      for (j = 0; j < width; j += 4) {
        // As above: only the lowest lane of each s* is used.
        const __m128i s0 = _mm_cvtsi32_si128(ref[0]);
        const __m128i s1 = _mm_cvtsi32_si128(ref[8]);
        const __m128i s2 = _mm_cvtsi32_si128(ref[16]);
        const __m128i s3 = _mm_cvtsi32_si128(ref[24]);
        __m128i p0 = _mm_loadl_epi64((const __m128i *)pred);
        __m128i t0, t1;

        // Gather the four samples into the low half: s0 s1 s2 s3.
        t0 = _mm_unpacklo_epi16(s0, s1);
        t1 = _mm_unpacklo_epi16(s2, s3);
        t0 = _mm_unpacklo_epi32(t0, t1);

        // (ref + pred + 1) >> 1
        p0 = _mm_adds_epu16(t0, p0);
        p0 = _mm_adds_epu16(p0, one);
        p0 = _mm_srli_epi16(p0, 1);

        // Only the low four results are stored.
        _mm_storel_epi64((__m128i *)(comp_pred), p0);
        comp_pred += 4;
        pred += 4;
        ref += 4 * 8;
      }
      ref += stride - (width << 3);
    }
  }
}
Exemple #24
/*
 * Horizontal inverse-DWT pass over subband_width rows (SSE2).
 *
 * For every row, 'l' and 'h' each supply subband_width 16-bit values and
 * 2 * subband_width interleaved values are written to 'dst':
 *
 *   dst[2n]     = l[n] - ((h[n-1] + h[n] + 1) >> 1)
 *   dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1)
 *
 * At the row edges h[-1] is replaced by h[0] and dst[2n + 2] by dst[2n].
 *
 * The even results are stored back into 'l' before being interleaved, so
 * 'l' is modified. l, h and dst are accessed with aligned 128-bit loads and
 * stores, eight values per step; they therefore have to be 16-byte aligned
 * and subband_width is presumably a multiple of 8 -- confirm at the callers.
 */
static void
rfx_dwt_2d_decode_block_horiz_sse2(INT16* l, INT16* h, INT16* dst, int subband_width)
{
	int y, n;
	INT16* l_ptr = l;
	INT16* h_ptr = h;
	INT16* dst_ptr = dst;
	int first;
	int last;
	__m128i l_n;
	__m128i h_n;
	__m128i h_n_m;
	__m128i tmp_n;
	__m128i dst_n;
	__m128i dst_n_p;
	__m128i dst1;
	__m128i dst2;

	for (y = 0; y < subband_width; y++)
	{
		/* Even coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n] = l[n] - ((h[n-1] + h[n] + 1) >> 1); */
			l_n = _mm_load_si128((__m128i*) l_ptr);
			h_n = _mm_load_si128((__m128i*) h_ptr);

			if (n == 0)
			{
				/*
				 * Row start: h[0] stands in for h[-1]. Build
				 * {h0, h0, h1, ..., h6} from h_n itself so that
				 * nothing in front of the row is read.
				 */
				first = _mm_extract_epi16(h_n, 0);
				h_n_m = _mm_slli_si128(h_n, 2);
				h_n_m = _mm_insert_epi16(h_n_m, first, 0);
			}
			else
			{
				h_n_m = _mm_loadu_si128((__m128i*) (h_ptr - 1));
			}

			tmp_n = _mm_add_epi16(h_n, h_n_m);
			tmp_n = _mm_add_epi16(tmp_n, _mm_set1_epi16(1));
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			dst_n = _mm_sub_epi16(l_n, tmp_n);
			/* Keep the even results in 'l' for the odd pass below. */
			_mm_store_si128((__m128i*) l_ptr, dst_n);
			l_ptr += 8;
			h_ptr += 8;
		}

		/* Rewind to the start of the row for the second pass. */
		l_ptr -= subband_width;
		h_ptr -= subband_width;

		/* Odd coefficients */
		for (n = 0; n < subband_width; n += 8)
		{
			/* dst[2n + 1] = (h[n] << 1) + ((dst[2n] + dst[2n + 2]) >> 1); */
			h_n = _mm_load_si128((__m128i*) h_ptr);
			h_n = _mm_slli_epi16(h_n, 1);
			dst_n = _mm_load_si128((__m128i*) (l_ptr));

			if (n == subband_width - 8)
			{
				/*
				 * Row end: the last even value stands in for its
				 * missing right neighbour. Build {d1, ..., d7, d7}
				 * from dst_n itself so that nothing behind the row
				 * is read.
				 */
				last = _mm_extract_epi16(dst_n, 7);
				dst_n_p = _mm_srli_si128(dst_n, 2);
				dst_n_p = _mm_insert_epi16(dst_n_p, last, 7);
			}
			else
			{
				dst_n_p = _mm_loadu_si128((__m128i*) (l_ptr + 1));
			}

			tmp_n = _mm_add_epi16(dst_n_p, dst_n);
			tmp_n = _mm_srai_epi16(tmp_n, 1);
			tmp_n = _mm_add_epi16(tmp_n, h_n);
			/* Interleave even and odd results: e0 o0 e1 o1 ... */
			dst1 = _mm_unpacklo_epi16(dst_n, tmp_n);
			dst2 = _mm_unpackhi_epi16(dst_n, tmp_n);
			_mm_store_si128((__m128i*) dst_ptr, dst1);
			_mm_store_si128((__m128i*) (dst_ptr + 8), dst2);
			l_ptr += 8;
			h_ptr += 8;
			dst_ptr += 16;
		}
	}
}
Exemple #25
// Quantizes one block of 16 coefficients.
//
//   in       the 16 input coefficients; overwritten with 'level * mtx->q_'.
//   out      receives the 16 signed quantized levels in zigzag order.
//   sharpen  optional: 16 values added to abs(in) before quantizing. Skipped
//            when NULL.
//   mtx      quantization tables. iq_ and q_ are read as eight 16-bit lanes
//            per vector; bias_ is read as four 32-bit lanes per vector.
//
// Returns non-zero if at least one output level is non-zero.
//
// Uses _mm_abs_epi16, _mm_sign_epi16 and _mm_shuffle_epi8, which are SSSE3
// intrinsics.
static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
                                             const uint16_t* const sharpen,
                                             const VP8Matrix* const mtx) {
  const __m128i max_level = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);

  // coeff = abs(in)
  __m128i coeff0 = _mm_abs_epi16(in0);
  __m128i coeff8 = _mm_abs_epi16(in8);

  // coeff = abs(in) + sharpen
  if (sharpen != NULL) {
    const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
    const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
    coeff0 = _mm_add_epi16(coeff0, sharpen0);
    coeff8 = _mm_add_epi16(coeff8, sharpen8);
  }

  // out = (coeff * iQ + B) >> QFIX
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    // The high and low 16 bits of each unsigned product are computed
    // separately and interleaved to form the full 32-bit products.
    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // out = (coeff * iQ + B)
    const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
    const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
    const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = QUANTDIV(coeff, iQ, B, QFIX)
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);

    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);

    // if (out > MAX_LEVEL) out = MAX_LEVEL
    out0 = _mm_min_epi16(out0, max_level);
    out8 = _mm_min_epi16(out8, max_level);
  }

  // put sign back
  out0 = _mm_sign_epi16(out0, in0);
  out8 = _mm_sign_epi16(out8, in8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  _mm_storeu_si128((__m128i*)&in[0], in0);
  _mm_storeu_si128((__m128i*)&in[8], in8);

  // zigzag the output before storing it. The re-ordering is:
  //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
  // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
  // There's only two misplaced entries ([8] and [7]) that are crossing the
  // reg's boundaries.
  // We use pshufb instead of pshuflo/pshufhi.
  {
    const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
    const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
    const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
    const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7);  // extract #7
    const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
    const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
    const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
    const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8);  // extract #8
    // Merge each shuffled half with the single entry taken from the
    // other register.
    const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
    const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
    _mm_storeu_si128((__m128i*)&out[0], out_z0);
    _mm_storeu_si128((__m128i*)&out[8], out_z8);
    packed_out = _mm_packs_epi16(out_z0, out_z8);
  }

  // detect if all 'out' values are zeroes or not
  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
Exemple #26
/*
 * Transposes an 8x8 matrix of 16-bit values.
 *
 * input       source vectors; row r of the matrix is input[i_indx + 4 * r]
 *             (r = 0..7), i.e. the rows are four vectors apart.
 * i_indx      index of the first source row.
 * Transposed  destination vectors; column c of the source is written to
 *             Transposed[t_indx + c] (c = 0..7).
 * t_indx      index of the first destination vector.
 */
static void tranpose8x8(__m128i *input,int i_indx, __m128i *Transposed,int t_indx)
{
    /* Load the eight rows. */
    const __m128i a = _mm_load_si128(&input[i_indx]);
    const __m128i b = _mm_load_si128(&input[i_indx + 4]);
    const __m128i c = _mm_load_si128(&input[i_indx + 8]);
    const __m128i d = _mm_load_si128(&input[i_indx + 12]);
    const __m128i e = _mm_load_si128(&input[i_indx + 16]);
    const __m128i f = _mm_load_si128(&input[i_indx + 20]);
    const __m128i g = _mm_load_si128(&input[i_indx + 24]);
    const __m128i h = _mm_load_si128(&input[i_indx + 28]);

    /* Interleave 16-bit lanes of neighbouring rows: a0 b0 a1 b1 ... */
    const __m128i temp1 = _mm_unpacklo_epi16(a, b);
    const __m128i temp2 = _mm_unpacklo_epi16(c, d);
    const __m128i temp3 = _mm_unpacklo_epi16(e, f);
    const __m128i temp4 = _mm_unpacklo_epi16(g, h);
    const __m128i temp5 = _mm_unpackhi_epi16(a, b);
    const __m128i temp6 = _mm_unpackhi_epi16(c, d);
    const __m128i temp7 = _mm_unpackhi_epi16(e, f);
    const __m128i temp8 = _mm_unpackhi_epi16(g, h);

    /* Interleave 32-bit pairs: a0 b0 c0 d0 a1 b1 c1 d1 ... */
    const __m128i temp9  = _mm_unpacklo_epi32(temp1, temp2);
    const __m128i temp10 = _mm_unpackhi_epi32(temp1, temp2);
    const __m128i temp11 = _mm_unpacklo_epi32(temp3, temp4);
    const __m128i temp12 = _mm_unpackhi_epi32(temp3, temp4);
    const __m128i temp13 = _mm_unpacklo_epi32(temp5, temp6);
    const __m128i temp14 = _mm_unpackhi_epi32(temp5, temp6);
    const __m128i temp15 = _mm_unpacklo_epi32(temp7, temp8);
    const __m128i temp16 = _mm_unpackhi_epi32(temp7, temp8);

    /* Interleave 64-bit halves: a0 b0 c0 d0 e0 f0 g0 h0 ... */
    const __m128i T0 = _mm_unpacklo_epi64(temp9, temp11);
    const __m128i T1 = _mm_unpackhi_epi64(temp9, temp11);
    const __m128i T2 = _mm_unpacklo_epi64(temp10, temp12);
    const __m128i T3 = _mm_unpackhi_epi64(temp10, temp12);
    const __m128i T4 = _mm_unpacklo_epi64(temp13, temp15);
    const __m128i T5 = _mm_unpackhi_epi64(temp13, temp15);
    const __m128i T6 = _mm_unpacklo_epi64(temp14, temp16);
    const __m128i T7 = _mm_unpackhi_epi64(temp14, temp16);

    /* Store the transposed 8x8 matrix. */
    _mm_store_si128(&Transposed[t_indx], T0);
    _mm_store_si128(&Transposed[t_indx + 1], T1);
    _mm_store_si128(&Transposed[t_indx + 2], T2);
    _mm_store_si128(&Transposed[t_indx + 3], T3);
    _mm_store_si128(&Transposed[t_indx + 4], T4);
    _mm_store_si128(&Transposed[t_indx + 5], T5);
    _mm_store_si128(&Transposed[t_indx + 6], T6);
    _mm_store_si128(&Transposed[t_indx + 7], T7);
}

Exemple #27
/* Input image must be large enough to have valid pixels for the offset (dx,dy).
   I.e., with (dx,dy)=(-10,8), x-value up to -10 and y-values up to (h-1)+8 will be accessed.
   The integral image will be access with (x,y) in [-1,w)x[-1,h).

   Note also that we use 32bit for the integral image even though the values may overflow
   that range. However, the modulo-arithmetic used when computing the block sums later
   will be still correct when the block size is not too large.
static void buildIntegralImage_SSE(uint32_t* integral_image, int integral_stride,
                                   const uint8_t* current_image, int current_image_stride,
				   const uint8_t* compare_image, int compare_image_stride,
				   int  w,int  hStart, int hEnd, 
				   int dx,int dy)
    const __m128i zero = _mm_set1_epi8(0);

    memset(integral_image -1 -integral_stride, 0, (w+1)*sizeof(uint32_t));

    for (int y=hStart;y<hEnd;y++) {
        const uint8_t* p1 = current_image +  y    *current_image_stride;
        const uint8_t* p2 = compare_image + (y+dy)*compare_image_stride + dx;

        uint32_t* out = integral_image + y*integral_stride-1;

        __m128i prevadd = _mm_set1_epi32(0);
        const int pixels_step = 16;

        *out++ = 0;

        for (int x=0 ; x<w ; x+=pixels_step)
            __m128i pa, pb;
            __m128i pla, plb;
            __m128i ldiff, lldiff, lhdiff;
            __m128i ltmp,htmp;
            __m128i ladd,hadd;
            __m128i pha,phb;
            __m128i hdiff,hldiff,hhdiff;
            __m128i l2tmp,h2tmp;

            pa = _mm_loadu_si128((__m128i*)p1);
            pb = _mm_loadu_si128((__m128i*)p2);

            pla = _mm_unpacklo_epi8(pa,zero);
            plb = _mm_unpacklo_epi8(pb,zero);

            ldiff = _mm_sub_epi16(pla,plb);
            ldiff = _mm_mullo_epi16(ldiff,ldiff);

            lldiff = _mm_unpacklo_epi16(ldiff,zero);
            lhdiff = _mm_unpackhi_epi16(ldiff,zero);

            ltmp = _mm_slli_si128(lldiff, 4);
            lldiff = _mm_add_epi32(lldiff, ltmp);
            ltmp = _mm_slli_si128(lldiff, 8);
            lldiff = _mm_add_epi32(lldiff, ltmp);
            lldiff = _mm_add_epi32(lldiff, prevadd);

            ladd = _mm_shuffle_epi32(lldiff, 0xff);

            htmp = _mm_slli_si128(lhdiff, 4);
            lhdiff = _mm_add_epi32(lhdiff, htmp);
            htmp = _mm_slli_si128(lhdiff, 8);
            lhdiff = _mm_add_epi32(lhdiff, htmp);
            lhdiff = _mm_add_epi32(lhdiff, ladd);

            prevadd = _mm_shuffle_epi32(lhdiff, 0xff);

            _mm_store_si128((__m128i*)(out),  lldiff);

            pha = _mm_unpackhi_epi8(pa,zero);
            phb = _mm_unpackhi_epi8(pb,zero);
            hdiff = _mm_sub_epi16(pha,phb);

            hdiff = _mm_mullo_epi16(hdiff,hdiff);

            hldiff = _mm_unpacklo_epi16(hdiff,zero);
            hhdiff = _mm_unpackhi_epi16(hdiff,zero);
            l2tmp = _mm_slli_si128(hldiff, 4);
            hldiff = _mm_add_epi32(hldiff, l2tmp);
            l2tmp = _mm_slli_si128(hldiff, 8);
            hldiff = _mm_add_epi32(hldiff, l2tmp);
            hldiff = _mm_add_epi32(hldiff, prevadd);
            hadd = _mm_shuffle_epi32(hldiff, 0xff);
            h2tmp = _mm_slli_si128(hhdiff, 4);
            hhdiff = _mm_add_epi32(hhdiff, h2tmp);
            h2tmp = _mm_slli_si128(hhdiff, 8);
            hhdiff = _mm_add_epi32(hhdiff, h2tmp);
            hhdiff = _mm_add_epi32(hhdiff, hadd);

            prevadd = _mm_shuffle_epi32(hhdiff, 0xff);

            _mm_store_si128((__m128i*)(out+8), hldiff);

            p1 +=pixels_step;
            p2 +=pixels_step;

        if (y>0) {
            out = integral_image + y*integral_stride;

            for (int x=0 ; x<w ; x+=pixels_step) {
                *((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride),

                *((__m128i*)(out+4)) = _mm_add_epi32(*(__m128i*)(out+4-integral_stride),

                *((__m128i*)(out+8)) = _mm_add_epi32(*(__m128i*)(out+8-integral_stride),

                *((__m128i*)(out+12)) = _mm_add_epi32(*(__m128i*)(out+12-integral_stride),

                out += 4*4;
Exemple #28
/*
 * Fills Transposed[0..11] from a 32-vector input of 16-bit values.
 *
 * Transposed[0..7]   are straight copies of input[2], input[6], ...,
 *                    input[30] (every fourth vector, starting at 2).
 * Transposed[8..11]  are the first four columns of the 8x8 matrix whose
 *                    rows are input[0], input[4], ..., input[28]; i.e.
 *                    lane r of Transposed[8 + j] is lane j of input[4 * r].
 *
 * Both arrays are accessed with aligned 128-bit loads and stores.
 */
static void trans_g_aiT16(__m128i *input, __m128i *Transposed)
{
    __m128i a, b, c, d, e, f, g, h;

    /* Copy input[2 + 4k] to Transposed[k] unchanged. */
    a = _mm_load_si128(&input[2]);
    b = _mm_load_si128(&input[6]);
    c = _mm_load_si128(&input[10]);
    d = _mm_load_si128(&input[14]);
    e = _mm_load_si128(&input[18]);
    f = _mm_load_si128(&input[22]);
    g = _mm_load_si128(&input[26]);
    h = _mm_load_si128(&input[30]);

    _mm_store_si128(&Transposed[0], a);
    _mm_store_si128(&Transposed[1], b);
    _mm_store_si128(&Transposed[2], c);
    _mm_store_si128(&Transposed[3], d);
    _mm_store_si128(&Transposed[4], e);
    _mm_store_si128(&Transposed[5], f);
    _mm_store_si128(&Transposed[6], g);
    _mm_store_si128(&Transposed[7], h);

    /* Load the rows input[0], input[4], ..., input[28]. */
    a = _mm_load_si128(&input[0]);
    b = _mm_load_si128(&input[4]);
    c = _mm_load_si128(&input[8]);
    d = _mm_load_si128(&input[12]);
    e = _mm_load_si128(&input[16]);
    f = _mm_load_si128(&input[20]);
    g = _mm_load_si128(&input[24]);
    h = _mm_load_si128(&input[28]);

    /*
     * Partial transpose: only columns 0..3 are produced, so only the low
     * halves of the rows take part in the unpack sequence.
     */
    {
        /* a0 b0 a1 b1 a2 b2 a3 b3, ... */
        const __m128i temp1 = _mm_unpacklo_epi16(a, b);
        const __m128i temp2 = _mm_unpacklo_epi16(c, d);
        const __m128i temp3 = _mm_unpacklo_epi16(e, f);
        const __m128i temp4 = _mm_unpacklo_epi16(g, h);

        /* a0 b0 c0 d0 a1 b1 c1 d1, ... */
        const __m128i temp9  = _mm_unpacklo_epi32(temp1, temp2);
        const __m128i temp10 = _mm_unpackhi_epi32(temp1, temp2);
        const __m128i temp11 = _mm_unpacklo_epi32(temp3, temp4);
        const __m128i temp12 = _mm_unpackhi_epi32(temp3, temp4);

        /* a0 b0 c0 d0 e0 f0 g0 h0, ... */
        const __m128i T0 = _mm_unpacklo_epi64(temp9, temp11);
        const __m128i T1 = _mm_unpackhi_epi64(temp9, temp11);
        const __m128i T2 = _mm_unpacklo_epi64(temp10, temp12);
        const __m128i T3 = _mm_unpackhi_epi64(temp10, temp12);

        /* Store columns 0..3. */
        _mm_store_si128(&Transposed[8], T0);
        _mm_store_si128(&Transposed[9], T1);
        _mm_store_si128(&Transposed[10], T2);
        _mm_store_si128(&Transposed[11], T3);
    }
}

Exemple #29
// Forward transform of the 4x4 block of differences 'src - ref' (SSE2).
//
//   src, ref  8-bit sample blocks; row r starts at offset r * BPS. Eight
//             bytes are loaded from every row, the first four are used.
//   out       receives the 16 resulting 16-bit coefficients.
//
// The work is done in two passes; the first one also transposes the data so
// that the second pass operates along the other direction.
static void FTransform(const uint8_t* src, const uint8_t* ref, int16_t* out) {
  const __m128i zero = _mm_setzero_si128();
  const __m128i seven = _mm_set1_epi16(7);
  const __m128i k937 = _mm_set1_epi32(937);
  const __m128i k1812 = _mm_set1_epi32(1812);
  const __m128i k51000 = _mm_set1_epi32(51000);
  const __m128i k12000_plus_one = _mm_set1_epi32(12000 + (1 << 16));
  const __m128i k5352_2217 = _mm_set_epi16(5352,  2217, 5352,  2217,
                                           5352,  2217, 5352,  2217);
  const __m128i k2217_5352 = _mm_set_epi16(2217, -5352, 2217, -5352,
                                           2217, -5352, 2217, -5352);
  const __m128i k88p = _mm_set_epi16(8, 8, 8, 8, 8, 8, 8, 8);
  const __m128i k88m = _mm_set_epi16(-8, 8, -8, 8, -8, 8, -8, 8);
  const __m128i k5352_2217p = _mm_set_epi16(2217, 5352, 2217, 5352,
                                            2217, 5352, 2217, 5352);
  const __m128i k5352_2217m = _mm_set_epi16(-5352, 2217, -5352, 2217,
                                            -5352, 2217, -5352, 2217);
  // Results of the first pass, consumed by the second one.
  __m128i v01, v32;

  // Difference between src and ref and initial transpose.
  {
    // Load src and convert to 16b.
    const __m128i src0 = _mm_loadl_epi64((const __m128i*)&src[0 * BPS]);
    const __m128i src1 = _mm_loadl_epi64((const __m128i*)&src[1 * BPS]);
    const __m128i src2 = _mm_loadl_epi64((const __m128i*)&src[2 * BPS]);
    const __m128i src3 = _mm_loadl_epi64((const __m128i*)&src[3 * BPS]);
    const __m128i src_0 = _mm_unpacklo_epi8(src0, zero);
    const __m128i src_1 = _mm_unpacklo_epi8(src1, zero);
    const __m128i src_2 = _mm_unpacklo_epi8(src2, zero);
    const __m128i src_3 = _mm_unpacklo_epi8(src3, zero);
    // Load ref and convert to 16b.
    const __m128i ref0 = _mm_loadl_epi64((const __m128i*)&ref[0 * BPS]);
    const __m128i ref1 = _mm_loadl_epi64((const __m128i*)&ref[1 * BPS]);
    const __m128i ref2 = _mm_loadl_epi64((const __m128i*)&ref[2 * BPS]);
    const __m128i ref3 = _mm_loadl_epi64((const __m128i*)&ref[3 * BPS]);
    const __m128i ref_0 = _mm_unpacklo_epi8(ref0, zero);
    const __m128i ref_1 = _mm_unpacklo_epi8(ref1, zero);
    const __m128i ref_2 = _mm_unpacklo_epi8(ref2, zero);
    const __m128i ref_3 = _mm_unpacklo_epi8(ref3, zero);
    // Compute difference. -> 00 01 02 03 00 00 00 00
    const __m128i diff0 = _mm_sub_epi16(src_0, ref_0);
    const __m128i diff1 = _mm_sub_epi16(src_1, ref_1);
    const __m128i diff2 = _mm_sub_epi16(src_2, ref_2);
    const __m128i diff3 = _mm_sub_epi16(src_3, ref_3);

    // Unpack and shuffle
    // 00 01 02 03   0 0 0 0
    // 10 11 12 13   0 0 0 0
    // 20 21 22 23   0 0 0 0
    // 30 31 32 33   0 0 0 0
    const __m128i shuf01 = _mm_unpacklo_epi32(diff0, diff1);
    const __m128i shuf23 = _mm_unpacklo_epi32(diff2, diff3);
    // 00 01 10 11 02 03 12 13
    // 20 21 30 31 22 23 32 33
    const __m128i shuf01_p =
        _mm_shufflehi_epi16(shuf01, _MM_SHUFFLE(2, 3, 0, 1));
    const __m128i shuf23_p =
        _mm_shufflehi_epi16(shuf23, _MM_SHUFFLE(2, 3, 0, 1));
    // 00 01 10 11 03 02 13 12
    // 20 21 30 31 23 22 33 32
    const __m128i s01 = _mm_unpacklo_epi64(shuf01_p, shuf23_p);
    const __m128i s32 = _mm_unpackhi_epi64(shuf01_p, shuf23_p);
    // 00 01 10 11 20 21 30 31
    // 03 02 13 12 23 22 33 32
    const __m128i a01 = _mm_add_epi16(s01, s32);
    const __m128i a32 = _mm_sub_epi16(s01, s32);
    // [d0 + d3 | d1 + d2 | ...] = [a0 a1 | a0' a1' | ... ]
    // [d0 - d3 | d1 - d2 | ...] = [a3 a2 | a3' a2' | ... ]

    const __m128i tmp0 = _mm_madd_epi16(a01, k88p);  // [ (a0 + a1) << 3, ... ]
    const __m128i tmp2 = _mm_madd_epi16(a01, k88m);  // [ (a0 - a1) << 3, ... ]
    const __m128i tmp1_1 = _mm_madd_epi16(a32, k5352_2217p);
    const __m128i tmp3_1 = _mm_madd_epi16(a32, k5352_2217m);
    const __m128i tmp1_2 = _mm_add_epi32(tmp1_1, k1812);
    const __m128i tmp3_2 = _mm_add_epi32(tmp3_1, k937);
    const __m128i tmp1   = _mm_srai_epi32(tmp1_2, 9);
    const __m128i tmp3   = _mm_srai_epi32(tmp3_2, 9);
    const __m128i s03 = _mm_packs_epi32(tmp0, tmp2);
    const __m128i s12 = _mm_packs_epi32(tmp1, tmp3);
    const __m128i s_lo = _mm_unpacklo_epi16(s03, s12);   // 0 1 0 1 0 1...
    const __m128i s_hi = _mm_unpackhi_epi16(s03, s12);   // 2 3 2 3 2 3
    const __m128i v23 = _mm_unpackhi_epi32(s_lo, s_hi);
    v01 = _mm_unpacklo_epi32(s_lo, s_hi);
    v32 = _mm_shuffle_epi32(v23, _MM_SHUFFLE(1, 0, 3, 2));  // 3 2 3 2 3 2..
  }

  // Second pass
  {
    // Same operations are done on the (0,3) and (1,2) pairs.
    // a0 = v0 + v3
    // a1 = v1 + v2
    // a3 = v0 - v3
    // a2 = v1 - v2
    const __m128i a01 = _mm_add_epi16(v01, v32);
    const __m128i a32 = _mm_sub_epi16(v01, v32);
    const __m128i a11 = _mm_unpackhi_epi64(a01, a01);
    const __m128i a22 = _mm_unpackhi_epi64(a32, a32);
    const __m128i a01_plus_7 = _mm_add_epi16(a01, seven);

    // d0 = (a0 + a1 + 7) >> 4;
    // d2 = (a0 - a1 + 7) >> 4;
    const __m128i c0 = _mm_add_epi16(a01_plus_7, a11);
    const __m128i c2 = _mm_sub_epi16(a01_plus_7, a11);
    const __m128i d0 = _mm_srai_epi16(c0, 4);
    const __m128i d2 = _mm_srai_epi16(c2, 4);

    // f1 = ((b3 * 5352 + b2 * 2217 + 12000) >> 16)
    // f3 = ((b3 * 2217 - b2 * 5352 + 51000) >> 16)
    const __m128i b23 = _mm_unpacklo_epi16(a22, a32);
    const __m128i c1 = _mm_madd_epi16(b23, k5352_2217);
    const __m128i c3 = _mm_madd_epi16(b23, k2217_5352);
    const __m128i d1 = _mm_add_epi32(c1, k12000_plus_one);
    const __m128i d3 = _mm_add_epi32(c3, k51000);
    const __m128i e1 = _mm_srai_epi32(d1, 16);
    const __m128i e3 = _mm_srai_epi32(d3, 16);
    const __m128i f1 = _mm_packs_epi32(e1, e1);
    const __m128i f3 = _mm_packs_epi32(e3, e3);
    // f1 = f1 + (a3 != 0);
    // The compare will return (0xffff, 0) for (==0, !=0). To turn that into the
    // desired (0, 1), we add one earlier through k12000_plus_one.
    // -> f1 = f1 + 1 - (a3 == 0)
    const __m128i g1 = _mm_add_epi16(f1, _mm_cmpeq_epi16(a32, zero));

    // Only the low halves of d0, g1, d2 and f3 are stored.
    const __m128i d0_g1 = _mm_unpacklo_epi64(d0, g1);
    const __m128i d2_f3 = _mm_unpacklo_epi64(d2, f3);
    _mm_storeu_si128((__m128i*)&out[0], d0_g1);
    _mm_storeu_si128((__m128i*)&out[8], d2_f3);
  }
}
 __m64 interpolvline128_3(__m128i* temp){

	__m128i xmm6;

	__m64 ret;

	__m128i xmm7 = _mm_setzero_si128();

	__m128i xmm0 = _mm_load_si128(temp++);
	__m128i xmm1 = _mm_load_si128(temp++);
	__m128i xmm2 = _mm_load_si128(temp++);
	__m128i xmm3 = _mm_load_si128(temp++);
	__m128i xmm4 = _mm_load_si128(temp++);
	__m128i xmm5 = _mm_load_si128(temp);

	xmm1 = _mm_add_epi16(xmm1,xmm4);
	xmm0 = _mm_add_epi16(xmm0,xmm5);

	xmm6 = _mm_set_epi32(0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB);

	xmm4 = _mm_mullo_epi16(xmm1, xmm6);
	xmm5 = _mm_mulhi_epi16(xmm1, xmm6);

	xmm1 = _mm_unpacklo_epi16(xmm4, xmm5);
	xmm6 = _mm_unpackhi_epi16(xmm4, xmm5);

	xmm7 = _mm_set_epi32(0x00140014,0x00140014,0x00140014,0x00140014);
	xmm5 = _mm_add_epi16(xmm2,xmm3);

	xmm4 = _mm_mullo_epi16(xmm5, xmm7);
	xmm5 = _mm_mulhi_epi16(xmm5, xmm7);

	xmm7 = _mm_unpacklo_epi16(xmm4, xmm5);
	xmm4 = _mm_unpackhi_epi16(xmm4, xmm5);

	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm6); 

	xmm6 = _mm_set_epi32(0x00010001,0x00010001,0x00010001,0x00010001);
	xmm6 = _mm_mulhi_epi16(xmm0, xmm6);

	xmm1 = _mm_unpacklo_epi16(xmm0, xmm6);
	xmm6 = _mm_unpackhi_epi16(xmm0, xmm6);

	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm6); 
	xmm1 = _mm_set_epi32(0x00000200,0x00000200,0x00000200,0x00000200);
	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm1);
	xmm5 = _mm_setzero_si128();

	xmm7 = _mm_srli_epi32(xmm7, 10);
	xmm7 = _mm_max_epi16(xmm7, xmm5); // preventing negative values
	xmm7 = _mm_slli_epi32(xmm7,16);
	xmm7 = _mm_srli_epi32(xmm7,16);

	xmm4 = _mm_srli_epi32(xmm4, 10);

	xmm4 = _mm_max_epi16(xmm4, xmm5); // preventing negative values
	xmm4 = _mm_slli_epi32(xmm4,16);
	xmm4 = _mm_srli_epi32(xmm4,16);

	xmm6 = _mm_packs_epi32(xmm7, xmm4);
	xmm1 = _mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010);
	xmm2 = _mm_add_epi16(xmm2,xmm1);
	xmm2 = _mm_max_epi16(xmm2, xmm5); // preventing negative values
	xmm2 = _mm_srli_epi16(xmm2,5);

	xmm3 = _mm_add_epi16(xmm3,xmm1);
	xmm3 = _mm_max_epi16(xmm3, xmm5); // preventing negative values
	xmm3 = _mm_srli_epi16(xmm3,5);

	xmm2 = _mm_packus_epi16(xmm2,xmm5);
	xmm3 = _mm_packus_epi16(xmm3,xmm5);
	xmm6 = _mm_packus_epi16(xmm6,xmm5);

	xmm7 = _mm_unpacklo_epi8(xmm2,xmm6);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm3);
	xmm6 = _mm_avg_epu8(xmm4,xmm7);

	xmm6 = _mm_srli_epi16(xmm6,8);
	xmm6 = _mm_packus_epi16(xmm6,xmm5);
	ret = _mm_movepi64_pi64(xmm6);

