int32_t dot_product(int16_t *x,
                    int16_t *y,
                    uint32_t N, //must be a multiple of 8
                    uint8_t output_shift)
{

  uint32_t n;

#if defined(__x86_64__) || defined(__i386__)
  __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
  __m64 mmtmp7;
  __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
  int32_t result;

  x128 = (__m128i*) x;
  y128 = (__m128i*) y;

  mmcumul_re = _mm_setzero_si128();
  mmcumul_im = _mm_setzero_si128();

  for (n=0; n<(N>>2); n++) {

    //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
    //    print_shorts("x",&x128[0]);
    //    print_shorts("y",&y128[0]);

    // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
    mmtmp1 = _mm_madd_epi16(x128[0],y128[0]);
    //    print_ints("re",&mmtmp1);
    // mmtmp1 contains real part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift);
    mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1);
    //    print_ints("re",&mmcumul_re);


    // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
    mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1));
    //    print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1));
    //    print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i);
    //        print_shorts("y",&mmtmp2);

    mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2);
    //        print_ints("im",&mmtmp3);
    // mmtmp3 contains imag part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift);
    mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3);
    //    print_ints("im",&mmcumul_im);

    x128++;
    y128++;
  }

  // this gives Re Re Im Im
  mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im);
  //  print_ints("cumul1",&mmcumul);

  // this gives Re Im Re Im
  mmcumul = _mm_hadd_epi32(mmcumul,mmcumul);

  //  print_ints("cumul2",&mmcumul);


  //mmcumul = _mm_srai_epi32(mmcumul,output_shift);
  // extract the lower half
  mmtmp7 = _mm_movepi64_pi64(mmcumul);
  //  print_ints("mmtmp7",&mmtmp7);
  // pack the result
  mmtmp7 = _mm_packs_pi32(mmtmp7,mmtmp7);
  //  print_shorts("mmtmp7",&mmtmp7);
  // convert back to integer
  result = _mm_cvtsi64_si32(mmtmp7);

  _mm_empty();
  _m_empty();

  return(result);

#elif defined(__arm__)
  int16x4_t *x_128=(int16x4_t*)x;
  int16x4_t *y_128=(int16x4_t*)y;
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int32x4_t re_cumul,im_cumul;
  int32x2_t re_cumul2,im_cumul2;
  int32x4_t shift = vdupq_n_s32(-output_shift); 
  int32x2x2_t result2;
  int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ;

  re_cumul = vdupq_n_s32(0);
  im_cumul = vdupq_n_s32(0); 

  for (n=0; n<(N>>2); n++) {

    tmp_re  = vmull_s16(*x_128++, *y_128++);
    //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])] 
    tmp_re1 = vmull_s16(*x_128++, *y_128++);
    //tmp_re1 = [Re(x[2])Re(y[2]) Im(x[2])Im(y[2]) Re(x[3])Re(y[3]) Im(x[3])Im(y[3])]
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(rx[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(rx[1]) Re(ch[2])Re(rx[2])+Im(ch[2])Im(rx[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(rx[3])]

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]

    re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
    im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
  }
  
  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
  re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
  im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
  result2   = vzip_s32(re_cumul2,im_cumul2);
  return(vget_lane_s32(result2.val[0],0));
#endif
}
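A minimal usage sketch for the routine above, exercising the SSE path (the buffer contents, N and output_shift values are illustrative assumptions; x and y hold interleaved 16-bit real/imaginary pairs):

#include <stdint.h>
#include <stdio.h>

int32_t dot_product(int16_t *x, int16_t *y, uint32_t N, uint8_t output_shift);

int main(void)
{
  /* 8 complex samples = 16 interleaved int16 values, 16-byte aligned for the
     128-bit loads above. */
  int16_t x[16] __attribute__((aligned(16))) =
      { 100,0,  0,100,  -100,0,  0,-100,  50,50,  -50,50,  50,-50,  -50,-50 };
  int16_t y[16] __attribute__((aligned(16)));
  for (int i = 0; i < 16; i++) y[i] = x[i];   /* y == x, so the result is purely real */

  /* The low 16 bits of the return value hold the accumulated real part,
     the next 16 bits the imaginary part (0 here). */
  int32_t z = dot_product(x, y, 8, 2);
  printf("re=%d im=%d\n", (int16_t)(z & 0xffff), (int16_t)(z >> 16));
  return 0;
}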
Example #2
OD_SIMD_INLINE od_m256i od_mm256_add_epi32(od_m256i a, od_m256i b) {
  od_m256i r;
  r.lo = _mm_add_epi32(a.lo, b.lo);
  r.hi = _mm_add_epi32(a.hi, b.hi);
  return r;
}
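od_m256i is not a compiler type; presumably it emulates a 256-bit vector as two __m128i halves. A minimal sketch of that assumption (not Daala's actual header):

#include <emmintrin.h>

/* Assumed layout: two SSE registers standing in for one 256-bit value. */
typedef struct {
  __m128i lo;
  __m128i hi;
} od_m256i;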
Example #3
inline static void
comp_func_helper_sse2(uint32_t *dest, int length, uint32_t color, uint32_t alpha)
{
   const __m128i v_color = _mm_set1_epi32(color);
   const __m128i v_a = _mm_set1_epi16(alpha);

   LOOP_ALIGNED_U1_A4(dest, length,
      { /* UOP */
         *dest = color + DRAW_BYTE_MUL(*dest, alpha);
         dest++; length--;
      },
      { /* A4OP */
         __m128i v_dest = _mm_load_si128((__m128i *)dest);

         v_dest = v4_byte_mul_sse2(v_dest, v_a);
         v_dest = _mm_add_epi32(v_dest, v_color);

         _mm_store_si128((__m128i *)dest, v_dest);

         dest += 4; length -= 4;
      })
}
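LOOP_ALIGNED_U1_A4 is defined elsewhere in the project; a plausible sketch of its shape (an assumption based on its name and use here, not the actual macro) runs the scalar UOP body until dest reaches 16-byte alignment, the A4OP body four pixels at a time, then UOP again for the tail:

#include <stdint.h>

/* Hypothetical reconstruction: UOP and A4OP are passed as braced statement blocks. */
#define LOOP_ALIGNED_U1_A4(DEST, LENGTH, UOP, A4OP)                    \
   do {                                                                \
      while ((LENGTH) > 0 && (((uintptr_t)(DEST)) & 15)) UOP           \
      while ((LENGTH) >= 4) A4OP                                       \
      while ((LENGTH) > 0) UOP                                         \
   } while (0)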

void
comp_func_solid_source_sse2(uint32_t *dest, int length, uint32_t color, uint32_t const_alpha)
{
   if (const_alpha == 255)
     {
        draw_memset32(dest, color, length);
     }
   else
     {
        /* The original snippet is cut off here; this branch is a reconstruction
           (an assumption): pre-multiply the color by const_alpha and let the
           helper above blend the destination with the inverse alpha. */
        int ialpha = 255 - const_alpha;
        color = DRAW_BYTE_MUL(color, const_alpha);
        comp_func_helper_sse2(dest, length, color, ialpha);
     }
}
Example #4
static inline void xor_salsa8_sse2(__m128i B[4], const __m128i Bx[4])
{
	__m128i X0, X1, X2, X3;
	__m128i T;
	int i;

	X0 = B[0] = _mm_xor_si128(B[0], Bx[0]);
	X1 = B[1] = _mm_xor_si128(B[1], Bx[1]);
	X2 = B[2] = _mm_xor_si128(B[2], Bx[2]);
	X3 = B[3] = _mm_xor_si128(B[3], Bx[3]);

	for (i = 0; i < 8; i += 2) {
		/* Operate on "columns". */
		T = _mm_add_epi32(X0, X3);
		X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
		X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
		T = _mm_add_epi32(X1, X0);
		X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
		X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
		T = _mm_add_epi32(X2, X1);
		X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
		X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
		T = _mm_add_epi32(X3, X2);
		X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
		X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

		/* Rearrange data. */
		X1 = _mm_shuffle_epi32(X1, 0x93);
		X2 = _mm_shuffle_epi32(X2, 0x4E);
		X3 = _mm_shuffle_epi32(X3, 0x39);

		/* Operate on "rows". */
		T = _mm_add_epi32(X0, X1);
		X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
		X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
		T = _mm_add_epi32(X3, X0);
		X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
		X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
		T = _mm_add_epi32(X2, X3);
		X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
		X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
		T = _mm_add_epi32(X1, X2);
		X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
		X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

		/* Rearrange data. */
		X1 = _mm_shuffle_epi32(X1, 0x39);
		X2 = _mm_shuffle_epi32(X2, 0x4E);
		X3 = _mm_shuffle_epi32(X3, 0x93);
	}

	B[0] = _mm_add_epi32(B[0], X0);
	B[1] = _mm_add_epi32(B[1], X1);
	B[2] = _mm_add_epi32(B[2], X2);
	B[3] = _mm_add_epi32(B[3], X3);
}
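Each _mm_slli_epi32/_mm_srli_epi32 pair above stands in for a 32-bit rotate, which SSE2 lacks; for reference, the scalar form of the first column step (not part of the original scrypt source):

#include <stdint.h>

static inline uint32_t rotl32(uint32_t v, int c)
{
	return (v << c) | (v >> (32 - c));
}

/* Scalar equivalent of: X1 ^= R(X0 + X3, 7) in the Salsa20/8 core. */
static inline uint32_t salsa_column_step(uint32_t x0, uint32_t x1, uint32_t x3)
{
	return x1 ^ rotl32(x0 + x3, 7);
}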
Example #5
void av1_fadst4_new_sse4_1(const __m128i *input, __m128i *output,
                           const int8_t *cos_bit, const int8_t *stage_range) {
  const int txfm_size = 4;
  const int num_per_128 = 4;
  const int32_t *cospi;
  __m128i buf0[4];
  __m128i buf1[4];
  int col_num = txfm_size / num_per_128;
  int bit;
  int col;
  (void)stage_range;
  for (col = 0; col < col_num; col++) {
    // stage 0;
    int32_t stage_idx = 0;
    buf0[0] = input[0 * col_num + col];
    buf0[1] = input[1 * col_num + col];
    buf0[2] = input[2 * col_num + col];
    buf0[3] = input[3 * col_num + col];

    // stage 1
    stage_idx++;
    buf1[0] = buf0[3];
    buf1[1] = buf0[0];
    buf1[2] = buf0[1];
    buf1[3] = buf0[2];

    // stage 2
    stage_idx++;
    bit = cos_bit[stage_idx];
    cospi = cospi_arr[bit - cos_bit_min];
    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[0], buf1[1], buf0[0], buf0[1],
                        bit);
    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[2], buf1[3], buf0[2],
                        buf0[3], bit);

    // stage 3
    stage_idx++;
    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);

    // stage 4
    stage_idx++;
    bit = cos_bit[stage_idx];
    cospi = cospi_arr[bit - cos_bit_min];
    buf0[0] = buf1[0];
    buf0[1] = buf1[1];
    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
                        buf0[3], bit);

    // stage 5
    stage_idx++;
    buf1[0] = buf0[0];
    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
    buf1[2] = buf0[3];
    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);

    output[0 * col_num + col] = buf1[0];
    output[1 * col_num + col] = buf1[1];
    output[2 * col_num + col] = buf1[2];
    output[3 * col_num + col] = buf1[3];
  }
}
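btf_32_sse4_1_type0 is a libaom butterfly macro defined elsewhere; conceptually it forms two cospi-weighted sums of its inputs and rounds them back down by `bit`. A scalar sketch of that rounding step (the exact butterfly formula is left as an assumption):

#include <stdint.h>

/* Rounded arithmetic right shift, as applied after each cospi-weighted butterfly:
   out = (in + (1 << (bit - 1))) >> bit. */
static inline int32_t round_shift_32(int64_t in, int bit)
{
  return (int32_t)((in + ((int64_t)1 << (bit - 1))) >> bit);
}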
Example #6
void ENC_MSG_x8(const unsigned char *PT,
                      unsigned char *CT,
                      const unsigned char *TAG,
                      const unsigned char *KS,
                      int length)
{
    __m128i or_mask, TWO,ctr_block, tmp, tmp1, tmp2, tmp3, tmp4, tmp5, tmp6, tmp7, ONE;
    int i,j,remainder_loc;
    if (length%16)
        length = length/16 + 1;
    else length/=16;
    ONE = _mm_setr_epi32(1,0,0,0);
	TWO = _mm_setr_epi32(2,0,0,0);
	ctr_block = _mm_setzero_si128();
	ctr_block = _mm_loadu_si128(((__m128i*)TAG));
	or_mask = _mm_setr_epi32(0,0,0,0x80000000);
	ctr_block = _mm_or_si128(ctr_block, or_mask);
	for (i=0; i< (length-length%8); i=i+8)
	{
		tmp = ctr_block;
		tmp1 = _mm_add_epi32(ctr_block, ONE);
		tmp2 = _mm_add_epi32(ctr_block, TWO);
		tmp3 = _mm_add_epi32(tmp2, ONE);
		tmp4 = _mm_add_epi32(tmp2, TWO);
		tmp5 = _mm_add_epi32(tmp4, ONE);
		tmp6 = _mm_add_epi32(tmp4, TWO);
		tmp7 = _mm_add_epi32(tmp6, ONE);
		ctr_block = _mm_add_epi32(tmp6, TWO);
		tmp = _mm_xor_si128(tmp, ((__m128i*)KS)[0]);
		tmp1 = _mm_xor_si128(tmp1, ((__m128i*)KS)[0]);
		tmp2 = _mm_xor_si128(tmp2, ((__m128i*)KS)[0]);
		tmp3 = _mm_xor_si128(tmp3, ((__m128i*)KS)[0]);
		tmp4 = _mm_xor_si128(tmp4, ((__m128i*)KS)[0]);
		tmp5 = _mm_xor_si128(tmp5, ((__m128i*)KS)[0]);
		tmp6 = _mm_xor_si128(tmp6, ((__m128i*)KS)[0]);
		tmp7 = _mm_xor_si128(tmp7, ((__m128i*)KS)[0]);
		for(j=1; j <10; j++) {
			tmp = _mm_aesenc_si128 (tmp, ((__m128i*)KS)[j]);
			tmp1 = _mm_aesenc_si128 (tmp1, ((__m128i*)KS)[j]);
			tmp2 = _mm_aesenc_si128 (tmp2, ((__m128i*)KS)[j]);
			tmp3 = _mm_aesenc_si128 (tmp3, ((__m128i*)KS)[j]);
			tmp4 = _mm_aesenc_si128 (tmp4, ((__m128i*)KS)[j]);
			tmp5 = _mm_aesenc_si128 (tmp5, ((__m128i*)KS)[j]);
			tmp6 = _mm_aesenc_si128 (tmp6, ((__m128i*)KS)[j]);
			tmp7 = _mm_aesenc_si128 (tmp7, ((__m128i*)KS)[j]);
		}
		tmp = _mm_aesenclast_si128 (tmp, ((__m128i*)KS)[j]);
		tmp1 = _mm_aesenclast_si128 (tmp1, ((__m128i*)KS)[j]);
		tmp2 = _mm_aesenclast_si128 (tmp2, ((__m128i*)KS)[j]);
		tmp3 = _mm_aesenclast_si128 (tmp3, ((__m128i*)KS)[j]);
		tmp4 = _mm_aesenclast_si128 (tmp4, ((__m128i*)KS)[j]);
		tmp5 = _mm_aesenclast_si128 (tmp5, ((__m128i*)KS)[j]);
		tmp6 = _mm_aesenclast_si128 (tmp6, ((__m128i*)KS)[j]);
		tmp7 = _mm_aesenclast_si128 (tmp7, ((__m128i*)KS)[j]);
		tmp = _mm_xor_si128(tmp,_mm_loadu_si128(&((__m128i*)PT)[i]));
		tmp1 = _mm_xor_si128(tmp1,_mm_loadu_si128(&((__m128i*)PT)[i+1]));
		tmp2 = _mm_xor_si128(tmp2,_mm_loadu_si128(&((__m128i*)PT)[i+2]));
		tmp3 = _mm_xor_si128(tmp3,_mm_loadu_si128(&((__m128i*)PT)[i+3]));
		tmp4 = _mm_xor_si128(tmp4,_mm_loadu_si128(&((__m128i*)PT)[i+4]));
		tmp5 = _mm_xor_si128(tmp5,_mm_loadu_si128(&((__m128i*)PT)[i+5]));
		tmp6 = _mm_xor_si128(tmp6,_mm_loadu_si128(&((__m128i*)PT)[i+6]));
		tmp7 = _mm_xor_si128(tmp7,_mm_loadu_si128(&((__m128i*)PT)[i+7]));
		_mm_storeu_si128(&((__m128i*)CT)[i],tmp);
		_mm_storeu_si128(&((__m128i*)CT)[i+1],tmp1);
		_mm_storeu_si128(&((__m128i*)CT)[i+2],tmp2);
		_mm_storeu_si128(&((__m128i*)CT)[i+3],tmp3);
		_mm_storeu_si128(&((__m128i*)CT)[i+4],tmp4);
		_mm_storeu_si128(&((__m128i*)CT)[i+5],tmp5);
		_mm_storeu_si128(&((__m128i*)CT)[i+6],tmp6);
		_mm_storeu_si128(&((__m128i*)CT)[i+7],tmp7);
	}
	// handling remainder and less than 8 blocks
	if (length%8==0)
		return;
	// remainder_loc records the index of the first block not handled by the 8-block loop above
	remainder_loc = length-length%8;
	for(i=0; i < (length%8); i++)
	{
		tmp = ctr_block;
		ctr_block = _mm_add_epi32(ctr_block, ONE);
		tmp = _mm_xor_si128(tmp, ((__m128i*)KS)[0]);

		for(j=1; j <10; j++) {
			tmp = _mm_aesenc_si128 (tmp, ((__m128i*)KS)[j]);
		}
		tmp = _mm_aesenclast_si128 (tmp, ((__m128i*)KS)[j]);
		tmp = _mm_xor_si128(tmp,_mm_loadu_si128(&((__m128i*)PT)[remainder_loc+i]));
		_mm_storeu_si128 (&((__m128i*)CT)[remainder_loc+i],tmp);
	}
}
Example #7
static void transform(hashState *state,int r)
{
  __m128i x0;
  __m128i x1;
  __m128i x2;
  __m128i x3;
  __m128i x4;
  __m128i x5;
  __m128i x6;
  __m128i x7;
  __m128i y0;
  __m128i y1;
  __m128i y2;
  __m128i y3;

  x0 = state->x[0];
  x1 = state->x[1];
  x2 = state->x[2];
  x3 = state->x[3];
  x4 = state->x[4];
  x5 = state->x[5];
  x6 = state->x[6];
  x7 = state->x[7];

  for (;r > 0;--r) {
    x4 = _mm_add_epi32(x0,x4);
    x5 = _mm_add_epi32(x1,x5);
    x6 = _mm_add_epi32(x2,x6);
    x7 = _mm_add_epi32(x3,x7);
    y0 = x2;
    y1 = x3;
    y2 = x0;
    y3 = x1;
    x0 = _mm_xor_si128(_mm_slli_epi32(y0,7),_mm_srli_epi32(y0,25));
    x1 = _mm_xor_si128(_mm_slli_epi32(y1,7),_mm_srli_epi32(y1,25));
    x2 = _mm_xor_si128(_mm_slli_epi32(y2,7),_mm_srli_epi32(y2,25));
    x3 = _mm_xor_si128(_mm_slli_epi32(y3,7),_mm_srli_epi32(y3,25));
    x0 = _mm_xor_si128(x0,x4);
    x1 = _mm_xor_si128(x1,x5);
    x2 = _mm_xor_si128(x2,x6);
    x3 = _mm_xor_si128(x3,x7);
    x4 = _mm_shuffle_epi32(x4,0x4e);
    x5 = _mm_shuffle_epi32(x5,0x4e);
    x6 = _mm_shuffle_epi32(x6,0x4e);
    x7 = _mm_shuffle_epi32(x7,0x4e);
    x4 = _mm_add_epi32(x0,x4);
    x5 = _mm_add_epi32(x1,x5);
    x6 = _mm_add_epi32(x2,x6);
    x7 = _mm_add_epi32(x3,x7);
    y0 = x1;
    y1 = x0;
    y2 = x3;
    y3 = x2;
    x0 = _mm_xor_si128(_mm_slli_epi32(y0,11),_mm_srli_epi32(y0,21));
    x1 = _mm_xor_si128(_mm_slli_epi32(y1,11),_mm_srli_epi32(y1,21));
    x2 = _mm_xor_si128(_mm_slli_epi32(y2,11),_mm_srli_epi32(y2,21));
    x3 = _mm_xor_si128(_mm_slli_epi32(y3,11),_mm_srli_epi32(y3,21));
    x0 = _mm_xor_si128(x0,x4);
    x1 = _mm_xor_si128(x1,x5);
    x2 = _mm_xor_si128(x2,x6);
    x3 = _mm_xor_si128(x3,x7);
    x4 = _mm_shuffle_epi32(x4,0xb1);
    x5 = _mm_shuffle_epi32(x5,0xb1);
    x6 = _mm_shuffle_epi32(x6,0xb1);
    x7 = _mm_shuffle_epi32(x7,0xb1);
  }

  state->x[0] = x0;
  state->x[1] = x1;
  state->x[2] = x2;
  state->x[3] = x3;
  state->x[4] = x4;
  state->x[5] = x5;
  state->x[6] = x6;
  state->x[7] = x7;
}
Example #8
void spu_interpreter::BG(SPUThread& CPU, spu_opcode_t op)
{
	CPU.GPR[op.rt].vi = _mm_add_epi32(sse_cmpgt_epu32(CPU.GPR[op.ra].vi, CPU.GPR[op.rb].vi), _mm_set1_epi32(1));
}
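sse_cmpgt_epu32 is an emulator helper for an unsigned 32-bit greater-than compare (all-ones where ra > rb); adding 1 to that mask yields the SPU Borrow Generate result. The per-element semantics, as a scalar sketch under that assumption:

#include <stdint.h>

/* BG: 1 when rb - ra produces no borrow (ra <= rb), else 0.
   Equivalent to (ra > rb ? 0xFFFFFFFF : 0) + 1. */
static inline uint32_t spu_bg_element(uint32_t ra, uint32_t rb)
{
	return (ra <= rb) ? 1u : 0u;
}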
Example #9
template<> void momentsInTile<uchar, int, int>( const cv::Mat& img, double* moments )
{
    typedef uchar T;
    typedef int WT;
    typedef int MT;
    Size size = img.size();
    int y;
    MT mom[10] = {0,0,0,0,0,0,0,0,0,0};
    bool useSIMD = checkHardwareSupport(CV_CPU_SSE2);

    for( y = 0; y < size.height; y++ )
    {
        const T* ptr = img.ptr<T>(y);
        int x0 = 0, x1 = 0, x2 = 0, x3 = 0, x = 0;

        if( useSIMD )
        {
            __m128i qx_init = _mm_setr_epi16(0, 1, 2, 3, 4, 5, 6, 7);
            __m128i dx = _mm_set1_epi16(8);
            __m128i z = _mm_setzero_si128(), qx0 = z, qx1 = z, qx2 = z, qx3 = z, qx = qx_init;

            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i p = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(ptr + x)), z);
                qx0 = _mm_add_epi32(qx0, _mm_sad_epu8(p, z));
                __m128i px = _mm_mullo_epi16(p, qx);
                __m128i sx = _mm_mullo_epi16(qx, qx);
                qx1 = _mm_add_epi32(qx1, _mm_madd_epi16(p, qx));
                qx2 = _mm_add_epi32(qx2, _mm_madd_epi16(p, sx));
                qx3 = _mm_add_epi32(qx3, _mm_madd_epi16(px, sx));

                qx = _mm_add_epi16(qx, dx);
            }
            int CV_DECL_ALIGNED(16) buf[4];
            _mm_store_si128((__m128i*)buf, qx0);
            x0 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx1);
            x1 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx2);
            x2 = buf[0] + buf[1] + buf[2] + buf[3];
            _mm_store_si128((__m128i*)buf, qx3);
            x3 = buf[0] + buf[1] + buf[2] + buf[3];
        }

        for( ; x < size.width; x++ )
        {
            WT p = ptr[x];
            WT xp = x * p, xxp;

            x0 += p;
            x1 += xp;
            xxp = xp * x;
            x2 += xxp;
            x3 += xxp * x;
        }

        WT py = y * x0, sy = y*y;

        mom[9] += ((MT)py) * sy;  // m03
        mom[8] += ((MT)x1) * sy;  // m12
        mom[7] += ((MT)x2) * y;  // m21
        mom[6] += x3;             // m30
        mom[5] += x0 * sy;        // m02
        mom[4] += x1 * y;         // m11
        mom[3] += x2;             // m20
        mom[2] += py;             // m01
        mom[1] += x1;             // m10
        mom[0] += x0;             // m00
    }

    for(int x = 0; x < 10; x++ )
        moments[x] = (double)mom[x];
}
Example #10
void hashcat_md4_64 (__m128i digests[4], __m128i W[16])
{
  __m128i a = digests[0];
  __m128i b = digests[1];
  __m128i c = digests[2];
  __m128i d = digests[3];

  __m128i tmp2;

  MD4_STEP (MD4_F , a, b, c, d, W[ 0], MD4C00, MD4S00);
  MD4_STEP (MD4_F , d, a, b, c, W[ 1], MD4C00, MD4S01);
  MD4_STEP (MD4_F , c, d, a, b, W[ 2], MD4C00, MD4S02);
  MD4_STEP (MD4_F , b, c, d, a, W[ 3], MD4C00, MD4S03);
  MD4_STEP (MD4_F , a, b, c, d, W[ 4], MD4C00, MD4S00);
  MD4_STEP (MD4_F , d, a, b, c, W[ 5], MD4C00, MD4S01);
  MD4_STEP (MD4_F , c, d, a, b, W[ 6], MD4C00, MD4S02);
  MD4_STEP (MD4_F , b, c, d, a, W[ 7], MD4C00, MD4S03);
  MD4_STEP (MD4_F , a, b, c, d, W[ 8], MD4C00, MD4S00);
  MD4_STEP (MD4_F , d, a, b, c, W[ 9], MD4C00, MD4S01);
  MD4_STEP (MD4_F , c, d, a, b, W[10], MD4C00, MD4S02);
  MD4_STEP (MD4_F , b, c, d, a, W[11], MD4C00, MD4S03);
  MD4_STEP (MD4_F , a, b, c, d, W[12], MD4C00, MD4S00);
  MD4_STEP (MD4_F , d, a, b, c, W[13], MD4C00, MD4S01);
  MD4_STEP (MD4_F , c, d, a, b, W[14], MD4C00, MD4S02);
  MD4_STEP (MD4_F , b, c, d, a, W[15], MD4C00, MD4S03);

  MD4_STEP (MD4_G , a, b, c, d, W[ 0], MD4C01, MD4S10);
  MD4_STEP (MD4_G , d, a, b, c, W[ 4], MD4C01, MD4S11);
  MD4_STEP (MD4_G , c, d, a, b, W[ 8], MD4C01, MD4S12);
  MD4_STEP (MD4_G , b, c, d, a, W[12], MD4C01, MD4S13);
  MD4_STEP (MD4_G , a, b, c, d, W[ 1], MD4C01, MD4S10);
  MD4_STEP (MD4_G , d, a, b, c, W[ 5], MD4C01, MD4S11);
  MD4_STEP (MD4_G , c, d, a, b, W[ 9], MD4C01, MD4S12);
  MD4_STEP (MD4_G , b, c, d, a, W[13], MD4C01, MD4S13);
  MD4_STEP (MD4_G , a, b, c, d, W[ 2], MD4C01, MD4S10);
  MD4_STEP (MD4_G , d, a, b, c, W[ 6], MD4C01, MD4S11);
  MD4_STEP (MD4_G , c, d, a, b, W[10], MD4C01, MD4S12);
  MD4_STEP (MD4_G , b, c, d, a, W[14], MD4C01, MD4S13);
  MD4_STEP (MD4_G , a, b, c, d, W[ 3], MD4C01, MD4S10);
  MD4_STEP (MD4_G , d, a, b, c, W[ 7], MD4C01, MD4S11);
  MD4_STEP (MD4_G , c, d, a, b, W[11], MD4C01, MD4S12);
  MD4_STEP (MD4_G , b, c, d, a, W[15], MD4C01, MD4S13);

  MD4_STEP (MD4_H1, a, b, c, d, W[ 0], MD4C02, MD4S20);
  MD4_STEP (MD4_H2, d, a, b, c, W[ 8], MD4C02, MD4S21);
  MD4_STEP (MD4_H1, c, d, a, b, W[ 4], MD4C02, MD4S22);
  MD4_STEP (MD4_H2, b, c, d, a, W[12], MD4C02, MD4S23);
  MD4_STEP (MD4_H1, a, b, c, d, W[ 2], MD4C02, MD4S20);
  MD4_STEP (MD4_H2, d, a, b, c, W[10], MD4C02, MD4S21);
  MD4_STEP (MD4_H1, c, d, a, b, W[ 6], MD4C02, MD4S22);
  MD4_STEP (MD4_H2, b, c, d, a, W[14], MD4C02, MD4S23);
  MD4_STEP (MD4_H1, a, b, c, d, W[ 1], MD4C02, MD4S20);
  MD4_STEP (MD4_H2, d, a, b, c, W[ 9], MD4C02, MD4S21);
  MD4_STEP (MD4_H1, c, d, a, b, W[ 5], MD4C02, MD4S22);
  MD4_STEP (MD4_H2, b, c, d, a, W[13], MD4C02, MD4S23);
  MD4_STEP (MD4_H1, a, b, c, d, W[ 3], MD4C02, MD4S20);
  MD4_STEP (MD4_H2, d, a, b, c, W[11], MD4C02, MD4S21);
  MD4_STEP (MD4_H1, c, d, a, b, W[ 7], MD4C02, MD4S22);
  MD4_STEP (MD4_H2, b, c, d, a, W[15], MD4C02, MD4S23);

  digests[0] = _mm_add_epi32 (a, digests[0]);
  digests[1] = _mm_add_epi32 (b, digests[1]);
  digests[2] = _mm_add_epi32 (c, digests[2]);
  digests[3] = _mm_add_epi32 (d, digests[3]);
}
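MD4_STEP and the MD4_F/G/H macros are hashcat definitions not shown here; per 32-bit lane they implement the standard MD4 step a = rotl(a + f(b,c,d) + w + k, s). A scalar sketch of the round-1 variant (illustrative, not hashcat's actual macros):

#include <stdint.h>

static inline uint32_t md4_rotl32(uint32_t x, int s) { return (x << s) | (x >> (32 - s)); }
static inline uint32_t md4_f(uint32_t b, uint32_t c, uint32_t d) { return (b & c) | (~b & d); }

/* One MD4 round-1 step: a = rotl(a + F(b,c,d) + w + k, s). */
static inline uint32_t md4_step_f(uint32_t a, uint32_t b, uint32_t c, uint32_t d,
                                  uint32_t w, uint32_t k, int s)
{
  return md4_rotl32(a + md4_f(b, c, d) + w + k, s);
}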
Example #11
static WEBP_INLINE __m128i GetRGBA32bWithUV(int y, const __m128i uv_part) {
  const __m128i y_part = _mm_loadu_si128(&VP8kYtoRGBA[y].m);
  const __m128i rgba1 = _mm_add_epi32(y_part, uv_part);
  const __m128i rgba2 = _mm_srai_epi32(rgba1, YUV_FIX2);
  return rgba2;
}
Example #12
static WEBP_INLINE __m128i LoadUVPart(int u, int v) {
  const __m128i u_part = _mm_loadu_si128(&VP8kUtoRGBA[u].m);
  const __m128i v_part = _mm_loadu_si128(&VP8kVtoRGBA[v].m);
  const __m128i uv_part = _mm_add_epi32(u_part, v_part);
  return uv_part;
}
Example #13
int
global_sse2_word(int                  queryLength,
                 unsigned short      *profile,
                 const unsigned char *dbSeq,
                 int                  dbLength,
                 unsigned short       gapOpen,
                 unsigned short       gapExtend,
                 unsigned short       ceiling,
                 struct f_struct     *f_str)
{
  int     i, j;

  int     score;
  int     scale;
  int     temp;
  int     distance;

  int     offset;
  int     position;

  int     cmp;
  int     iter;
    
  __m128i *pvH;
  __m128i *pvE;

  __m128i vE, vF, vH;
  __m128i vHNext;
  __m128i vFPrev;

  __m128i vGapOpen;
  __m128i vGapExtend;
  __m128i vCeiling;

  __m128i vScale;
  __m128i vScaleAmt;
  __m128i vScaleTmp;

  __m128i vTemp;
  __m128i vNull;

  __m128i *pvScore;

  scale = 0;
  iter = (queryLength + 7) / 8;
  offset = (queryLength - 1) % iter;
  position = 7 - (queryLength - 1) / iter;

  pvH = (__m128i *)f_str->workspace;
  pvE = pvH + iter;

  /* Load gap opening penalty to all elements of a constant */
  vGapOpen = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapOpen = _mm_insert_epi16 (vGapOpen, gapOpen, 0);
  vGapOpen = _mm_shufflelo_epi16 (vGapOpen, 0);
  vGapOpen = _mm_shuffle_epi32 (vGapOpen, 0);

  /* Load gap extension penalty to all elements of a constant */
  vGapExtend = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vGapExtend = _mm_insert_epi16 (vGapExtend, gapExtend, 0);
  vGapExtend = _mm_shufflelo_epi16 (vGapExtend, 0);
  vGapExtend = _mm_shuffle_epi32 (vGapExtend, 0);

  /* Generate the ceiling before scaling */
  vTemp = _mm_setzero_si128();	/* transferred from Apple Devel smith_waterman_sse2.c fix */
  vTemp = _mm_insert_epi16 (vTemp, ceiling, 0);
  vTemp = _mm_shufflelo_epi16 (vTemp, 0);
  vTemp = _mm_shuffle_epi32 (vTemp, 0);
  vCeiling = _mm_cmpeq_epi16 (vTemp, vTemp);
  vCeiling = _mm_srli_epi16 (vCeiling, 1);
  vCeiling = _mm_subs_epi16 (vCeiling, vTemp);
  vCeiling = _mm_subs_epi16 (vCeiling, vGapOpen);

  vNull = _mm_cmpeq_epi16 (vTemp, vTemp);
  vNull = _mm_slli_epi16 (vNull, 15);
  vScaleAmt = _mm_xor_si128 (vNull, vNull);

  /* Zero out the storage vector */
  vTemp = _mm_adds_epi16 (vNull, vGapOpen);
  for (i = 0; i < iter; i++) {
    _mm_store_si128 (pvH + i, vTemp);
    _mm_store_si128 (pvE + i, vNull);
  }

  /* initialize F */
  vF = vNull;
  vFPrev = vNull;

  /* load and scale H for the next round */
  vTemp = _mm_srli_si128 (vGapOpen, 14);
  vH = _mm_load_si128 (pvH + iter - 1);
  vH = _mm_adds_epi16 (vH, vTemp);

  for (i = 0; i < dbLength; ++i) {
    /* fetch first data asap. */
    pvScore = (__m128i *) profile + dbSeq[i] * iter;

    vF = vNull;

    vH = _mm_max_epi16 (vH, vFPrev);
    for (j = 0; j < iter; j++) {
      /* correct H from the previous columns F */
      vHNext = _mm_load_si128 (pvH + j);
      vHNext = _mm_max_epi16 (vHNext, vFPrev);

      /* load and correct E value */
      vE = _mm_load_si128 (pvE + j);
      vTemp = _mm_subs_epi16 (vHNext, vGapOpen);
      vE = _mm_max_epi16 (vE, vTemp);
      _mm_store_si128 (pvE + j, vE);

      /* add score to vH */
      vH = _mm_adds_epi16 (vH, *pvScore++);

      /* get max from vH, vE and vF */
      vH = _mm_max_epi16 (vH, vE);
      vH = _mm_max_epi16 (vH, vF);
      _mm_store_si128 (pvH + j, vH);

      /* update vF value */
      vH = _mm_subs_epi16 (vH, vGapOpen);
      vF = _mm_max_epi16 (vF, vH);

      /* load the next h values */
      vH = vHNext;
    }

    /* check if we need to scale before the next round */
    vTemp = _mm_cmpgt_epi16 (vF, vCeiling);
    cmp  = _mm_movemask_epi8 (vTemp);

    /* broadcast F values */
    vF = _mm_xor_si128 (vF, vNull);

    vTemp  = _mm_slli_si128 (vF, 2);
    vTemp = _mm_subs_epu16 (vTemp, vScaleAmt);
    vF = max_epu16 (vF, vTemp);

    vTemp  = _mm_slli_si128 (vF, 4);
    vScaleTmp = _mm_slli_si128 (vScaleAmt, 2);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vScaleAmt);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);

    vTemp = _mm_slli_si128 (vScaleTmp, 4);
    vScaleTmp = _mm_adds_epu16 (vScaleTmp, vTemp);
    vTemp  = _mm_slli_si128 (vF, 8);
    vTemp = _mm_subs_epu16 (vTemp, vScaleTmp);
    vF = max_epu16 (vF, vTemp);

    /* scale if necessary */
    if (cmp != 0x0000) {
      __m128i vScale1;
      __m128i vScale2;

      vScale = _mm_slli_si128 (vF, 2);
      vScale = _mm_subs_epu16 (vScale, vGapOpen);
      vScale = _mm_subs_epu16 (vScale, vScaleAmt);

      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vScale, vTemp);
      vScaleAmt = _mm_adds_epu16 (vScaleAmt, vTemp);
      vTemp = _mm_slli_si128 (vScale, 2);
      vTemp = _mm_subs_epu16 (vTemp, vScale);
      vScaleAmt = _mm_subs_epu16 (vScaleAmt, vTemp);

      /* rescale the previous F */
      vF = _mm_subs_epu16 (vF, vScale);

      /* check if we can continue in signed 16-bits */
      vTemp = _mm_xor_si128 (vF, vNull);
      vTemp = _mm_cmpgt_epi16 (vTemp, vCeiling);
      cmp  = _mm_movemask_epi8 (vTemp);
      if (cmp != 0x0000) {
        return OVERFLOW_SCORE;
      }

      vTemp   = _mm_adds_epi16 (vCeiling, vCeiling);
      vScale1 = _mm_subs_epu16 (vScale, vTemp);
      vScale2 = _mm_subs_epu16 (vScale, vScale1);

      /* scale all the vectors */
      for (j = 0; j < iter; j++) {
        /* load H and E */
        vH = _mm_load_si128 (pvH + j);
        vE = _mm_load_si128 (pvE + j);

        /* get max from vH, vE and vF */
        vH = _mm_subs_epi16 (vH, vScale1);
        vH = _mm_subs_epi16 (vH, vScale2);
        vE = _mm_subs_epi16 (vE, vScale1);
        vE = _mm_subs_epi16 (vE, vScale2);

        /* save the H and E */
        _mm_store_si128 (pvH + j, vH);
        _mm_store_si128 (pvE + j, vE);
      }

      vScale = vScaleAmt;
      for (j = 0; j < position; ++j) {
        vScale = _mm_slli_si128 (vScale, 2);
      }

      /* calculate the final scaling amount */
      vTemp   = _mm_xor_si128 (vTemp, vTemp);
      vScale1 = _mm_unpacklo_epi16 (vScale, vTemp);
      vScale2 = _mm_unpackhi_epi16 (vScale, vTemp);
      vScale  = _mm_add_epi32 (vScale1, vScale2);
      vTemp = _mm_srli_si128 (vScale, 8);
      vScale = _mm_add_epi32 (vScale, vTemp);
      vTemp = _mm_srli_si128 (vScale, 4);
      vScale = _mm_add_epi32 (vScale, vTemp);
      scale = (int) (unsigned short) _mm_extract_epi16 (vScale, 0);
      temp  = (int) (unsigned short) _mm_extract_epi16 (vScale, 1);
      scale = scale + (temp << 16);
    }

    /* scale the F value for the next round */
    vFPrev = _mm_slli_si128 (vF, 2);
    vFPrev = _mm_subs_epu16 (vFPrev, vScaleAmt);
    vFPrev = _mm_xor_si128 (vFPrev, vNull);

    /* load and scale H for the next round */
    vH = _mm_load_si128 (pvH + iter - 1);
    vH = _mm_xor_si128 (vH, vNull);
    vH = _mm_slli_si128 (vH, 2);
    vH = _mm_subs_epu16 (vH, vScaleAmt);
    vH = _mm_insert_epi16 (vH, gapOpen, 0);
    vH = _mm_xor_si128 (vH, vNull);
  }

  vH = _mm_load_si128 (pvH + offset);
  vH = _mm_max_epi16 (vH, vFPrev);
  for (j = 0; j < position; ++j) {
    vH = _mm_slli_si128 (vH, 2);
  }
  score = (int) (signed short) _mm_extract_epi16 (vH, 7);
  score = score + SHORT_BIAS;

  /* return largest score */
  distance = (queryLength + dbLength) * gapExtend;
  score = score - (gapOpen * 2) - distance + scale;

  return score;
}
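max_epu16 is not an SSE2 intrinsic (an unsigned 16-bit max only arrived with SSE4.1), so the project presumably emulates it; one common emulation, shown as an assumption rather than this file's actual helper:

#include <emmintrin.h>

/* max(a, b) for unsigned 16-bit lanes on SSE2:
   subs_epu16(b, a) saturates at 0, so a + max(b - a, 0) == max(a, b). */
static inline __m128i max_epu16_sse2_sketch(__m128i a, __m128i b)
{
  return _mm_adds_epu16(a, _mm_subs_epu16(b, a));
}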
Example #14
OD_SIMD_INLINE __m128i od_dct_mul_epi32(__m128i val, int32_t scale,
 int32_t offset, int32_t shift) {
  return _mm_srai_epi32(_mm_add_epi32(OD_MULLO_EPI32(val, scale),
   _mm_set1_epi32(offset)), shift);
}
Example #15
OD_SIMD_INLINE __m128i od_unbiased_rshift_epi32(__m128i a, int b) {
  return _mm_srai_epi32(_mm_add_epi32(_mm_srli_epi32(a, 32 - b), a), b);
}
Example #16
static WEBP_INLINE int DoQuantizeBlock_SSE41(int16_t in[16], int16_t out[16],
                                             const uint16_t* const sharpen,
                                             const VP8Matrix* const mtx) {
  const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL);
  const __m128i zero = _mm_setzero_si128();
  __m128i out0, out8;
  __m128i packed_out;

  // Load all inputs.
  __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]);
  __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]);
  const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]);
  const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]);
  const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]);
  const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]);

  // coeff = abs(in)
  __m128i coeff0 = _mm_abs_epi16(in0);
  __m128i coeff8 = _mm_abs_epi16(in8);

  // coeff = abs(in) + sharpen
  if (sharpen != NULL) {
    const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]);
    const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]);
    coeff0 = _mm_add_epi16(coeff0, sharpen0);
    coeff8 = _mm_add_epi16(coeff8, sharpen8);
  }

  // out = (coeff * iQ + B) >> QFIX
  {
    // doing calculations with 32b precision (QFIX=17)
    // out = (coeff * iQ)
    const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0);
    const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0);
    const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8);
    const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8);
    __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H);
    __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H);
    __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H);
    // out = (coeff * iQ + B)
    const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]);
    const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]);
    const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]);
    const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]);
    out_00 = _mm_add_epi32(out_00, bias_00);
    out_04 = _mm_add_epi32(out_04, bias_04);
    out_08 = _mm_add_epi32(out_08, bias_08);
    out_12 = _mm_add_epi32(out_12, bias_12);
    // out = QUANTDIV(coeff, iQ, B, QFIX)
    out_00 = _mm_srai_epi32(out_00, QFIX);
    out_04 = _mm_srai_epi32(out_04, QFIX);
    out_08 = _mm_srai_epi32(out_08, QFIX);
    out_12 = _mm_srai_epi32(out_12, QFIX);

    // pack result as 16b
    out0 = _mm_packs_epi32(out_00, out_04);
    out8 = _mm_packs_epi32(out_08, out_12);

    // if (coeff > 2047) coeff = 2047
    out0 = _mm_min_epi16(out0, max_coeff_2047);
    out8 = _mm_min_epi16(out8, max_coeff_2047);
  }

  // put sign back
  out0 = _mm_sign_epi16(out0, in0);
  out8 = _mm_sign_epi16(out8, in8);

  // in = out * Q
  in0 = _mm_mullo_epi16(out0, q0);
  in8 = _mm_mullo_epi16(out8, q8);

  _mm_storeu_si128((__m128i*)&in[0], in0);
  _mm_storeu_si128((__m128i*)&in[8], in8);

  // zigzag the output before storing it. The re-ordering is:
  //    0 1 2 3 4 5 6 7 | 8  9 10 11 12 13 14 15
  // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15
  // There are only two misplaced entries ([8] and [7]) that are crossing the
  // reg's boundaries.
  // We use pshufb instead of pshuflo/pshufhi.
  {
    const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6);
    const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1);
    const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo);
    const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7);  // extract #7
    const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7);
    const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1);
    const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi);
    const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8);  // extract #8
    const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8);
    const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7);
    _mm_storeu_si128((__m128i*)&out[0], out_z0);
    _mm_storeu_si128((__m128i*)&out[8], out_z8);
    packed_out = _mm_packs_epi16(out_z0, out_z8);
  }

  // detect if all 'out' values are zeroes or not
  return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff);
}
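For reference, the zigzag reordering described in the comment above, written out as a plain lookup table (scalar equivalent of the pshufb step; the table name is invented for this sketch):

/* Output position i takes the coefficient at input position kZigzagSketch[i],
   matching "0 1 4 8 5 2 3 6 | 9 12 13 10 7 11 14 15" above. */
static const int kZigzagSketch[16] = {
  0, 1, 4, 8, 5, 2, 3, 6, 9, 12, 13, 10, 7, 11, 14, 15
};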
Example #17
// Hadamard transform
// Returns the weighted sum of the absolute value of transformed coefficients.
// w[] contains a row-major 4 by 4 symmetric matrix.
static int TTransform_SSE41(const uint8_t* inA, const uint8_t* inB,
                            const uint16_t* const w) {
  int32_t sum[4];
  __m128i tmp_0, tmp_1, tmp_2, tmp_3;

  // Load and combine inputs.
  {
    const __m128i inA_0 = _mm_loadu_si128((const __m128i*)&inA[BPS * 0]);
    const __m128i inA_1 = _mm_loadu_si128((const __m128i*)&inA[BPS * 1]);
    const __m128i inA_2 = _mm_loadu_si128((const __m128i*)&inA[BPS * 2]);
    // In SSE4.1, with gcc 4.8 at least (maybe other versions),
    // _mm_loadu_si128 is faster than _mm_loadl_epi64. But for the last rows
    // of inA and inB, _mm_loadl_epi64 is still used so as not to read out of
    // bounds.
    const __m128i inA_3 = _mm_loadl_epi64((const __m128i*)&inA[BPS * 3]);
    const __m128i inB_0 = _mm_loadu_si128((const __m128i*)&inB[BPS * 0]);
    const __m128i inB_1 = _mm_loadu_si128((const __m128i*)&inB[BPS * 1]);
    const __m128i inB_2 = _mm_loadu_si128((const __m128i*)&inB[BPS * 2]);
    const __m128i inB_3 = _mm_loadl_epi64((const __m128i*)&inB[BPS * 3]);

    // Combine inA and inB (we'll do two transforms in parallel).
    const __m128i inAB_0 = _mm_unpacklo_epi32(inA_0, inB_0);
    const __m128i inAB_1 = _mm_unpacklo_epi32(inA_1, inB_1);
    const __m128i inAB_2 = _mm_unpacklo_epi32(inA_2, inB_2);
    const __m128i inAB_3 = _mm_unpacklo_epi32(inA_3, inB_3);
    tmp_0 = _mm_cvtepu8_epi16(inAB_0);
    tmp_1 = _mm_cvtepu8_epi16(inAB_1);
    tmp_2 = _mm_cvtepu8_epi16(inAB_2);
    tmp_3 = _mm_cvtepu8_epi16(inAB_3);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33
  }

  // Vertical pass first to avoid a transpose (vertical and horizontal passes
  // are commutative because w/kWeightY is symmetric) and subsequent transpose.
  {
    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);
    // a00 a01 a02 a03   b00 b01 b02 b03
    // a10 a11 a12 a13   b10 b11 b12 b13
    // a20 a21 a22 a23   b20 b21 b22 b23
    // a30 a31 a32 a33   b30 b31 b32 b33

    // Transpose the two 4x4.
    VP8Transpose_2_4x4_16b(&b0, &b1, &b2, &b3, &tmp_0, &tmp_1, &tmp_2, &tmp_3);
  }

  // Horizontal pass and difference of weighted sums.
  {
    // Load all inputs.
    const __m128i w_0 = _mm_loadu_si128((const __m128i*)&w[0]);
    const __m128i w_8 = _mm_loadu_si128((const __m128i*)&w[8]);

    // Calculate a and b (two 4x4 at once).
    const __m128i a0 = _mm_add_epi16(tmp_0, tmp_2);
    const __m128i a1 = _mm_add_epi16(tmp_1, tmp_3);
    const __m128i a2 = _mm_sub_epi16(tmp_1, tmp_3);
    const __m128i a3 = _mm_sub_epi16(tmp_0, tmp_2);
    const __m128i b0 = _mm_add_epi16(a0, a1);
    const __m128i b1 = _mm_add_epi16(a3, a2);
    const __m128i b2 = _mm_sub_epi16(a3, a2);
    const __m128i b3 = _mm_sub_epi16(a0, a1);

    // Separate the transforms of inA and inB.
    __m128i A_b0 = _mm_unpacklo_epi64(b0, b1);
    __m128i A_b2 = _mm_unpacklo_epi64(b2, b3);
    __m128i B_b0 = _mm_unpackhi_epi64(b0, b1);
    __m128i B_b2 = _mm_unpackhi_epi64(b2, b3);

    A_b0 = _mm_abs_epi16(A_b0);
    A_b2 = _mm_abs_epi16(A_b2);
    B_b0 = _mm_abs_epi16(B_b0);
    B_b2 = _mm_abs_epi16(B_b2);

    // weighted sums
    A_b0 = _mm_madd_epi16(A_b0, w_0);
    A_b2 = _mm_madd_epi16(A_b2, w_8);
    B_b0 = _mm_madd_epi16(B_b0, w_0);
    B_b2 = _mm_madd_epi16(B_b2, w_8);
    A_b0 = _mm_add_epi32(A_b0, A_b2);
    B_b0 = _mm_add_epi32(B_b0, B_b2);

    // difference of weighted sums
    A_b2 = _mm_sub_epi32(A_b0, B_b0);
    _mm_storeu_si128((__m128i*)&sum[0], A_b2);
  }
  return sum[0] + sum[1] + sum[2] + sum[3];
}
Example #18
/* Compute reflection coefficients from input signal */
void silk_burg_modified_sse4_1(
    opus_int32                  *res_nrg,           /* O    Residual energy                                             */
    opus_int                    *res_nrg_Q,         /* O    Residual energy Q value                                     */
    opus_int32                  A_Q16[],            /* O    Prediction coefficients (length order)                      */
    const opus_int16            x[],                /* I    Input signal, length: nb_subfr * (D + subfr_length)       */
    const opus_int32            minInvGain_Q30,     /* I    Inverse of max prediction gain                              */
    const opus_int              subfr_length,       /* I    Input signal subframe length (incl. D preceding samples)    */
    const opus_int              nb_subfr,           /* I    Number of subframes stacked in x                            */
    const opus_int              D,                  /* I    Order                                                       */
    int                         arch                /* I    Run-time architecture                                       */
)
{
    opus_int         k, n, s, lz, rshifts, rshifts_extra, reached_max_gain;
    opus_int32       C0, num, nrg, rc_Q31, invGain_Q30, Atmp_QA, Atmp1, tmp1, tmp2, x1, x2;
    const opus_int16 *x_ptr;
    opus_int32       C_first_row[ SILK_MAX_ORDER_LPC ];
    opus_int32       C_last_row[  SILK_MAX_ORDER_LPC ];
    opus_int32       Af_QA[       SILK_MAX_ORDER_LPC ];
    opus_int32       CAf[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       CAb[ SILK_MAX_ORDER_LPC + 1 ];
    opus_int32       xcorr[ SILK_MAX_ORDER_LPC ];

    __m128i FIRST_3210, LAST_3210, ATMP_3210, TMP1_3210, TMP2_3210, T1_3210, T2_3210, PTR_3210, SUBFR_3210, X1_3210, X2_3210;
    __m128i CONST1 = _mm_set1_epi32(1);

    silk_assert(subfr_length * nb_subfr <= MAX_FRAME_SIZE);

    /* Compute autocorrelations, added over subframes */
    silk_sum_sqr_shift(&C0, &rshifts, x, nb_subfr * subfr_length);
    if(rshifts > MAX_RSHIFTS) {
        C0 = silk_LSHIFT32(C0, rshifts - MAX_RSHIFTS);
        silk_assert(C0 > 0);
        rshifts = MAX_RSHIFTS;
    } else {
        lz = silk_CLZ32(C0) - 1;
        rshifts_extra = N_BITS_HEAD_ROOM - lz;
        if(rshifts_extra > 0) {
            rshifts_extra = silk_min(rshifts_extra, MAX_RSHIFTS - rshifts);
            C0 = silk_RSHIFT32(C0, rshifts_extra);
        } else {
            rshifts_extra = silk_max(rshifts_extra, MIN_RSHIFTS - rshifts);
            C0 = silk_LSHIFT32(C0, -rshifts_extra);
        }
        rshifts += rshifts_extra;
    }
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1;                                /* Q(-rshifts) */
    silk_memset(C_first_row, 0, SILK_MAX_ORDER_LPC * sizeof(opus_int32));
    if(rshifts > 0) {
        for(s = 0; s < nb_subfr; s++) {
            x_ptr = x + s * subfr_length;
            for(n = 1; n < D + 1; n++) {
                C_first_row[ n - 1 ] += (opus_int32)silk_RSHIFT64(
                    silk_inner_prod16_aligned_64(x_ptr, x_ptr + n, subfr_length - n, arch), rshifts);
            }
        }
    } else {
        for(s = 0; s < nb_subfr; s++) {
            int i;
            opus_int32 d;
            x_ptr = x + s * subfr_length;
            celt_pitch_xcorr(x_ptr, x_ptr + 1, xcorr, subfr_length - D, D, arch);
            for(n = 1; n < D + 1; n++) {
               for (i = n + subfr_length - D, d = 0; i < subfr_length; i++)
                  d = MAC16_16(d, x_ptr[ i ], x_ptr[ i - n ]);
               xcorr[ n - 1 ] += d;
            }
            for(n = 1; n < D + 1; n++) {
                C_first_row[ n - 1 ] += silk_LSHIFT32(xcorr[ n - 1 ], -rshifts);
            }
        }
    }
    silk_memcpy(C_last_row, C_first_row, SILK_MAX_ORDER_LPC * sizeof(opus_int32));

    /* Initialize */
    CAb[ 0 ] = CAf[ 0 ] = C0 + silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0) + 1;                                /* Q(-rshifts) */

    invGain_Q30 = (opus_int32)1 << 30;
    reached_max_gain = 0;
    for(n = 0; n < D; n++) {
        /* Update first row of correlation matrix (without first element) */
        /* Update last row of correlation matrix (without last element, stored in reversed order) */
        /* Update C * Af */
        /* Update C * flipud(Af) (stored in reversed order) */
        if(rshifts > -2) {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                x1  = -silk_LSHIFT32((opus_int32)x_ptr[ n ],                    16 - rshifts);        /* Q(16-rshifts) */
                x2  = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 16 - rshifts);        /* Q(16-rshifts) */
                tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ],                    QA - 16);             /* Q(QA-16) */
                tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], QA - 16);             /* Q(QA-16) */
                for(k = 0; k < n; k++) {
                    C_first_row[ k ] = silk_SMLAWB(C_first_row[ k ], x1, x_ptr[ n - k - 1 ]           ); /* Q(-rshifts) */
                    C_last_row[ k ]  = silk_SMLAWB(C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */
                    Atmp_QA = Af_QA[ k ];
                    tmp1 = silk_SMLAWB(tmp1, Atmp_QA, x_ptr[ n - k - 1 ]           );                 /* Q(QA-16) */
                    tmp2 = silk_SMLAWB(tmp2, Atmp_QA, x_ptr[ subfr_length - n + k ]);                 /* Q(QA-16) */
                }
                tmp1 = silk_LSHIFT32(-tmp1, 32 - QA - rshifts);                                       /* Q(16-rshifts) */
                tmp2 = silk_LSHIFT32(-tmp2, 32 - QA - rshifts);                                       /* Q(16-rshifts) */
                for(k = 0; k <= n; k++) {
                    CAf[ k ] = silk_SMLAWB(CAf[ k ], tmp1, x_ptr[ n - k ]                   );        /* Q(-rshift) */
                    CAb[ k ] = silk_SMLAWB(CAb[ k ], tmp2, x_ptr[ subfr_length - n + k - 1 ]);        /* Q(-rshift) */
                }
            }
        } else {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                x1  = -silk_LSHIFT32((opus_int32)x_ptr[ n ],                    -rshifts);            /* Q(-rshifts) */
                x2  = -silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], -rshifts);            /* Q(-rshifts) */
                tmp1 = silk_LSHIFT32((opus_int32)x_ptr[ n ],                    17);                  /* Q17 */
                tmp2 = silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n - 1 ], 17);                  /* Q17 */

                X1_3210 = _mm_set1_epi32(x1);
                X2_3210 = _mm_set1_epi32(x2);
                TMP1_3210 = _mm_setzero_si128();
                TMP2_3210 = _mm_setzero_si128();
                for(k = 0; k < n - 3; k += 4) {
                    PTR_3210   = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 1 - 3 ]);
                    SUBFR_3210 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k ]);
                    FIRST_3210 = _mm_loadu_si128((__m128i *)&C_first_row[ k ]);
                    PTR_3210   = _mm_shuffle_epi32(PTR_3210,  _MM_SHUFFLE(0, 1, 2, 3));
                    LAST_3210  = _mm_loadu_si128((__m128i *)&C_last_row[ k ]);
                    ATMP_3210  = _mm_loadu_si128((__m128i *)&Af_QA[ k ]);

                    T1_3210 = _mm_mullo_epi32(PTR_3210, X1_3210);
                    T2_3210 = _mm_mullo_epi32(SUBFR_3210, X2_3210);

                    ATMP_3210 = _mm_srai_epi32(ATMP_3210, 7);
                    ATMP_3210 = _mm_add_epi32(ATMP_3210, CONST1);
                    ATMP_3210 = _mm_srai_epi32(ATMP_3210, 1);

                    FIRST_3210 = _mm_add_epi32(FIRST_3210, T1_3210);
                    LAST_3210 = _mm_add_epi32(LAST_3210, T2_3210);

                    PTR_3210   = _mm_mullo_epi32(ATMP_3210, PTR_3210);
                    SUBFR_3210   = _mm_mullo_epi32(ATMP_3210, SUBFR_3210);

                    _mm_storeu_si128((__m128i *)&C_first_row[ k ], FIRST_3210);
                    _mm_storeu_si128((__m128i *)&C_last_row[ k ], LAST_3210);

                    TMP1_3210 = _mm_add_epi32(TMP1_3210, PTR_3210);
                    TMP2_3210 = _mm_add_epi32(TMP2_3210, SUBFR_3210);
                }

                TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_unpackhi_epi64(TMP1_3210, TMP1_3210));
                TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_unpackhi_epi64(TMP2_3210, TMP2_3210));
                TMP1_3210 = _mm_add_epi32(TMP1_3210, _mm_shufflelo_epi16(TMP1_3210, 0x0E));
                TMP2_3210 = _mm_add_epi32(TMP2_3210, _mm_shufflelo_epi16(TMP2_3210, 0x0E));

                tmp1 += _mm_cvtsi128_si32(TMP1_3210);
                tmp2 += _mm_cvtsi128_si32(TMP2_3210);

                for(; k < n; k++) {
                    C_first_row[ k ] = silk_MLA(C_first_row[ k ], x1, x_ptr[ n - k - 1 ]           ); /* Q(-rshifts) */
                    C_last_row[ k ]  = silk_MLA(C_last_row[ k ],  x2, x_ptr[ subfr_length - n + k ]); /* Q(-rshifts) */
                    Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 17);                                   /* Q17 */
                    tmp1 = silk_MLA(tmp1, x_ptr[ n - k - 1 ],            Atmp1);                      /* Q17 */
                    tmp2 = silk_MLA(tmp2, x_ptr[ subfr_length - n + k ], Atmp1);                      /* Q17 */
                }

                tmp1 = -tmp1;                /* Q17 */
                tmp2 = -tmp2;                /* Q17 */

                {
                    __m128i xmm_tmp1, xmm_tmp2;
                    __m128i xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1;
                    __m128i xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1;

                    xmm_tmp1 = _mm_set1_epi32(tmp1);
                    xmm_tmp2 = _mm_set1_epi32(tmp2);

                    for(k = 0; k <= n - 3; k += 4) {
                        xmm_x_ptr_n_k_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ n - k - 3 ]);
                        xmm_x_ptr_sub_x2x0 = OP_CVTEPI16_EPI32_M64(&x_ptr[ subfr_length - n + k - 1 ]);

                        xmm_x_ptr_n_k_x2x0 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 1, 2, 3));

                        xmm_x_ptr_n_k_x2x0 = _mm_slli_epi32(xmm_x_ptr_n_k_x2x0, -rshifts - 1);
                        xmm_x_ptr_sub_x2x0 = _mm_slli_epi32(xmm_x_ptr_sub_x2x0, -rshifts - 1);

                        /* equal shift right 4 bytes, xmm_x_ptr_n_k_x3x1 = _mm_srli_si128(xmm_x_ptr_n_k_x2x0, 4)*/
                        xmm_x_ptr_n_k_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_n_k_x2x0, _MM_SHUFFLE(0, 3, 2, 1));
                        xmm_x_ptr_sub_x3x1 = _mm_shuffle_epi32(xmm_x_ptr_sub_x2x0, _MM_SHUFFLE(0, 3, 2, 1));

                        xmm_x_ptr_n_k_x2x0 = _mm_mul_epi32(xmm_x_ptr_n_k_x2x0, xmm_tmp1);
                        xmm_x_ptr_n_k_x3x1 = _mm_mul_epi32(xmm_x_ptr_n_k_x3x1, xmm_tmp1);
                        xmm_x_ptr_sub_x2x0 = _mm_mul_epi32(xmm_x_ptr_sub_x2x0, xmm_tmp2);
                        xmm_x_ptr_sub_x3x1 = _mm_mul_epi32(xmm_x_ptr_sub_x3x1, xmm_tmp2);

                        xmm_x_ptr_n_k_x2x0 = _mm_srli_epi64(xmm_x_ptr_n_k_x2x0, 16);
                        xmm_x_ptr_n_k_x3x1 = _mm_slli_epi64(xmm_x_ptr_n_k_x3x1, 16);
                        xmm_x_ptr_sub_x2x0 = _mm_srli_epi64(xmm_x_ptr_sub_x2x0, 16);
                        xmm_x_ptr_sub_x3x1 = _mm_slli_epi64(xmm_x_ptr_sub_x3x1, 16);

                        xmm_x_ptr_n_k_x2x0 = _mm_blend_epi16(xmm_x_ptr_n_k_x2x0, xmm_x_ptr_n_k_x3x1, 0xCC);
                        xmm_x_ptr_sub_x2x0 = _mm_blend_epi16(xmm_x_ptr_sub_x2x0, xmm_x_ptr_sub_x3x1, 0xCC);

                        X1_3210  = _mm_loadu_si128((__m128i *)&CAf[ k ]);
                        PTR_3210 = _mm_loadu_si128((__m128i *)&CAb[ k ]);

                        X1_3210  = _mm_add_epi32(X1_3210, xmm_x_ptr_n_k_x2x0);
                        PTR_3210 = _mm_add_epi32(PTR_3210, xmm_x_ptr_sub_x2x0);

                        _mm_storeu_si128((__m128i *)&CAf[ k ], X1_3210);
                        _mm_storeu_si128((__m128i *)&CAb[ k ], PTR_3210);
                    }

                    for(; k <= n; k++) {
                        CAf[ k ] = silk_SMLAWW(CAf[ k ], tmp1,
                            silk_LSHIFT32((opus_int32)x_ptr[ n - k ], -rshifts - 1));                    /* Q(-rshift) */
                        CAb[ k ] = silk_SMLAWW(CAb[ k ], tmp2,
                            silk_LSHIFT32((opus_int32)x_ptr[ subfr_length - n + k - 1 ], -rshifts - 1)); /* Q(-rshift) */
                    }
                }
            }
        }

        /* Calculate numerator and denominator for the next order reflection (parcor) coefficient */
        tmp1 = C_first_row[ n ];                                                                        /* Q(-rshifts) */
        tmp2 = C_last_row[ n ];                                                                         /* Q(-rshifts) */
        num  = 0;                                                                                       /* Q(-rshifts) */
        nrg  = silk_ADD32(CAb[ 0 ], CAf[ 0 ]);                                                        /* Q(1-rshifts) */
        for(k = 0; k < n; k++) {
            Atmp_QA = Af_QA[ k ];
            lz = silk_CLZ32(silk_abs(Atmp_QA)) - 1;
            lz = silk_min(32 - QA, lz);
            Atmp1 = silk_LSHIFT32(Atmp_QA, lz);                                                       /* Q(QA + lz) */

            tmp1 = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(C_last_row[  n - k - 1 ], Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            tmp2 = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(C_first_row[ n - k - 1 ], Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            num  = silk_ADD_LSHIFT32(num,  silk_SMMUL(CAb[ n - k ],             Atmp1), 32 - QA - lz);  /* Q(-rshifts) */
            nrg  = silk_ADD_LSHIFT32(nrg,  silk_SMMUL(silk_ADD32(CAb[ k + 1 ], CAf[ k + 1 ]),
                                                                                Atmp1), 32 - QA - lz);    /* Q(1-rshifts) */
        }
        CAf[ n + 1 ] = tmp1;                                                                            /* Q(-rshifts) */
        CAb[ n + 1 ] = tmp2;                                                                            /* Q(-rshifts) */
        num = silk_ADD32(num, tmp2);                                                                  /* Q(-rshifts) */
        num = silk_LSHIFT32(-num, 1);                                                                 /* Q(1-rshifts) */

        /* Calculate the next order reflection (parcor) coefficient */
        if(silk_abs(num) < nrg) {
            rc_Q31 = silk_DIV32_varQ(num, nrg, 31);
        } else {
            rc_Q31 = (num > 0) ? silk_int32_MAX : silk_int32_MIN;
        }

        /* Update inverse prediction gain */
        tmp1 = ((opus_int32)1 << 30) - silk_SMMUL(rc_Q31, rc_Q31);
        tmp1 = silk_LSHIFT(silk_SMMUL(invGain_Q30, tmp1), 2);
        if(tmp1 <= minInvGain_Q30) {
            /* Max prediction gain exceeded; set reflection coefficient such that max prediction gain is exactly hit */
            tmp2 = ((opus_int32)1 << 30) - silk_DIV32_varQ(minInvGain_Q30, invGain_Q30, 30);            /* Q30 */
            rc_Q31 = silk_SQRT_APPROX(tmp2);                                                  /* Q15 */
            /* Newton-Raphson iteration */
            rc_Q31 = silk_RSHIFT32(rc_Q31 + silk_DIV32(tmp2, rc_Q31), 1);                   /* Q15 */
            rc_Q31 = silk_LSHIFT32(rc_Q31, 16);                                               /* Q31 */
            if(num < 0) {
                /* Ensure the adjusted reflection coefficient has the original sign */
                rc_Q31 = -rc_Q31;
            }
            invGain_Q30 = minInvGain_Q30;
            reached_max_gain = 1;
        } else {
            invGain_Q30 = tmp1;
        }

        /* Update the AR coefficients */
        for(k = 0; k < (n + 1) >> 1; k++) {
            tmp1 = Af_QA[ k ];                                                                  /* QA */
            tmp2 = Af_QA[ n - k - 1 ];                                                          /* QA */
            Af_QA[ k ]         = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1);      /* QA */
            Af_QA[ n - k - 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1);      /* QA */
        }
        Af_QA[ n ] = silk_RSHIFT32(rc_Q31, 31 - QA);                                          /* QA */

        if(reached_max_gain) {
            /* Reached max prediction gain; set remaining coefficients to zero and exit loop */
            for(k = n + 1; k < D; k++) {
                Af_QA[ k ] = 0;
            }
            break;
        }

        /* Update C * Af and C * Ab */
        for(k = 0; k <= n + 1; k++) {
            tmp1 = CAf[ k ];                                                                    /* Q(-rshifts) */
            tmp2 = CAb[ n - k + 1 ];                                                            /* Q(-rshifts) */
            CAf[ k ]         = silk_ADD_LSHIFT32(tmp1, silk_SMMUL(tmp2, rc_Q31), 1);        /* Q(-rshifts) */
            CAb[ n - k + 1 ] = silk_ADD_LSHIFT32(tmp2, silk_SMMUL(tmp1, rc_Q31), 1);        /* Q(-rshifts) */
        }
    }

    if(reached_max_gain) {
        for(k = 0; k < D; k++) {
            /* Scale coefficients */
            A_Q16[ k ] = -silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16);
        }
        /* Subtract energy of preceding samples from C0 */
        if(rshifts > 0) {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                C0 -= (opus_int32)silk_RSHIFT64(silk_inner_prod16_aligned_64(x_ptr, x_ptr, D, arch), rshifts);
            }
        } else {
            for(s = 0; s < nb_subfr; s++) {
                x_ptr = x + s * subfr_length;
                C0 -= silk_LSHIFT32(silk_inner_prod_aligned(x_ptr, x_ptr, D, arch), -rshifts);
            }
        }
        /* Approximate residual energy */
        *res_nrg = silk_LSHIFT(silk_SMMUL(invGain_Q30, C0), 2);
        *res_nrg_Q = -rshifts;
    } else {
        /* Return residual energy */
        nrg  = CAf[ 0 ];                                                                            /* Q(-rshifts) */
        tmp1 = (opus_int32)1 << 16;                                                                             /* Q16 */
        for(k = 0; k < D; k++) {
            Atmp1 = silk_RSHIFT_ROUND(Af_QA[ k ], QA - 16);                                       /* Q16 */
            nrg  = silk_SMLAWW(nrg, CAf[ k + 1 ], Atmp1);                                         /* Q(-rshifts) */
            tmp1 = silk_SMLAWW(tmp1, Atmp1, Atmp1);                                               /* Q16 */
            A_Q16[ k ] = -Atmp1;
        }
        *res_nrg = silk_SMLAWW(nrg, silk_SMMUL(SILK_FIX_CONST(FIND_LPC_COND_FAC, 32), C0), -tmp1);/* Q(-rshifts) */
        *res_nrg_Q = -rshifts;
    }
}
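OP_CVTEPI16_EPI32_M64 comes from Opus' x86 support headers; its presumable effect is to load four opus_int16 values and sign-extend them to 32 bits. A sketch of that assumption:

#include <smmintrin.h>

/* Assumed behavior: read 4 contiguous int16 and widen each to int32 (SSE4.1). */
#define OP_CVTEPI16_EPI32_M64(x) \
    _mm_cvtepi16_epi32(_mm_loadl_epi64((const __m128i *)(x)))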
Example #19
// arithmetic operators
RETi ADD( const __m128i x, const __m128i y ) { return _mm_add_epi32(x,y); }
void minmax_vec2(const uint32_t n, float const* buf, uint32_t* idx_min_, uint32_t* idx_max_, float* min_, float* max_)
{
	// We suppose that pointers are aligned on a 16-byte boundary
	
	// Initialise SSE registers
	__m128i sse_idx_min = _mm_setzero_si128();
	__m128i sse_idx_max = _mm_setzero_si128();
	__m128 sse_min = _mm_set1_ps(FLT_MAX);
	__m128 sse_max = _mm_set1_ps(-FLT_MAX); // -FLT_MAX, not FLT_MIN: FLT_MIN is the smallest positive float

	// We process four elements per iteration, thus doing
	// (n/4) SIMD iterations over the first n_sse elements.
	const uint32_t n_sse = n & ~3ULL;

	__m128i sse_idx = _mm_set_epi32(3, 2, 1, 0);
	const __m128i sse_4 = _mm_set1_epi32(4);

	for (uint32_t i = 0; i < n_sse; i += 4) {
		const __m128 sse_v = _mm_load_ps(&buf[i]);
		const __m128 sse_cmp_min = _mm_cmplt_ps(sse_v, sse_min);
		const __m128 sse_cmp_max = _mm_cmpgt_ps(sse_v, sse_max);

		sse_min = _mm_blendv_ps(sse_min, sse_v, sse_cmp_min);
		sse_max = _mm_blendv_ps(sse_max, sse_v, sse_cmp_max);

		sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx, (__m128) sse_cmp_min); 
		sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx, (__m128) sse_cmp_max); 

		sse_idx = _mm_add_epi32(sse_idx, sse_4);
	}

	// SSE reduction
	// __m128 <-> __m128i conversions need explicit casts around the integer shuffle
	__m128 sse_min_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_min), 2 | (3<<2)));
	__m128 sse_max_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_max), 2 | (3<<2)));
	__m128i sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, 2 | (3<<2));
	__m128i sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, 2 | (3<<2));

	__m128 sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min);
	__m128 sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max);
	sse_min = _mm_blendv_ps(sse_min, sse_min_permute, sse_cmp_min);
	sse_max = _mm_blendv_ps(sse_max, sse_max_permute, sse_cmp_max);
	sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx_min_permute, (__m128) sse_cmp_min); 
	sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx_max_permute, (__m128) sse_cmp_max); 

	sse_min_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_min), 1));
	sse_max_permute = _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(sse_max), 1));
	sse_idx_min_permute = _mm_shuffle_epi32(sse_idx_min, 1);
	sse_idx_max_permute = _mm_shuffle_epi32(sse_idx_max, 1);

	sse_cmp_min = _mm_cmplt_ps(sse_min_permute, sse_min);
	sse_cmp_max = _mm_cmpgt_ps(sse_max_permute, sse_max);
	sse_min = _mm_blendv_ps(sse_min, sse_min_permute, sse_cmp_min);
	sse_max = _mm_blendv_ps(sse_max, sse_max_permute, sse_cmp_max);
	sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx_min_permute, (__m128) sse_cmp_min); 
	sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx_max_permute, (__m128) sse_cmp_max); 

	// Epilogue
	float min, max;
	uint32_t idx_min, idx_max;
	_mm_store_ss(&min, sse_min);
	_mm_store_ss(&max, sse_max);
	idx_min = _mm_extract_epi32(sse_idx_min, 0);
	idx_max = _mm_extract_epi32(sse_idx_max, 0);

	for (uint32_t i = n_sse; i < n; i++) {
		const float v = buf[i];
		if (v < min) {
			min = v;
			idx_min = i;
		}
		if (v > max) {
			max = v;
			idx_max = i;
		}
	}

	*idx_min_ = idx_min;
	*min_ = min;
	*idx_max_ = idx_max;
	*max_ = max;
}
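/* Hedged scalar reference for testing minmax_vec2 above: same result values;
 * the returned indices can differ from the SIMD path when the minimum or
 * maximum occurs more than once, since the vector reduction resolves ties by
 * lane order.  Assumes n >= 1; the name minmax_scalar_ref is illustrative. */
#include <stdint.h>

static void minmax_scalar_ref(const uint32_t n, float const* buf,
                              uint32_t* idx_min_, uint32_t* idx_max_,
                              float* min_, float* max_)
{
	float min = buf[0], max = buf[0];
	uint32_t idx_min = 0, idx_max = 0;
	for (uint32_t i = 1; i < n; i++) {
		const float v = buf[i];
		if (v < min) { min = v; idx_min = i; }
		if (v > max) { max = v; idx_max = i; }
	}
	*idx_min_ = idx_min; *min_ = min;
	*idx_max_ = idx_max; *max_ = max;
}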
Example #21
HashReturn Update(hashState *state, const BitSequence *data,
                  DataLength databitlen)
{
  int r;
  __m128i x0;
  __m128i x1;
  __m128i x2;
  __m128i x3;
  __m128i x4;
  __m128i x5;
  __m128i x6;
  __m128i x7;
  __m128i y0;
  __m128i y1;
  __m128i y2;
  __m128i y3;

  while (databitlen >= 8 && state->pos != 0) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    data += 1;
    databitlen -= 8;
    state->pos += 8;
    if (state->pos == 8 * CUBEHASH_BLOCKBYTES) {
      transform(state,CUBEHASH_ROUNDS);
      state->pos = 0;
    }
  }

  x0 = state->x[0];
  x1 = state->x[1];
  x2 = state->x[2];
  x3 = state->x[3];
  x4 = state->x[4];
  x5 = state->x[5];
  x6 = state->x[6];
  x7 = state->x[7];
    
  while (databitlen >= 8 * CUBEHASH_BLOCKBYTES) {
    x0 = _mm_xor_si128(x0,_mm_set_epi32(0,0,0,(crypto_uint32) *(crypto_uint16 *) data));
    data += CUBEHASH_BLOCKBYTES;
    databitlen -= 8 * CUBEHASH_BLOCKBYTES;
    
    for (r = 0;r < CUBEHASH_ROUNDS;++r) {
      x4 = _mm_add_epi32(x0,x4);
      x5 = _mm_add_epi32(x1,x5);
      x6 = _mm_add_epi32(x2,x6);
      x7 = _mm_add_epi32(x3,x7);
      y0 = x2;
      y1 = x3;
      y2 = x0;
      y3 = x1;
      x0 = _mm_xor_si128(_mm_slli_epi32(y0,7),_mm_srli_epi32(y0,25));
      x1 = _mm_xor_si128(_mm_slli_epi32(y1,7),_mm_srli_epi32(y1,25));
      x2 = _mm_xor_si128(_mm_slli_epi32(y2,7),_mm_srli_epi32(y2,25));
      x3 = _mm_xor_si128(_mm_slli_epi32(y3,7),_mm_srli_epi32(y3,25));
      x0 = _mm_xor_si128(x0,x4);
      x1 = _mm_xor_si128(x1,x5);
      x2 = _mm_xor_si128(x2,x6);
      x3 = _mm_xor_si128(x3,x7);
      x4 = _mm_shuffle_epi32(x4,0x4e);
      x5 = _mm_shuffle_epi32(x5,0x4e);
      x6 = _mm_shuffle_epi32(x6,0x4e);
      x7 = _mm_shuffle_epi32(x7,0x4e);
      x4 = _mm_add_epi32(x0,x4);
      x5 = _mm_add_epi32(x1,x5);
      x6 = _mm_add_epi32(x2,x6);
      x7 = _mm_add_epi32(x3,x7);
      y0 = x1;
      y1 = x0;
      y2 = x3;
      y3 = x2;
      x0 = _mm_xor_si128(_mm_slli_epi32(y0,11),_mm_srli_epi32(y0,21));
      x1 = _mm_xor_si128(_mm_slli_epi32(y1,11),_mm_srli_epi32(y1,21));
      x2 = _mm_xor_si128(_mm_slli_epi32(y2,11),_mm_srli_epi32(y2,21));
      x3 = _mm_xor_si128(_mm_slli_epi32(y3,11),_mm_srli_epi32(y3,21));
      x0 = _mm_xor_si128(x0,x4);
      x1 = _mm_xor_si128(x1,x5);
      x2 = _mm_xor_si128(x2,x6);
      x3 = _mm_xor_si128(x3,x7);
      x4 = _mm_shuffle_epi32(x4,0xb1);
      x5 = _mm_shuffle_epi32(x5,0xb1);
      x6 = _mm_shuffle_epi32(x6,0xb1);
      x7 = _mm_shuffle_epi32(x7,0xb1);
    }
  }
  
  state->x[0] = x0;
  state->x[1] = x1;
  state->x[2] = x2;
  state->x[3] = x3;
  state->x[4] = x4;
  state->x[5] = x5;
  state->x[6] = x6;
  state->x[7] = x7;

  while (databitlen >= 8) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    data += 1;
    databitlen -= 8;
    state->pos += 8;
    if (state->pos == 8 * CUBEHASH_BLOCKBYTES) {
      transform(state,CUBEHASH_ROUNDS);
      state->pos = 0;
    }
  }
  if (databitlen > 0) {
    ((unsigned char *) state->x)[state->pos / 8] ^= *data;
    state->pos += databitlen;
  }
  return SUCCESS;
}
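/* Notes on the round above, with a tiny sketch (assumes the SSE2 headers the
 * CubeHash code already uses): the slli/srli/xor pairs are 32-bit rotations
 * by 7 and 11, and the shuffle immediates 0x4e and 0xb1 correspond to
 * _MM_SHUFFLE(1,0,3,2) (swap the 64-bit halves) and _MM_SHUFFLE(2,3,0,1)
 * (swap adjacent 32-bit words). */
#include <emmintrin.h>

static inline __m128i cubehash_rotl7_sketch(__m128i v)
{
  /* (v << 7) ^ (v >> 25) == rotate-left by 7 in every 32-bit lane */
  return _mm_xor_si128(_mm_slli_epi32(v, 7), _mm_srli_epi32(v, 25));
}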
void minmax_vec(const uint32_t n, float const* buf, uint32_t* idx_min_, uint32_t* idx_max_, float* min_, float* max_)
{
	// We suppose that pointers are aligned on a 16-byte boundary
	
	// Initialise SSE registers
	__m128i sse_idx_min = _mm_setzero_si128();
	__m128i sse_idx_max = _mm_setzero_si128();
	__m128 sse_min = _mm_set1_ps(FLT_MAX);
	__m128 sse_max = _mm_set1_ps(-FLT_MAX); // -FLT_MAX, not FLT_MIN: FLT_MIN is the smallest positive float

	// We process four elements per iteration, thus doing
	// (n/4) SIMD iterations over the first n_sse elements.
	const uint32_t n_sse = n & ~3ULL;

	__m128i sse_idx = _mm_set_epi32(3, 2, 1, 0);
	const __m128i sse_4 = _mm_set1_epi32(4);

	for (uint32_t i = 0; i < n_sse; i += 4) {
		const __m128 sse_v = _mm_load_ps(&buf[i]);
		const __m128 sse_cmp_min = _mm_cmplt_ps(sse_v, sse_min);
		const __m128 sse_cmp_max = _mm_cmpgt_ps(sse_v, sse_max);

		sse_min = _mm_blendv_ps(sse_min, sse_v, sse_cmp_min);
		sse_max = _mm_blendv_ps(sse_max, sse_v, sse_cmp_max);

		sse_idx_min = (__m128i) _mm_blendv_ps((__m128) sse_idx_min, (__m128) sse_idx, (__m128) sse_cmp_min); 
		sse_idx_max = (__m128i) _mm_blendv_ps((__m128) sse_idx_max, (__m128) sse_idx, (__m128) sse_cmp_max); 

		sse_idx = _mm_add_epi32(sse_idx, sse_4);
	}

	// SSE reduction
	float __attribute__((aligned(16))) mins[4];
	float __attribute__((aligned(16))) maxs[4];
	uint32_t __attribute__((aligned(16))) idx_mins[4];
	uint32_t __attribute__((aligned(16))) idx_maxs[4];
	_mm_store_ps(mins, sse_min);
	_mm_store_ps(maxs, sse_max);
	// _mm_extract_epi32 needs a compile-time constant lane index, so spill
	// the index vectors to memory instead of extracting with a loop variable.
	_mm_store_si128((__m128i*)idx_mins, sse_idx_min);
	_mm_store_si128((__m128i*)idx_maxs, sse_idx_max);

	float min = mins[0];
	float max = maxs[0];
	uint32_t idx_min = idx_mins[0];
	uint32_t idx_max = idx_maxs[0];
	for (int i = 1; i < 4; i++) {
		float v = mins[i];
		if (v < min) {
			min = v;
			idx_min = idx_mins[i];
		}
		v = maxs[i];
		if (v > max) {
			max = v;
			idx_max = idx_maxs[i];
		}
	}


	// Epilogue
	for (uint32_t i = n_sse; i < n; i++) {
		const float v = buf[i];
		if (v < min) {
			min = v;
			idx_min = i;
		}
		if (v > max) {
			max = v;
			idx_max = i;
		}
	}

	*idx_min_ = idx_min;
	*min_ = min;
	*idx_max_ = idx_max;
	*max_ = max;
}
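/* Usage sketch for minmax_vec (assumptions: C11 aligned_alloc is available
 * and the 16-byte alignment matches the aligned _mm_load_ps in the loop).
 * The helper name minmax_demo and the fill pattern are illustrative. */
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

static int minmax_demo(void)
{
	const uint32_t n = 1000;
	/* round the byte count up to a multiple of 16, as aligned_alloc requires */
	float *buf = (float*)aligned_alloc(16, ((n + 3) & ~3u) * sizeof(float));
	if (buf == NULL)
		return -1;
	for (uint32_t i = 0; i < n; i++)
		buf[i] = (float)(i % 97) - 48.0f;

	uint32_t idx_min, idx_max;
	float vmin, vmax;
	minmax_vec(n, buf, &idx_min, &idx_max, &vmin, &vmax);
	printf("min %f at %u, max %f at %u\n", vmin, idx_min, vmax, idx_max);

	free(buf);
	return 0;
}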
Example #23
void FLAC__precompute_partition_info_sums_intrin_sse2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
		uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps)
{
	const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
	uint32_t partitions = 1u << max_partition_order;

	FLAC__ASSERT(default_partition_samples > predictor_order);

	/* first do max_partition_order */
	{
		const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples);
		uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order);

		if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) {
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m128i mm_sum = _mm_setzero_si128();
				uint32_t e1, e3;
				end += default_partition_samples;

				e1 = (residual_sample + 3) & ~3; e3 = end & ~3;
				if(e1 > end)
					e1 = end; /* try flac -l 1 -b 16 and you'll be here */

				/* assumption: residual[] is properly aligned so (residual + e1) is properly aligned too and _mm_loadu_si128() is fast */
				for( ; residual_sample < e1; residual_sample++) {
					__m128i mm_res = local_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				for( ; residual_sample < e3; residual_sample+=4) {
					__m128i mm_res = local_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				for( ; residual_sample < end; residual_sample++) {
					__m128i mm_res = local_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					mm_sum = _mm_add_epi32(mm_sum, mm_res);
				}

				mm_sum = _mm_add_epi32(mm_sum, _mm_shuffle_epi32(mm_sum, _MM_SHUFFLE(1,0,3,2)));
				mm_sum = _mm_add_epi32(mm_sum, _mm_shufflelo_epi16(mm_sum, _MM_SHUFFLE(1,0,3,2)));
				abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(mm_sum);
/* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
#if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)
				abs_residual_partition_sums[partition] &= 0xFFFFFFFF;
#endif
			}
		}
		else { /* have to pessimistically use 64 bits for accumulator */
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m128i mm_sum = _mm_setzero_si128();
				uint32_t e1, e3;
				end += default_partition_samples;

				e1 = (residual_sample + 1) & ~1; e3 = end & ~1;
				FLAC__ASSERT(e1 <= end);

				for( ; residual_sample < e1; residual_sample++) {
					__m128i mm_res = local_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample])); /*  0   0   0  |r0|  ==   00   |r0_64| */
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				for( ; residual_sample < e3; residual_sample+=2) {
					__m128i mm_res = local_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample))); /*  0   0  |r1|   |r0| */
					mm_res = _mm_shuffle_epi32(mm_res, _MM_SHUFFLE(3,1,2,0)); /* 0  |r1|  0  |r0|  ==  |r1_64|  |r0_64|  */
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				for( ; residual_sample < end; residual_sample++) {
					__m128i mm_res = local_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					mm_sum = _mm_add_epi64(mm_sum, mm_res);
				}

				mm_sum = _mm_add_epi64(mm_sum, _mm_srli_si128(mm_sum, 8));
				_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), mm_sum);
			}
		}
	}

	/* now merge partitions for lower orders */
	{
		uint32_t from_partition = 0, to_partition = partitions;
		int partition_order;
		for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
			uint32_t i;
			partitions >>= 1;
			for(i = 0; i < partitions; i++) {
				abs_residual_partition_sums[to_partition++] =
					abs_residual_partition_sums[from_partition  ] +
					abs_residual_partition_sums[from_partition+1];
				from_partition += 2;
			}
		}
	}
}
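/* Scalar sketch of the per-partition sums computed by the SSE2 routine above,
 * at max_partition_order only (the merge loop above then derives the lower
 * orders as pairwise sums).  It uses plain stdint types instead of the FLAC__
 * typedefs and is meant for illustration/testing, not as the library API. */
#include <stdint.h>

static void partition_sums_scalar_sketch(const int32_t residual[], uint64_t sums[],
		uint32_t residual_samples, uint32_t predictor_order, uint32_t max_partition_order)
{
	const uint32_t partitions = 1u << max_partition_order;
	const uint32_t default_partition_samples =
		(residual_samples + predictor_order) >> max_partition_order;
	uint32_t sample = 0;

	for (uint32_t p = 0; p < partitions; p++) {
		/* the first partition is shortened by the predictor order,
		   exactly like "end" starting at -predictor_order above */
		uint32_t len = default_partition_samples - (p == 0 ? predictor_order : 0);
		uint64_t acc = 0;
		for (uint32_t i = 0; i < len; i++, sample++) {
			int64_t r = residual[sample];
			acc += (uint64_t)(r < 0 ? -r : r);
		}
		sums[p] = acc;
	}
}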
/**
 * salsa20_8(B):
 * Apply the salsa20/8 core to the provided block.
 */
static void
salsa20_8(__m128i B[4])
{
    __m128i X0, X1, X2, X3;
    __m128i T;
    size_t i;

    X0 = B[0];
    X1 = B[1];
    X2 = B[2];
    X3 = B[3];

    for (i = 0; i < 8; i += 2) {
        /* Operate on "columns". */
        T = _mm_add_epi32(X0, X3);
        X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 7));
        X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 25));
        T = _mm_add_epi32(X1, X0);
        X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
        X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
        T = _mm_add_epi32(X2, X1);
        X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 13));
        X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 19));
        T = _mm_add_epi32(X3, X2);
        X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
        X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

        /* Rearrange data. */
        X1 = _mm_shuffle_epi32(X1, 0x93);
        X2 = _mm_shuffle_epi32(X2, 0x4E);
        X3 = _mm_shuffle_epi32(X3, 0x39);

        /* Operate on "rows". */
        T = _mm_add_epi32(X0, X1);
        X3 = _mm_xor_si128(X3, _mm_slli_epi32(T, 7));
        X3 = _mm_xor_si128(X3, _mm_srli_epi32(T, 25));
        T = _mm_add_epi32(X3, X0);
        X2 = _mm_xor_si128(X2, _mm_slli_epi32(T, 9));
        X2 = _mm_xor_si128(X2, _mm_srli_epi32(T, 23));
        T = _mm_add_epi32(X2, X3);
        X1 = _mm_xor_si128(X1, _mm_slli_epi32(T, 13));
        X1 = _mm_xor_si128(X1, _mm_srli_epi32(T, 19));
        T = _mm_add_epi32(X1, X2);
        X0 = _mm_xor_si128(X0, _mm_slli_epi32(T, 18));
        X0 = _mm_xor_si128(X0, _mm_srli_epi32(T, 14));

        /* Rearrange data. */
        X1 = _mm_shuffle_epi32(X1, 0x39);
        X2 = _mm_shuffle_epi32(X2, 0x4E);
        X3 = _mm_shuffle_epi32(X3, 0x93);
    }

    B[0] = _mm_add_epi32(B[0], X0);
    B[1] = _mm_add_epi32(B[1], X1);
    B[2] = _mm_add_epi32(B[2], X2);
    B[3] = _mm_add_epi32(B[3], X3);
}
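/* The shift/xor pairs above are 32-bit rotations; for reference, a scalar
 * sketch of one Salsa20 column step using the canonical state indices.  Note
 * the SSE version works on the diagonal-permuted layout used by scrypt, so
 * lanes do not map 1:1 onto these indices; this only shows the arithmetic. */
#include <stdint.h>

static inline uint32_t rotl32_sketch(uint32_t x, unsigned k)
{
    return (x << k) | (x >> (32 - k));
}

static void salsa_column0_sketch(uint32_t x[16])
{
    /* first column of the reference salsa20 round */
    x[ 4] ^= rotl32_sketch(x[ 0] + x[12],  7);
    x[ 8] ^= rotl32_sketch(x[ 4] + x[ 0],  9);
    x[12] ^= rotl32_sketch(x[ 8] + x[ 4], 13);
    x[ 0] ^= rotl32_sketch(x[12] + x[ 8], 18);
}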
Example #25
void av1_fadst32_new_sse4_1(const __m128i *input, __m128i *output,
                            const int8_t *cos_bit, const int8_t *stage_range) {
  const int txfm_size = 32;
  const int num_per_128 = 4;
  const int32_t *cospi;
  __m128i buf0[32];
  __m128i buf1[32];
  int col_num = txfm_size / num_per_128;
  int bit;
  int col;
  (void)stage_range;
  for (col = 0; col < col_num; col++) {
    // stage 0;
    int32_t stage_idx = 0;
    buf0[0] = input[0 * col_num + col];
    buf0[1] = input[1 * col_num + col];
    buf0[2] = input[2 * col_num + col];
    buf0[3] = input[3 * col_num + col];
    buf0[4] = input[4 * col_num + col];
    buf0[5] = input[5 * col_num + col];
    buf0[6] = input[6 * col_num + col];
    buf0[7] = input[7 * col_num + col];
    buf0[8] = input[8 * col_num + col];
    buf0[9] = input[9 * col_num + col];
    buf0[10] = input[10 * col_num + col];
    buf0[11] = input[11 * col_num + col];
    buf0[12] = input[12 * col_num + col];
    buf0[13] = input[13 * col_num + col];
    buf0[14] = input[14 * col_num + col];
    buf0[15] = input[15 * col_num + col];
    buf0[16] = input[16 * col_num + col];
    buf0[17] = input[17 * col_num + col];
    buf0[18] = input[18 * col_num + col];
    buf0[19] = input[19 * col_num + col];
    buf0[20] = input[20 * col_num + col];
    buf0[21] = input[21 * col_num + col];
    buf0[22] = input[22 * col_num + col];
    buf0[23] = input[23 * col_num + col];
    buf0[24] = input[24 * col_num + col];
    buf0[25] = input[25 * col_num + col];
    buf0[26] = input[26 * col_num + col];
    buf0[27] = input[27 * col_num + col];
    buf0[28] = input[28 * col_num + col];
    buf0[29] = input[29 * col_num + col];
    buf0[30] = input[30 * col_num + col];
    buf0[31] = input[31 * col_num + col];

    // stage 1
    stage_idx++;
    buf1[0] = buf0[31];
    buf1[1] = buf0[0];
    buf1[2] = buf0[29];
    buf1[3] = buf0[2];
    buf1[4] = buf0[27];
    buf1[5] = buf0[4];
    buf1[6] = buf0[25];
    buf1[7] = buf0[6];
    buf1[8] = buf0[23];
    buf1[9] = buf0[8];
    buf1[10] = buf0[21];
    buf1[11] = buf0[10];
    buf1[12] = buf0[19];
    buf1[13] = buf0[12];
    buf1[14] = buf0[17];
    buf1[15] = buf0[14];
    buf1[16] = buf0[15];
    buf1[17] = buf0[16];
    buf1[18] = buf0[13];
    buf1[19] = buf0[18];
    buf1[20] = buf0[11];
    buf1[21] = buf0[20];
    buf1[22] = buf0[9];
    buf1[23] = buf0[22];
    buf1[24] = buf0[7];
    buf1[25] = buf0[24];
    buf1[26] = buf0[5];
    buf1[27] = buf0[26];
    buf1[28] = buf0[3];
    buf1[29] = buf0[28];
    buf1[30] = buf0[1];
    buf1[31] = buf0[30];

    // stage 2
    stage_idx++;
    bit = cos_bit[stage_idx];
    cospi = cospi_arr[bit - cos_bit_min];
    btf_32_sse4_1_type0(cospi[1], cospi[63], buf1[0], buf1[1], buf0[0], buf0[1],
                        bit);
    btf_32_sse4_1_type0(cospi[5], cospi[59], buf1[2], buf1[3], buf0[2], buf0[3],
                        bit);
    btf_32_sse4_1_type0(cospi[9], cospi[55], buf1[4], buf1[5], buf0[4], buf0[5],
                        bit);
    btf_32_sse4_1_type0(cospi[13], cospi[51], buf1[6], buf1[7], buf0[6],
                        buf0[7], bit);
    btf_32_sse4_1_type0(cospi[17], cospi[47], buf1[8], buf1[9], buf0[8],
                        buf0[9], bit);
    btf_32_sse4_1_type0(cospi[21], cospi[43], buf1[10], buf1[11], buf0[10],
                        buf0[11], bit);
    btf_32_sse4_1_type0(cospi[25], cospi[39], buf1[12], buf1[13], buf0[12],
                        buf0[13], bit);
    btf_32_sse4_1_type0(cospi[29], cospi[35], buf1[14], buf1[15], buf0[14],
                        buf0[15], bit);
    btf_32_sse4_1_type0(cospi[33], cospi[31], buf1[16], buf1[17], buf0[16],
                        buf0[17], bit);
    btf_32_sse4_1_type0(cospi[37], cospi[27], buf1[18], buf1[19], buf0[18],
                        buf0[19], bit);
    btf_32_sse4_1_type0(cospi[41], cospi[23], buf1[20], buf1[21], buf0[20],
                        buf0[21], bit);
    btf_32_sse4_1_type0(cospi[45], cospi[19], buf1[22], buf1[23], buf0[22],
                        buf0[23], bit);
    btf_32_sse4_1_type0(cospi[49], cospi[15], buf1[24], buf1[25], buf0[24],
                        buf0[25], bit);
    btf_32_sse4_1_type0(cospi[53], cospi[11], buf1[26], buf1[27], buf0[26],
                        buf0[27], bit);
    btf_32_sse4_1_type0(cospi[57], cospi[7], buf1[28], buf1[29], buf0[28],
                        buf0[29], bit);
    btf_32_sse4_1_type0(cospi[61], cospi[3], buf1[30], buf1[31], buf0[30],
                        buf0[31], bit);

    // stage 3
    stage_idx++;
    buf1[0] = _mm_add_epi32(buf0[0], buf0[16]);
    buf1[16] = _mm_sub_epi32(buf0[0], buf0[16]);
    buf1[1] = _mm_add_epi32(buf0[1], buf0[17]);
    buf1[17] = _mm_sub_epi32(buf0[1], buf0[17]);
    buf1[2] = _mm_add_epi32(buf0[2], buf0[18]);
    buf1[18] = _mm_sub_epi32(buf0[2], buf0[18]);
    buf1[3] = _mm_add_epi32(buf0[3], buf0[19]);
    buf1[19] = _mm_sub_epi32(buf0[3], buf0[19]);
    buf1[4] = _mm_add_epi32(buf0[4], buf0[20]);
    buf1[20] = _mm_sub_epi32(buf0[4], buf0[20]);
    buf1[5] = _mm_add_epi32(buf0[5], buf0[21]);
    buf1[21] = _mm_sub_epi32(buf0[5], buf0[21]);
    buf1[6] = _mm_add_epi32(buf0[6], buf0[22]);
    buf1[22] = _mm_sub_epi32(buf0[6], buf0[22]);
    buf1[7] = _mm_add_epi32(buf0[7], buf0[23]);
    buf1[23] = _mm_sub_epi32(buf0[7], buf0[23]);
    buf1[8] = _mm_add_epi32(buf0[8], buf0[24]);
    buf1[24] = _mm_sub_epi32(buf0[8], buf0[24]);
    buf1[9] = _mm_add_epi32(buf0[9], buf0[25]);
    buf1[25] = _mm_sub_epi32(buf0[9], buf0[25]);
    buf1[10] = _mm_add_epi32(buf0[10], buf0[26]);
    buf1[26] = _mm_sub_epi32(buf0[10], buf0[26]);
    buf1[11] = _mm_add_epi32(buf0[11], buf0[27]);
    buf1[27] = _mm_sub_epi32(buf0[11], buf0[27]);
    buf1[12] = _mm_add_epi32(buf0[12], buf0[28]);
    buf1[28] = _mm_sub_epi32(buf0[12], buf0[28]);
    buf1[13] = _mm_add_epi32(buf0[13], buf0[29]);
    buf1[29] = _mm_sub_epi32(buf0[13], buf0[29]);
    buf1[14] = _mm_add_epi32(buf0[14], buf0[30]);
    buf1[30] = _mm_sub_epi32(buf0[14], buf0[30]);
    buf1[15] = _mm_add_epi32(buf0[15], buf0[31]);
    buf1[31] = _mm_sub_epi32(buf0[15], buf0[31]);

    // stage 4
    stage_idx++;
    bit = cos_bit[stage_idx];
    cospi = cospi_arr[bit - cos_bit_min];
    buf0[0] = buf1[0];
    buf0[1] = buf1[1];
    buf0[2] = buf1[2];
    buf0[3] = buf1[3];
    buf0[4] = buf1[4];
    buf0[5] = buf1[5];
    buf0[6] = buf1[6];
    buf0[7] = buf1[7];
    buf0[8] = buf1[8];
    buf0[9] = buf1[9];
    buf0[10] = buf1[10];
    buf0[11] = buf1[11];
    buf0[12] = buf1[12];
    buf0[13] = buf1[13];
    buf0[14] = buf1[14];
    buf0[15] = buf1[15];
    btf_32_sse4_1_type0(cospi[4], cospi[60], buf1[16], buf1[17], buf0[16],
                        buf0[17], bit);
    btf_32_sse4_1_type0(cospi[20], cospi[44], buf1[18], buf1[19], buf0[18],
                        buf0[19], bit);
    btf_32_sse4_1_type0(cospi[36], cospi[28], buf1[20], buf1[21], buf0[20],
                        buf0[21], bit);
    btf_32_sse4_1_type0(cospi[52], cospi[12], buf1[22], buf1[23], buf0[22],
                        buf0[23], bit);
    btf_32_sse4_1_type0(-cospi[60], cospi[4], buf1[24], buf1[25], buf0[24],
                        buf0[25], bit);
    btf_32_sse4_1_type0(-cospi[44], cospi[20], buf1[26], buf1[27], buf0[26],
                        buf0[27], bit);
    btf_32_sse4_1_type0(-cospi[28], cospi[36], buf1[28], buf1[29], buf0[28],
                        buf0[29], bit);
    btf_32_sse4_1_type0(-cospi[12], cospi[52], buf1[30], buf1[31], buf0[30],
                        buf0[31], bit);

    // stage 5
    stage_idx++;
    buf1[0] = _mm_add_epi32(buf0[0], buf0[8]);
    buf1[8] = _mm_sub_epi32(buf0[0], buf0[8]);
    buf1[1] = _mm_add_epi32(buf0[1], buf0[9]);
    buf1[9] = _mm_sub_epi32(buf0[1], buf0[9]);
    buf1[2] = _mm_add_epi32(buf0[2], buf0[10]);
    buf1[10] = _mm_sub_epi32(buf0[2], buf0[10]);
    buf1[3] = _mm_add_epi32(buf0[3], buf0[11]);
    buf1[11] = _mm_sub_epi32(buf0[3], buf0[11]);
    buf1[4] = _mm_add_epi32(buf0[4], buf0[12]);
    buf1[12] = _mm_sub_epi32(buf0[4], buf0[12]);
    buf1[5] = _mm_add_epi32(buf0[5], buf0[13]);
    buf1[13] = _mm_sub_epi32(buf0[5], buf0[13]);
    buf1[6] = _mm_add_epi32(buf0[6], buf0[14]);
    buf1[14] = _mm_sub_epi32(buf0[6], buf0[14]);
    buf1[7] = _mm_add_epi32(buf0[7], buf0[15]);
    buf1[15] = _mm_sub_epi32(buf0[7], buf0[15]);
    buf1[16] = _mm_add_epi32(buf0[16], buf0[24]);
    buf1[24] = _mm_sub_epi32(buf0[16], buf0[24]);
    buf1[17] = _mm_add_epi32(buf0[17], buf0[25]);
    buf1[25] = _mm_sub_epi32(buf0[17], buf0[25]);
    buf1[18] = _mm_add_epi32(buf0[18], buf0[26]);
    buf1[26] = _mm_sub_epi32(buf0[18], buf0[26]);
    buf1[19] = _mm_add_epi32(buf0[19], buf0[27]);
    buf1[27] = _mm_sub_epi32(buf0[19], buf0[27]);
    buf1[20] = _mm_add_epi32(buf0[20], buf0[28]);
    buf1[28] = _mm_sub_epi32(buf0[20], buf0[28]);
    buf1[21] = _mm_add_epi32(buf0[21], buf0[29]);
    buf1[29] = _mm_sub_epi32(buf0[21], buf0[29]);
    buf1[22] = _mm_add_epi32(buf0[22], buf0[30]);
    buf1[30] = _mm_sub_epi32(buf0[22], buf0[30]);
    buf1[23] = _mm_add_epi32(buf0[23], buf0[31]);
    buf1[31] = _mm_sub_epi32(buf0[23], buf0[31]);

    // stage 6
    stage_idx++;
    bit = cos_bit[stage_idx];
    cospi = cospi_arr[bit - cos_bit_min];
    buf0[0] = buf1[0];
    buf0[1] = buf1[1];
    buf0[2] = buf1[2];
    buf0[3] = buf1[3];
    buf0[4] = buf1[4];
    buf0[5] = buf1[5];
    buf0[6] = buf1[6];
    buf0[7] = buf1[7];
    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[8], buf1[9], buf0[8], buf0[9],
                        bit);
    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[10], buf1[11], buf0[10],
                        buf0[11], bit);
    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[12], buf1[13], buf0[12],
                        buf0[13], bit);
    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[14], buf1[15], buf0[14],
                        buf0[15], bit);
    buf0[16] = buf1[16];
    buf0[17] = buf1[17];
    buf0[18] = buf1[18];
    buf0[19] = buf1[19];
    buf0[20] = buf1[20];
    buf0[21] = buf1[21];
    buf0[22] = buf1[22];
    buf0[23] = buf1[23];
    btf_32_sse4_1_type0(cospi[8], cospi[56], buf1[24], buf1[25], buf0[24],
                        buf0[25], bit);
    btf_32_sse4_1_type0(cospi[40], cospi[24], buf1[26], buf1[27], buf0[26],
                        buf0[27], bit);
    btf_32_sse4_1_type0(-cospi[56], cospi[8], buf1[28], buf1[29], buf0[28],
                        buf0[29], bit);
    btf_32_sse4_1_type0(-cospi[24], cospi[40], buf1[30], buf1[31], buf0[30],
                        buf0[31], bit);

    // stage 7
    stage_idx++;
    buf1[0] = _mm_add_epi32(buf0[0], buf0[4]);
    buf1[4] = _mm_sub_epi32(buf0[0], buf0[4]);
    buf1[1] = _mm_add_epi32(buf0[1], buf0[5]);
    buf1[5] = _mm_sub_epi32(buf0[1], buf0[5]);
    buf1[2] = _mm_add_epi32(buf0[2], buf0[6]);
    buf1[6] = _mm_sub_epi32(buf0[2], buf0[6]);
    buf1[3] = _mm_add_epi32(buf0[3], buf0[7]);
    buf1[7] = _mm_sub_epi32(buf0[3], buf0[7]);
    buf1[8] = _mm_add_epi32(buf0[8], buf0[12]);
    buf1[12] = _mm_sub_epi32(buf0[8], buf0[12]);
    buf1[9] = _mm_add_epi32(buf0[9], buf0[13]);
    buf1[13] = _mm_sub_epi32(buf0[9], buf0[13]);
    buf1[10] = _mm_add_epi32(buf0[10], buf0[14]);
    buf1[14] = _mm_sub_epi32(buf0[10], buf0[14]);
    buf1[11] = _mm_add_epi32(buf0[11], buf0[15]);
    buf1[15] = _mm_sub_epi32(buf0[11], buf0[15]);
    buf1[16] = _mm_add_epi32(buf0[16], buf0[20]);
    buf1[20] = _mm_sub_epi32(buf0[16], buf0[20]);
    buf1[17] = _mm_add_epi32(buf0[17], buf0[21]);
    buf1[21] = _mm_sub_epi32(buf0[17], buf0[21]);
    buf1[18] = _mm_add_epi32(buf0[18], buf0[22]);
    buf1[22] = _mm_sub_epi32(buf0[18], buf0[22]);
    buf1[19] = _mm_add_epi32(buf0[19], buf0[23]);
    buf1[23] = _mm_sub_epi32(buf0[19], buf0[23]);
    buf1[24] = _mm_add_epi32(buf0[24], buf0[28]);
    buf1[28] = _mm_sub_epi32(buf0[24], buf0[28]);
    buf1[25] = _mm_add_epi32(buf0[25], buf0[29]);
    buf1[29] = _mm_sub_epi32(buf0[25], buf0[29]);
    buf1[26] = _mm_add_epi32(buf0[26], buf0[30]);
    buf1[30] = _mm_sub_epi32(buf0[26], buf0[30]);
    buf1[27] = _mm_add_epi32(buf0[27], buf0[31]);
    buf1[31] = _mm_sub_epi32(buf0[27], buf0[31]);

    // stage 8
    stage_idx++;
    bit = cos_bit[stage_idx];
    cospi = cospi_arr[bit - cos_bit_min];
    buf0[0] = buf1[0];
    buf0[1] = buf1[1];
    buf0[2] = buf1[2];
    buf0[3] = buf1[3];
    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[4], buf1[5], buf0[4],
                        buf0[5], bit);
    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[6], buf1[7], buf0[6],
                        buf0[7], bit);
    buf0[8] = buf1[8];
    buf0[9] = buf1[9];
    buf0[10] = buf1[10];
    buf0[11] = buf1[11];
    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[12], buf1[13], buf0[12],
                        buf0[13], bit);
    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[14], buf1[15], buf0[14],
                        buf0[15], bit);
    buf0[16] = buf1[16];
    buf0[17] = buf1[17];
    buf0[18] = buf1[18];
    buf0[19] = buf1[19];
    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[20], buf1[21], buf0[20],
                        buf0[21], bit);
    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[22], buf1[23], buf0[22],
                        buf0[23], bit);
    buf0[24] = buf1[24];
    buf0[25] = buf1[25];
    buf0[26] = buf1[26];
    buf0[27] = buf1[27];
    btf_32_sse4_1_type0(cospi[16], cospi[48], buf1[28], buf1[29], buf0[28],
                        buf0[29], bit);
    btf_32_sse4_1_type0(-cospi[48], cospi[16], buf1[30], buf1[31], buf0[30],
                        buf0[31], bit);

    // stage 9
    stage_idx++;
    buf1[0] = _mm_add_epi32(buf0[0], buf0[2]);
    buf1[2] = _mm_sub_epi32(buf0[0], buf0[2]);
    buf1[1] = _mm_add_epi32(buf0[1], buf0[3]);
    buf1[3] = _mm_sub_epi32(buf0[1], buf0[3]);
    buf1[4] = _mm_add_epi32(buf0[4], buf0[6]);
    buf1[6] = _mm_sub_epi32(buf0[4], buf0[6]);
    buf1[5] = _mm_add_epi32(buf0[5], buf0[7]);
    buf1[7] = _mm_sub_epi32(buf0[5], buf0[7]);
    buf1[8] = _mm_add_epi32(buf0[8], buf0[10]);
    buf1[10] = _mm_sub_epi32(buf0[8], buf0[10]);
    buf1[9] = _mm_add_epi32(buf0[9], buf0[11]);
    buf1[11] = _mm_sub_epi32(buf0[9], buf0[11]);
    buf1[12] = _mm_add_epi32(buf0[12], buf0[14]);
    buf1[14] = _mm_sub_epi32(buf0[12], buf0[14]);
    buf1[13] = _mm_add_epi32(buf0[13], buf0[15]);
    buf1[15] = _mm_sub_epi32(buf0[13], buf0[15]);
    buf1[16] = _mm_add_epi32(buf0[16], buf0[18]);
    buf1[18] = _mm_sub_epi32(buf0[16], buf0[18]);
    buf1[17] = _mm_add_epi32(buf0[17], buf0[19]);
    buf1[19] = _mm_sub_epi32(buf0[17], buf0[19]);
    buf1[20] = _mm_add_epi32(buf0[20], buf0[22]);
    buf1[22] = _mm_sub_epi32(buf0[20], buf0[22]);
    buf1[21] = _mm_add_epi32(buf0[21], buf0[23]);
    buf1[23] = _mm_sub_epi32(buf0[21], buf0[23]);
    buf1[24] = _mm_add_epi32(buf0[24], buf0[26]);
    buf1[26] = _mm_sub_epi32(buf0[24], buf0[26]);
    buf1[25] = _mm_add_epi32(buf0[25], buf0[27]);
    buf1[27] = _mm_sub_epi32(buf0[25], buf0[27]);
    buf1[28] = _mm_add_epi32(buf0[28], buf0[30]);
    buf1[30] = _mm_sub_epi32(buf0[28], buf0[30]);
    buf1[29] = _mm_add_epi32(buf0[29], buf0[31]);
    buf1[31] = _mm_sub_epi32(buf0[29], buf0[31]);

    // stage 10
    stage_idx++;
    bit = cos_bit[stage_idx];
    cospi = cospi_arr[bit - cos_bit_min];
    buf0[0] = buf1[0];
    buf0[1] = buf1[1];
    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[2], buf1[3], buf0[2],
                        buf0[3], bit);
    buf0[4] = buf1[4];
    buf0[5] = buf1[5];
    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[6], buf1[7], buf0[6],
                        buf0[7], bit);
    buf0[8] = buf1[8];
    buf0[9] = buf1[9];
    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[10], buf1[11], buf0[10],
                        buf0[11], bit);
    buf0[12] = buf1[12];
    buf0[13] = buf1[13];
    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[14], buf1[15], buf0[14],
                        buf0[15], bit);
    buf0[16] = buf1[16];
    buf0[17] = buf1[17];
    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[18], buf1[19], buf0[18],
                        buf0[19], bit);
    buf0[20] = buf1[20];
    buf0[21] = buf1[21];
    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[22], buf1[23], buf0[22],
                        buf0[23], bit);
    buf0[24] = buf1[24];
    buf0[25] = buf1[25];
    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[26], buf1[27], buf0[26],
                        buf0[27], bit);
    buf0[28] = buf1[28];
    buf0[29] = buf1[29];
    btf_32_sse4_1_type0(cospi[32], cospi[32], buf1[30], buf1[31], buf0[30],
                        buf0[31], bit);

    // stage 11
    stage_idx++;
    buf1[0] = buf0[0];
    buf1[1] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[16]);
    buf1[2] = buf0[24];
    buf1[3] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[8]);
    buf1[4] = buf0[12];
    buf1[5] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[28]);
    buf1[6] = buf0[20];
    buf1[7] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[4]);
    buf1[8] = buf0[6];
    buf1[9] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[22]);
    buf1[10] = buf0[30];
    buf1[11] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[14]);
    buf1[12] = buf0[10];
    buf1[13] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[26]);
    buf1[14] = buf0[18];
    buf1[15] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[2]);
    buf1[16] = buf0[3];
    buf1[17] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[19]);
    buf1[18] = buf0[27];
    buf1[19] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[11]);
    buf1[20] = buf0[15];
    buf1[21] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[31]);
    buf1[22] = buf0[23];
    buf1[23] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[7]);
    buf1[24] = buf0[5];
    buf1[25] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[21]);
    buf1[26] = buf0[29];
    buf1[27] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[13]);
    buf1[28] = buf0[9];
    buf1[29] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[25]);
    buf1[30] = buf0[17];
    buf1[31] = _mm_sub_epi32(_mm_set1_epi32(0), buf0[1]);

    output[0 * col_num + col] = buf1[0];
    output[1 * col_num + col] = buf1[1];
    output[2 * col_num + col] = buf1[2];
    output[3 * col_num + col] = buf1[3];
    output[4 * col_num + col] = buf1[4];
    output[5 * col_num + col] = buf1[5];
    output[6 * col_num + col] = buf1[6];
    output[7 * col_num + col] = buf1[7];
    output[8 * col_num + col] = buf1[8];
    output[9 * col_num + col] = buf1[9];
    output[10 * col_num + col] = buf1[10];
    output[11 * col_num + col] = buf1[11];
    output[12 * col_num + col] = buf1[12];
    output[13 * col_num + col] = buf1[13];
    output[14 * col_num + col] = buf1[14];
    output[15 * col_num + col] = buf1[15];
    output[16 * col_num + col] = buf1[16];
    output[17 * col_num + col] = buf1[17];
    output[18 * col_num + col] = buf1[18];
    output[19 * col_num + col] = buf1[19];
    output[20 * col_num + col] = buf1[20];
    output[21 * col_num + col] = buf1[21];
    output[22 * col_num + col] = buf1[22];
    output[23 * col_num + col] = buf1[23];
    output[24 * col_num + col] = buf1[24];
    output[25 * col_num + col] = buf1[25];
    output[26 * col_num + col] = buf1[26];
    output[27 * col_num + col] = buf1[27];
    output[28 * col_num + col] = buf1[28];
    output[29 * col_num + col] = buf1[29];
    output[30 * col_num + col] = buf1[30];
    output[31 * col_num + col] = buf1[31];
  }
}
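/* Hedged sketch of the per-lane arithmetic assumed for the
 * btf_32_sse4_1_type0() butterflies above (the usual AV1 "type 0" form;
 * check av1's fwd_txfm1d_sse4.h before relying on the exact definition).
 * A 64-bit intermediate is used here for clarity, whereas the SSE4.1 path
 * keeps 32-bit products via _mm_mullo_epi32. */
#include <stdint.h>

static inline int32_t round_shift_sketch(int64_t v, int bit) {
  return (int32_t)((v + ((int64_t)1 << (bit - 1))) >> bit);
}

static inline void btf32_type0_scalar_sketch(int32_t w0, int32_t w1,
                                             int32_t in0, int32_t in1, int bit,
                                             int32_t *out0, int32_t *out1) {
  *out0 = round_shift_sketch((int64_t)w0 * in0 + (int64_t)w1 * in1, bit);
  *out1 = round_shift_sketch((int64_t)w1 * in0 - (int64_t)w0 * in1, bit);
}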
void nb_kernel430_sse2_double(int *           p_nri,
							int *           iinr,
							int *           jindex,
							int *           jjnr,
							int *           shift,
							double *         shiftvec,
							double *         fshift,
							int *           gid,
							double *         pos,
							double *         faction,
							double *         charge,
							double *         p_facel,
							double *         p_krf,
							double *         p_crf,
							double *         Vc,
							int *           type,
							int *           p_ntype,
							double *         vdwparam,
							double *         Vvdw,
							double *         p_tabscale,
							double *         VFtab,
							double *         invsqrta,
							double *         dvda,
							double *         p_gbtabscale,
							double *         GBtab,
							int *           p_nthreads,
							int *           count,
							void *          mtx,
							int *           outeriter,
							int *           inneriter,
							double *         work)
{
	int           nri,ntype,nthreads,offset,tj,tj2,nti;
	int           n,ii,is3,ii3,k,nj0,nj1,jnr1,jnr2,j13,j23,ggid;
	double        facel,krf,crf,tabscl,gbtabscl,vct,vdwt,vgbt,nt1,nt2;
	double        shX,shY,shZ,isai_d,dva;
	gmx_gbdata_t *gbdata;
	float *        gpol;

	__m128d       ix,iy,iz,jx,jy,jz;
	__m128d		  dx,dy,dz,t1,t2,t3;
	__m128d		  fix,fiy,fiz,rsq11,rinv,r,fscal,rt,eps,eps2;
	__m128d		  q,iq,qq,isai,isaj,isaprod,vcoul,gbscale,dvdai,dvdaj;
	__m128d       Y,F,G,H,Fp,VV,FF,vgb,fijC,fijD,fijR,dvdatmp,dvdasum,vctot,n0d;
	__m128d		  xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7,xmm8;
	__m128d       c6,c12,Vvdw6,Vvdw12,Vvdwtmp,Vvdwtot,vgbtot,rinvsq,rinvsix;
	__m128d       fac,tabscale,gbtabscale;
	__m128i       n0,nnn;
	
	const __m128d neg    = {-1.0,-1.0};
	const __m128d zero   = {0.0,0.0};
	const __m128d half   = {0.5,0.5};
	const __m128d two    = {2.0,2.0};
	const __m128d three  = {3.0,3.0};
	const __m128d six    = {6.0,6.0};
	const __m128d twelwe = {12.0,12.0};
	
	const __m128i four   = _mm_set_epi32(4,4,4,4);
	
	gbdata     = (gmx_gbdata_t *)work;
	gpol       = gbdata->gpol;
	
	nri        = *p_nri;
	ntype      = *p_ntype;
	nthreads   = *p_nthreads; 
    facel      = (*p_facel) * (1.0 - (1.0/gbdata->gb_epsilon_solvent));       
	krf        = *p_krf;
	crf        = *p_crf;
	tabscl     = *p_tabscale;
	gbtabscl   = *p_gbtabscale;
	nj1        = 0;
	
	/* Splat variables */
	fac        = _mm_load1_pd(&facel);
	tabscale   = _mm_load1_pd(&tabscl);
	gbtabscale = _mm_load1_pd(&gbtabscl);
	
	/* Keep compiler happy */
	Vvdwtmp = _mm_setzero_pd();
	Vvdwtot = _mm_setzero_pd();
	dvdatmp = _mm_setzero_pd();
	dvdaj   = _mm_setzero_pd();
	isaj    = _mm_setzero_pd();
	vcoul   = _mm_setzero_pd();
	vgb     = _mm_setzero_pd();
	t1      = _mm_setzero_pd();
	t2      = _mm_setzero_pd();
	t3      = _mm_setzero_pd();
	xmm1    = _mm_setzero_pd();
	xmm2    = _mm_setzero_pd();
	xmm3    = _mm_setzero_pd();
	xmm4    = _mm_setzero_pd();
	jnr1    = jnr2 = 0;
	j13     = j23  = 0;
	
	for(n=0;n<nri;n++)
	{
		is3     = 3*shift[n];
		shX     = shiftvec[is3];
		shY     = shiftvec[is3+1];
		shZ     = shiftvec[is3+2];
		
		nj0     = jindex[n];      
        nj1     = jindex[n+1];  
		offset  = (nj1-nj0)%2;
		
		ii      = iinr[n];
		ii3     = ii*3;
		
		ix      = _mm_set1_pd(shX+pos[ii3+0]);
		iy      = _mm_set1_pd(shY+pos[ii3+1]);
		iz      = _mm_set1_pd(shZ+pos[ii3+2]);
		q       = _mm_set1_pd(charge[ii]);
		
		iq      = _mm_mul_pd(fac,q); 
		isai_d  = invsqrta[ii];
		isai    = _mm_load1_pd(&isai_d);
		
		nti      = 2*ntype*type[ii];
		
		fix     = _mm_setzero_pd();
		fiy     = _mm_setzero_pd();
		fiz     = _mm_setzero_pd();
		dvdasum = _mm_setzero_pd();
		vctot   = _mm_setzero_pd();
		vgbtot  = _mm_setzero_pd();
		Vvdwtot = _mm_setzero_pd();
		
		for(k=nj0;k<nj1-offset; k+=2)
		{
			jnr1    = jjnr[k];
			jnr2    = jjnr[k+1];
			
			j13     = jnr1 * 3;
			j23     = jnr2 * 3;
			
			/* Load coordinates */
			xmm1    = _mm_loadu_pd(pos+j13); /* x1 y1 */
			xmm2    = _mm_loadu_pd(pos+j23); /* x2 y2 */
			
			xmm5    = _mm_load_sd(pos+j13+2); /* z1 - */
			xmm6    = _mm_load_sd(pos+j23+2); /* z2 - */
			
			/* transpose */
			jx      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
			jy      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
			jz      = _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); 
			
			/* distances */
			dx      = _mm_sub_pd(ix,jx);
			dy		= _mm_sub_pd(iy,jy);
			dz		= _mm_sub_pd(iz,jz);
			
			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);
			
			/* Load invsqrta */
			isaj	= _mm_loadl_pd(isaj,invsqrta+jnr1);
			isaj	= _mm_loadh_pd(isaj,invsqrta+jnr2);
			isaprod = _mm_mul_pd(isai,isaj);
			
			/* Load charges */
			q		= _mm_loadl_pd(q,charge+jnr1);
			q		= _mm_loadh_pd(q,charge+jnr2);
			qq		= _mm_mul_pd(iq,q);
			
			vcoul	= _mm_mul_pd(qq,rinv);
			fscal	= _mm_mul_pd(vcoul,rinv);
			qq		= _mm_mul_pd(isaprod,qq);
			qq		= _mm_mul_pd(qq,neg);
			gbscale	= _mm_mul_pd(isaprod,gbtabscale);
			
			/* Load VdW parameters */
			tj      = nti+2*type[jnr1];
			tj2     = nti+2*type[jnr2];
			
			xmm1      = _mm_loadu_pd(vdwparam+tj);
			xmm2     = _mm_loadu_pd(vdwparam+tj2);
			c6      = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0));
			c12     = _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1));
			
			/* Load dvdaj */
			dvdaj	= _mm_loadl_pd(dvdaj, dvda+jnr1);
			dvdaj	= _mm_loadh_pd(dvdaj, dvda+jnr2);
			
			/* Calculate GB table index */
			r		= _mm_mul_pd(rsq11,rinv);
			rt		= _mm_mul_pd(r,gbscale);
			n0		= _mm_cvttpd_epi32(rt);
			n0d		= _mm_cvtepi32_pd(n0);
			eps		= _mm_sub_pd(rt,n0d);
			eps2	= _mm_mul_pd(eps,eps);
			
			nnn		= _mm_slli_epi64(n0,2);
			
			xmm1	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G		= _mm_mul_pd(G,eps);
			H		= _mm_mul_pd(H,eps2);
			Fp		= _mm_add_pd(F,G);
			Fp		= _mm_add_pd(Fp,H);
			VV		= _mm_mul_pd(Fp,eps);
			VV		= _mm_add_pd(Y,VV);
			H		= _mm_mul_pd(two,H);
			FF		= _mm_add_pd(Fp,G);
			FF		= _mm_add_pd(FF,H);
			vgb		= _mm_mul_pd(qq,VV);
			fijC	= _mm_mul_pd(qq,FF);
			fijC	= _mm_mul_pd(fijC,gbscale);
			
			dvdatmp = _mm_mul_pd(fijC,r);
			dvdatmp	= _mm_add_pd(vgb,dvdatmp);
			dvdatmp = _mm_mul_pd(dvdatmp,neg);
			dvdatmp = _mm_mul_pd(dvdatmp,half);
			dvdasum	= _mm_add_pd(dvdasum,dvdatmp);
			
			xmm1	= _mm_mul_pd(dvdatmp,isaj);
			xmm1	= _mm_mul_pd(xmm1,isaj);
			dvdaj	= _mm_add_pd(dvdaj,xmm1);
			
			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			_mm_storeh_pd(dvda+jnr2,dvdaj);
			
			vctot	= _mm_add_pd(vctot,vcoul);
			vgbtot	= _mm_add_pd(vgbtot,vgb);
			
			/* Calculate VDW table index */
			rt      = _mm_mul_pd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rt);
			n0d     = _mm_cvtepi32_pd(n0);
			eps     = _mm_sub_pd(rt,n0d);
			eps2    = _mm_mul_pd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
			/* Tabulated VdW interaction - dispersion */
			xmm1	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			Vvdw6   = _mm_mul_pd(c6,VV);
			fijD    = _mm_mul_pd(c6,FF);
			
			/* Tabulated VdW interaction - repulsion */
			nnn     = _mm_add_epi32(nnn,four);
			
			xmm1	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_pd(G,eps);
			H       = _mm_mul_pd(H,eps2);
			Fp      = _mm_add_pd(F,G);
			Fp      = _mm_add_pd(Fp,H);
			VV      = _mm_mul_pd(Fp,eps);
			VV      = _mm_add_pd(Y,VV);
			xmm1    = _mm_mul_pd(two,H);
			FF      = _mm_add_pd(Fp,G);
			FF      = _mm_add_pd(FF,xmm1);
			
			Vvdw12  = _mm_mul_pd(c12,VV);
			fijR    = _mm_mul_pd(c12,FF);
			
			Vvdwtmp = _mm_add_pd(Vvdw12,Vvdw6);
			Vvdwtot = _mm_add_pd(Vvdwtot,Vvdwtmp);
			
			xmm1    = _mm_add_pd(fijD,fijR);
			xmm1    = _mm_mul_pd(xmm1,tabscale);
			xmm1    = _mm_add_pd(xmm1,fijC);
			xmm1    = _mm_sub_pd(xmm1,fscal);
			fscal   = _mm_mul_pd(xmm1,neg);
			fscal   = _mm_mul_pd(fscal,rinv);
			
			/* calculate partial force terms */
			t1		= _mm_mul_pd(fscal,dx);
			t2		= _mm_mul_pd(fscal,dy);
			t3		= _mm_mul_pd(fscal,dz);
			
			/* update the i force */
			fix		= _mm_add_pd(fix,t1);
			fiy		= _mm_add_pd(fiy,t2);
			fiz		= _mm_add_pd(fiz,t3);
			
			/* accumulate forces from memory */
			xmm1	= _mm_loadu_pd(faction+j13); /* fx1 fy1 */
			xmm2	= _mm_loadu_pd(faction+j23); /* fx2 fy2 */
			
			xmm5	= _mm_load1_pd(faction+j13+2); /* fz1 fz1 */
			xmm6	= _mm_load1_pd(faction+j23+2); /* fz2 fz2 */
			
			/* transpose */
			xmm7	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fz1 fz2 */
			xmm5	= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* fx1 fx2 */
			xmm6	= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* fy1 fy2 */
			
			/* subtract partial forces */
			xmm5	= _mm_sub_pd(xmm5,t1);
			xmm6	= _mm_sub_pd(xmm6,t2);
			xmm7	= _mm_sub_pd(xmm7,t3);
			
			xmm1	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(0,0)); /* fx1 fy1 */
			xmm2	= _mm_shuffle_pd(xmm5,xmm6,_MM_SHUFFLE2(1,1)); /* fx2 fy2 */
			
			/* store fx and fy */
			_mm_storeu_pd(faction+j13,xmm1);
			_mm_storeu_pd(faction+j23,xmm2);
			
			/* .. then fz */
			_mm_storel_pd(faction+j13+2,xmm7);
			_mm_storeh_pd(faction+j23+2,xmm7);
		}
		
		/* In double precision, offset can only be either 0 or 1 */
		if(offset!=0)
		{
			jnr1	= jjnr[k];
			j13		= jnr1*3;
			
			jx      = _mm_load_sd(pos+j13);
			jy      = _mm_load_sd(pos+j13+1);
			jz      = _mm_load_sd(pos+j13+2);
			
			isaj	= _mm_load_sd(invsqrta+jnr1);
			isaprod = _mm_mul_sd(isai,isaj);
			dvdaj	= _mm_load_sd(dvda+jnr1);
			q		= _mm_load_sd(charge+jnr1);
			qq      = _mm_mul_sd(iq,q);
			
			dx      = _mm_sub_sd(ix,jx);
			dy		= _mm_sub_sd(iy,jy);
			dz		= _mm_sub_sd(iz,jz);
			
			rsq11   = _mm_add_pd( _mm_add_pd( _mm_mul_pd(dx,dx) , _mm_mul_pd(dy,dy) ) , _mm_mul_pd(dz,dz) );
			rinv    = my_invrsq_pd(rsq11);
			
			vcoul	= _mm_mul_sd(qq,rinv);
			fscal	= _mm_mul_sd(vcoul,rinv);
			qq		= _mm_mul_sd(isaprod,qq);
			qq		= _mm_mul_sd(qq,neg);
			gbscale	= _mm_mul_sd(isaprod,gbtabscale);
			
			/* Load VdW parameters */
			tj      = nti+2*type[jnr1];
			
			c6      = _mm_load_sd(vdwparam+tj);
			c12     = _mm_load_sd(vdwparam+tj+1);
			
			/* Calculate GB table index */
			r		= _mm_mul_sd(rsq11,rinv);
			rt		= _mm_mul_sd(r,gbscale);
			n0		= _mm_cvttpd_epi32(rt);
			n0d		= _mm_cvtepi32_pd(n0);
			eps		= _mm_sub_sd(rt,n0d);
			eps2	= _mm_mul_sd(eps,eps);
			
			nnn		= _mm_slli_epi64(n0,2);
			
			xmm1	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))); 
			xmm2	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))); 
			xmm3	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,0))+2); 
			xmm4	= _mm_load_pd(GBtab+(_mm_extract_epi64(nnn,1))+2); 
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); 
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); 
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); 
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); 
			
			G		= _mm_mul_sd(G,eps);
			H		= _mm_mul_sd(H,eps2);
			Fp		= _mm_add_sd(F,G);
			Fp		= _mm_add_sd(Fp,H);
			VV		= _mm_mul_sd(Fp,eps);
			VV		= _mm_add_sd(Y,VV);
			H		= _mm_mul_sd(two,H);
			FF		= _mm_add_sd(Fp,G);
			FF		= _mm_add_sd(FF,H);
			vgb		= _mm_mul_sd(qq,VV);
			fijC	= _mm_mul_sd(qq,FF);
			fijC	= _mm_mul_sd(fijC,gbscale);
			
			dvdatmp = _mm_mul_sd(fijC,r);
			dvdatmp	= _mm_add_sd(vgb,dvdatmp);
			dvdatmp = _mm_mul_sd(dvdatmp,neg);
			dvdatmp = _mm_mul_sd(dvdatmp,half);
			dvdasum	= _mm_add_sd(dvdasum,dvdatmp);
			
			xmm1	= _mm_mul_sd(dvdatmp,isaj);
			xmm1	= _mm_mul_sd(xmm1,isaj);
			dvdaj	= _mm_add_sd(dvdaj,xmm1);
			
			/* store dvda */
			_mm_storel_pd(dvda+jnr1,dvdaj);
			
			vctot	= _mm_add_sd(vctot,vcoul);
			vgbtot	= _mm_add_sd(vgbtot,vgb);
			
			/* Calculate VDW table index */
			rt      = _mm_mul_sd(r,tabscale);
			n0      = _mm_cvttpd_epi32(rt);
			n0d     = _mm_cvtepi32_pd(n0);
			eps     = _mm_sub_sd(rt,n0d);
			eps2    = _mm_mul_sd(eps,eps);
			nnn     = _mm_slli_epi32(n0,3);
			
			/* Tabulated VdW interaction - dispersion */
			xmm1	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			Vvdw6   = _mm_mul_sd(c6,VV);
			fijD    = _mm_mul_sd(c6,FF);
			
			/* Tabulated VdW interaction - repulsion */
			nnn     = _mm_add_epi32(nnn,four);
			
			xmm1	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0)));   /* Y1 F1 */
			xmm2	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1)));   /* Y2 F2 */
			xmm3	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,0))+2); /* G1 H1 */
			xmm4	= _mm_load_pd(VFtab+(_mm_extract_epi64(nnn,1))+2); /* G2 H2 */
			
			Y		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(0,0)); /* Y1 Y2 */
			F		= _mm_shuffle_pd(xmm1,xmm2,_MM_SHUFFLE2(1,1)); /* F1 F2 */
			G		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(0,0)); /* G1 G2 */
			H		= _mm_shuffle_pd(xmm3,xmm4,_MM_SHUFFLE2(1,1)); /* H1 H2 */
			
			G       = _mm_mul_sd(G,eps);
			H       = _mm_mul_sd(H,eps2);
			Fp      = _mm_add_sd(F,G);
			Fp      = _mm_add_sd(Fp,H);
			VV      = _mm_mul_sd(Fp,eps);
			VV      = _mm_add_sd(Y,VV);
			xmm1    = _mm_mul_sd(two,H);
			FF      = _mm_add_sd(Fp,G);
			FF      = _mm_add_sd(FF,xmm1);
			
			Vvdw12  = _mm_mul_sd(c12,VV);
			fijR    = _mm_mul_sd(c12,FF);
			
			Vvdwtmp = _mm_add_sd(Vvdw12,Vvdw6);
			Vvdwtot = _mm_add_sd(Vvdwtot,Vvdwtmp);
			
			xmm1    = _mm_add_sd(fijD,fijR);
			xmm1    = _mm_mul_sd(xmm1,tabscale);
			xmm1    = _mm_add_sd(xmm1,fijC);
			xmm1    = _mm_sub_sd(xmm1,fscal);
			fscal   = _mm_mul_sd(xmm1,neg);
			fscal   = _mm_mul_sd(fscal,rinv);
			
			/* calculate partial force terms */
			t1		= _mm_mul_sd(fscal,dx);
			t2		= _mm_mul_sd(fscal,dy);
			t3		= _mm_mul_sd(fscal,dz);
			
			/* update the i force */
			fix		= _mm_add_sd(fix,t1);
			fiy		= _mm_add_sd(fiy,t2);
			fiz		= _mm_add_sd(fiz,t3);
			
			/* accumulate forces from memory */
			xmm5	= _mm_load_sd(faction+j13);   /* fx */
			xmm6    = _mm_load_sd(faction+j13+1); /* fy */
			xmm7    = _mm_load_sd(faction+j13+2); /* fz */
			
			/* subtract partial forces */
			xmm5	= _mm_sub_sd(xmm5,t1);
			xmm6	= _mm_sub_sd(xmm6,t2);
			xmm7	= _mm_sub_sd(xmm7,t3);
			
			/* store forces */
			_mm_store_sd(faction+j13,xmm5);
			_mm_store_sd(faction+j13+1,xmm6);
			_mm_store_sd(faction+j13+2,xmm7);
		}
		
		/* fix/fiy/fiz each hold two partial force terms that should all be
		 * added to the i particle forces
		 */
		t1		 = _mm_unpacklo_pd(t1,fix);
		t2		 = _mm_unpacklo_pd(t2,fiy);
		t3		 = _mm_unpacklo_pd(t3,fiz);
		
		fix		 = _mm_add_pd(fix,t1);
		fiy		 = _mm_add_pd(fiy,t2);
		fiz		 = _mm_add_pd(fiz,t3);
		
		fix      = _mm_shuffle_pd(fix,fix,_MM_SHUFFLE2(1,1));
		fiy      = _mm_shuffle_pd(fiy,fiy,_MM_SHUFFLE2(1,1));
		fiz      = _mm_shuffle_pd(fiz,fiz,_MM_SHUFFLE2(1,1));
		
		/* Load i forces from memory */
		xmm1     = _mm_load_sd(faction+ii3);
		xmm2     = _mm_load_sd(faction+ii3+1);
		xmm3     = _mm_load_sd(faction+ii3+2);
		
		/* Add to i force */
		fix      = _mm_add_sd(fix,xmm1);
		fiy      = _mm_add_sd(fiy,xmm2);
		fiz      = _mm_add_sd(fiz,xmm3);
		
		/* store i forces to memory */
		_mm_store_sd(faction+ii3,fix);
		_mm_store_sd(faction+ii3+1,fiy);
		_mm_store_sd(faction+ii3+2,fiz);
		
		/* now do dvda */
		dvdatmp  = _mm_unpacklo_pd(dvdatmp,dvdasum);
		dvdasum  = _mm_add_pd(dvdasum,dvdatmp);
		_mm_storeh_pd(&dva,dvdasum);
		dvda[ii] = dvda[ii] + dva*isai_d*isai_d;
		
		ggid	 = gid[n];
		
		/* Coulomb potential */
		vcoul	 = _mm_unpacklo_pd(vcoul,vctot);
		vctot	 = _mm_add_pd(vctot,vcoul);
		_mm_storeh_pd(&vct,vctot);
		Vc[ggid] = Vc[ggid] + vct;
		
		/* VdW potential */
		Vvdwtmp	 = _mm_unpacklo_pd(Vvdwtmp,Vvdwtot);
		Vvdwtot	 = _mm_add_pd(Vvdwtot,Vvdwtmp);
		_mm_storeh_pd(&vdwt,Vvdwtot);
		Vvdw[ggid] = Vvdw[ggid] + vdwt;
		
		/* GB potential */
		vgb  	 = _mm_unpacklo_pd(vgb,vgbtot);
		vgbtot	 = _mm_add_pd(vgbtot,vgb);
		_mm_storeh_pd(&vgbt,vgbtot);
		gpol[ggid] = gpol[ggid] + vgbt;
	}
	
	*outeriter   = nri;
	*inneriter   = nj1;
}
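/* Scalar sketch of the cubic-spline table interpolation that the kernel above
 * performs for both the GB and the VdW tables: each table point stores four
 * doubles (Y, F, G, H), rt = r*scale selects the point, eps is the fractional
 * offset, VV is the interpolated potential and FF the force-related quantity.
 * Names and the 4-doubles-per-point stride are illustrative assumptions. */
static void table_interp_sketch(const double *tab, double r, double scale,
                                double *VV, double *FF)
{
	double rt  = r * scale;
	int    n0  = (int)rt;            /* truncate, like _mm_cvttpd_epi32 */
	double eps = rt - (double)n0;
	const double *p = tab + 4 * n0;  /* Y F G H */
	double Geps  = p[2] * eps;
	double Heps2 = p[3] * eps * eps;
	double Fp    = p[1] + Geps + Heps2;
	*VV = p[0] + eps * Fp;
	*FF = Fp + Geps + 2.0 * Heps2;
}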
Example #27
/* Input image must be large enough to have valid pixels for the offset (dx,dy).
   I.e., with (dx,dy)=(-10,8), x-values down to -10 and y-values up to (h-1)+8 will be accessed.
   The integral image will be accessed with (x,y) in [-1,w)x[-1,h).

   Note also that we use 32 bits for the integral image even though the values may overflow
   that range. However, the modulo arithmetic used when computing the block sums later
   will still be correct as long as the block size is not too large.
 */
static void buildIntegralImage_SSE(uint32_t* integral_image, int integral_stride,
                                   const uint8_t* current_image, int current_image_stride,
				   const uint8_t* compare_image, int compare_image_stride,
				   int  w,int  hStart, int hEnd, 
				   int dx,int dy)
{
    const __m128i zero = _mm_set1_epi8(0);


    memset(integral_image -1 -integral_stride, 0, (w+1)*sizeof(uint32_t));

    for (int y=hStart;y<hEnd;y++) {
        const uint8_t* p1 = current_image +  y    *current_image_stride;
        const uint8_t* p2 = compare_image + (y+dy)*compare_image_stride + dx;

        uint32_t* out = integral_image + y*integral_stride-1;

        __m128i prevadd = _mm_set1_epi32(0);
        const int pixels_step = 16;

        *out++ = 0;

        for (int x=0 ; x<w ; x+=pixels_step)
        {
            __m128i pa, pb;
            __m128i pla, plb;
            __m128i ldiff, lldiff, lhdiff;
            __m128i ltmp,htmp;
            __m128i ladd,hadd;
            __m128i pha,phb;
            __m128i hdiff,hldiff,hhdiff;
            __m128i l2tmp,h2tmp;



            pa = _mm_loadu_si128((__m128i*)p1);
            pb = _mm_loadu_si128((__m128i*)p2);

            pla = _mm_unpacklo_epi8(pa,zero);
            plb = _mm_unpacklo_epi8(pb,zero);

            ldiff = _mm_sub_epi16(pla,plb);
            ldiff = _mm_mullo_epi16(ldiff,ldiff);

            lldiff = _mm_unpacklo_epi16(ldiff,zero);
            lhdiff = _mm_unpackhi_epi16(ldiff,zero);

            ltmp = _mm_slli_si128(lldiff, 4);
            lldiff = _mm_add_epi32(lldiff, ltmp);
            ltmp = _mm_slli_si128(lldiff, 8);
            lldiff = _mm_add_epi32(lldiff, ltmp);
            lldiff = _mm_add_epi32(lldiff, prevadd);

            ladd = _mm_shuffle_epi32(lldiff, 0xff);

            htmp = _mm_slli_si128(lhdiff, 4);
            lhdiff = _mm_add_epi32(lhdiff, htmp);
            htmp = _mm_slli_si128(lhdiff, 8);
            lhdiff = _mm_add_epi32(lhdiff, htmp);
            lhdiff = _mm_add_epi32(lhdiff, ladd);

            prevadd = _mm_shuffle_epi32(lhdiff, 0xff);

            _mm_store_si128((__m128i*)(out),  lldiff);
            _mm_store_si128((__m128i*)(out+4),lhdiff);



            pha = _mm_unpackhi_epi8(pa,zero);
            phb = _mm_unpackhi_epi8(pb,zero);
            hdiff = _mm_sub_epi16(pha,phb);

            hdiff = _mm_mullo_epi16(hdiff,hdiff);

            hldiff = _mm_unpacklo_epi16(hdiff,zero);
            hhdiff = _mm_unpackhi_epi16(hdiff,zero);
            l2tmp = _mm_slli_si128(hldiff, 4);
            hldiff = _mm_add_epi32(hldiff, l2tmp);
            l2tmp = _mm_slli_si128(hldiff, 8);
            hldiff = _mm_add_epi32(hldiff, l2tmp);
            hldiff = _mm_add_epi32(hldiff, prevadd);
            hadd = _mm_shuffle_epi32(hldiff, 0xff);
            h2tmp = _mm_slli_si128(hhdiff, 4);
            hhdiff = _mm_add_epi32(hhdiff, h2tmp);
            h2tmp = _mm_slli_si128(hhdiff, 8);
            hhdiff = _mm_add_epi32(hhdiff, h2tmp);
            hhdiff = _mm_add_epi32(hhdiff, hadd);

            prevadd = _mm_shuffle_epi32(hhdiff, 0xff);

            _mm_store_si128((__m128i*)(out+8), hldiff);
            _mm_store_si128((__m128i*)(out+12),hhdiff);


            out+=pixels_step;
            p1 +=pixels_step;
            p2 +=pixels_step;
        }

        if (y>0) {
            out = integral_image + y*integral_stride;

            for (int x=0 ; x<w ; x+=pixels_step) {
                *((__m128i*)out) = _mm_add_epi32(*(__m128i*)(out-integral_stride),
                                                 *(__m128i*)(out));

                *((__m128i*)(out+4)) = _mm_add_epi32(*(__m128i*)(out+4-integral_stride),
                                                     *(__m128i*)(out+4));

                *((__m128i*)(out+8)) = _mm_add_epi32(*(__m128i*)(out+8-integral_stride),
                                                     *(__m128i*)(out+8));

                *((__m128i*)(out+12)) = _mm_add_epi32(*(__m128i*)(out+12-integral_stride),
                                                      *(__m128i*)(out+12));

                out += 4*4;
            }
        }
    }
}
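/* Sketch of how an integral image like the one built above is consumed: the
 * block sum over [x-k, x+k] x [y-k, y+k] is the usual four-corner difference.
 * Because uint32_t arithmetic is modular, the overflow mentioned in the
 * comment above cancels in this difference as long as the true block sum
 * itself fits in 32 bits.  The caller must keep the corners inside the
 * [-1,w) x [-1,h) range that the builder guarantees. */
#include <stdint.h>

static uint32_t block_sum_sketch(const uint32_t* integral, int stride,
                                 int x, int y, int k)
{
    return integral[(y + k) * stride + (x + k)]
         - integral[(y + k) * stride + (x - k - 1)]
         - integral[(y - k - 1) * stride + (x + k)]
         + integral[(y - k - 1) * stride + (x - k - 1)];
}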
Example #28
 FourColorVals VCALL operator+(FourColorVals b) { return FourColorVals(_mm_add_epi32(vec,b.vec)); }
Example #29
void sincos_ps(__m128 x, __m128 *s, __m128 *c) {
  __m128 xmm1, xmm2, xmm3 = _mm_setzero_ps(), sign_bit_sin, y;
  __m128i emm0, emm2, emm4;
  sign_bit_sin = x;
  x = _mm_and_ps(x, *reinterpret_cast<const __m128*>(_pi_inv_sign_mask));
  sign_bit_sin = _mm_and_ps(sign_bit_sin,
                            *reinterpret_cast<const __m128*>(_pi_sign_mask));
  y = _mm_mul_ps(x, *_ps_cephes_FOPI);
  emm2 = _mm_cvttps_epi32(y);
  emm2 = _mm_add_epi32(emm2, *_pi_1);
  emm2 = _mm_and_si128(emm2, *_pi_inv1);
  y = _mm_cvtepi32_ps(emm2);
  emm4 = emm2;
  emm0 = _mm_and_si128(emm2, *_pi_4);
  emm0 = _mm_slli_epi32(emm0, 29);
  __m128 swap_sign_bit_sin = _mm_castsi128_ps(emm0);
  emm2 = _mm_and_si128(emm2, *_pi_2);
  emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
  __m128 poly_mask = _mm_castsi128_ps(emm2);
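  // Cody-Waite style reduction: subtract j*pi/4 in three parts to preserve precision.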
  xmm1 = *_ps_minus_cephes_DP1;
  xmm2 = *_ps_minus_cephes_DP2;
  xmm3 = *_ps_minus_cephes_DP3;
  xmm1 = _mm_mul_ps(y, xmm1);
  xmm2 = _mm_mul_ps(y, xmm2);
  xmm3 = _mm_mul_ps(y, xmm3);
  x = _mm_add_ps(x, xmm1);
  x = _mm_add_ps(x, xmm2);
  x = _mm_add_ps(x, xmm3);
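  // Derive the sign of the cos result from the octant (bit 2 of j - 2).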
  emm4 = _mm_sub_epi32(emm4, *_pi_2);
  emm4 = _mm_andnot_si128(emm4, *_pi_4);
  emm4 = _mm_slli_epi32(emm4, 29);
  __m128 sign_bit_cos = _mm_castsi128_ps(emm4);
  sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  __m128 z = _mm_mul_ps(x, x);
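  // Cosine polynomial on z = x*x: y = 1 - z/2 + z^2 * C(z).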
  y = *_ps_coscof_p0;
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *_ps_coscof_p1);
  y = _mm_mul_ps(y, z);
  y = _mm_add_ps(y, *_ps_coscof_p2);
  y = _mm_mul_ps(y, z);
  y = _mm_mul_ps(y, z);
  __m128 tmp = _mm_mul_ps(z, *_ps_0p5);
  y = _mm_sub_ps(y, tmp);
  y = _mm_add_ps(y, *_ps_1);
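  // Sine polynomial: y2 = x + x*z * S(z).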
  __m128 y2 = *_ps_sincof_p0;
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *_ps_sincof_p1);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_add_ps(y2, *_ps_sincof_p2);
  y2 = _mm_mul_ps(y2, z);
  y2 = _mm_mul_ps(y2, x);
  y2 = _mm_add_ps(y2, x);
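  // Per lane, poly_mask decides which polynomial feeds sin and which feeds cos;
  // the precomputed sign bits are applied with XOR.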
  xmm3 = poly_mask;
  __m128 ysin2 = _mm_and_ps(xmm3, y2);
  __m128 ysin1 = _mm_andnot_ps(xmm3, y);
  y2 = _mm_sub_ps(y2, ysin2);
  y = _mm_sub_ps(y, ysin1);
  xmm1 = _mm_add_ps(ysin1, ysin2);
  xmm2 = _mm_add_ps(y, y2);
  *s = _mm_xor_ps(xmm1, sign_bit_sin);
  *c = _mm_xor_ps(xmm2, sign_bit_cos);
}
Example #30
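// 5x5 convolution of an 8-bit plane: five source rows are staged in a ring of line
// buffers, each group of 16 pixels is accumulated against the 25 coefficients with
// _mm_madd_epi16, then scaled by rdiv, offset by bias, and packed back to bytes.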
static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(convolution_t *ch, uint8_t *buff, int bstride, int width,
               int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *orig = p0, *end = p4;

    line_copy8(p0, srcp + 2 * stride, width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128 rdiv = _mm_set1_ps((float)ch->rdiv);
    __m128 bias = _mm_set1_ps((float)ch->bias);
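    // Spread each coefficient to [m,0,m,0,...] so _mm_madd_epi16 with zero-extended
    // pixels produces one 32-bit product per lane.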
    __m128i matrix[25];
    for (int i = 0; i < 25; i++) {
        matrix[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m[i]), zero);
    }

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);
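        // 25 row pointers at horizontal offsets -2..+2 form the 5x5 neighborhood.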
        uint8_t *array[] = {
            p0 - 2, p0 - 1, p0, p0 + 1, p0 + 2,
            p1 - 2, p1 - 1, p1, p1 + 1, p1 + 2,
            p2 - 2, p2 - 1, p2, p2 + 1, p2 + 2,
            p3 - 2, p3 - 1, p3, p3 + 1, p3 + 2,
            p4 - 2, p4 - 1, p4, p4 + 1, p4 + 2
        };

        for (int x = 0; x < width; x += 16) {
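            // Four 32-bit accumulators cover the 16 pixels handled per iteration.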
            __m128i sum[4] = { zero, zero, zero, zero };

            for (int i = 0; i < 25; i++) {
                __m128i xmm0, xmm1, xmm2;

                xmm0 = _mm_loadu_si128((__m128i *)(array[i] + x));
                xmm2 = _mm_unpackhi_epi8(xmm0, zero);
                xmm0 = _mm_unpacklo_epi8(xmm0, zero);
                
                xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i]));
                sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i]));

                xmm1 = _mm_unpackhi_epi16(xmm2, zero);
                xmm0 = _mm_unpacklo_epi16(xmm2, zero);
                sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i]));
                sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i]));
            }

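            // Convert to float, apply rdiv and bias, take the absolute value when
            // saturation is disabled, then truncate back to integers.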
            for (int i = 0; i < 4; i++) {
                __m128 sumfp = _mm_cvtepi32_ps(sum[i]);
                sumfp = _mm_mul_ps(sumfp, rdiv);
                sumfp = _mm_add_ps(sumfp, bias);
                if (!ch->saturate) {
                    sumfp = mm_abs_ps(sumfp);
                }
                sum[i] = _mm_cvttps_epi32(sumfp);
            }

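            // Narrow 32->16->8 with saturation and store 16 output pixels.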
            sum[0] = _mm_packs_epi32(sum[0], sum[1]);
            sum[1] = _mm_packs_epi32(sum[2], sum[3]);
            sum[0] = _mm_packus_epi16(sum[0], sum[1]);

            _mm_store_si128((__m128i *)(dstp + x), sum[0]);
        }
        dstp += stride;
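        // Rotate the ring of line buffers; p4 wraps back to the first buffer after
        // reaching the last one.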
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}