int32_t dot_product(int16_t *x,
                    int16_t *y,
                    uint32_t N, //must be a multiple of 8
                    uint8_t output_shift)
{

  uint32_t n;

#if defined(__x86_64__) || defined(__i386__)
  __m128i *x128,*y128,mmtmp1,mmtmp2,mmtmp3,mmcumul,mmcumul_re,mmcumul_im;
  __m64 mmtmp7;
  __m128i minus_i = _mm_set_epi16(-1,1,-1,1,-1,1,-1,1);
  int32_t result;

  x128 = (__m128i*) x;
  y128 = (__m128i*) y;

  mmcumul_re = _mm_setzero_si128();
  mmcumul_im = _mm_setzero_si128();

  for (n=0; n<(N>>2); n++) {

    //printf("n=%d, x128=%p, y128=%p\n",n,x128,y128);
    //    print_shorts("x",&x128[0]);
    //    print_shorts("y",&y128[0]);

    // this computes Re(z) = Re(x)*Re(y) + Im(x)*Im(y)
    mmtmp1 = _mm_madd_epi16(x128[0],y128[0]);
    //    print_ints("re",&mmtmp1);
    // mmtmp1 contains real part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp1 = _mm_srai_epi32(mmtmp1,output_shift);
    mmcumul_re = _mm_add_epi32(mmcumul_re,mmtmp1);
    //    print_ints("re",&mmcumul_re);


    // this computes Im(z) = Re(x)*Im(y) - Re(y)*Im(x)
    mmtmp2 = _mm_shufflelo_epi16(y128[0],_MM_SHUFFLE(2,3,0,1));
    //    print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_shufflehi_epi16(mmtmp2,_MM_SHUFFLE(2,3,0,1));
    //    print_shorts("y",&mmtmp2);
    mmtmp2 = _mm_sign_epi16(mmtmp2,minus_i);
    //        print_shorts("y",&mmtmp2);

    mmtmp3 = _mm_madd_epi16(x128[0],mmtmp2);
    //        print_ints("im",&mmtmp3);
    // mmtmp3 contains imag part of 4 consecutive outputs (32-bit)

    // shift and accumulate results
    mmtmp3 = _mm_srai_epi32(mmtmp3,output_shift);
    mmcumul_im = _mm_add_epi32(mmcumul_im,mmtmp3);
    //    print_ints("im",&mmcumul_im);

    x128++;
    y128++;
  }

  // this gives Re Re Im Im
  mmcumul = _mm_hadd_epi32(mmcumul_re,mmcumul_im);
  //  print_ints("cumul1",&mmcumul);

  // this gives Re Im Re Im
  mmcumul = _mm_hadd_epi32(mmcumul,mmcumul);

  //  print_ints("cumul2",&mmcumul);


  //mmcumul = _mm_srai_epi32(mmcumul,output_shift);
  // extract the lower half
  mmtmp7 = _mm_movepi64_pi64(mmcumul);
  //  print_ints("mmtmp7",&mmtmp7);
  // pack the result
  mmtmp7 = _mm_packs_pi32(mmtmp7,mmtmp7);
  //  print_shorts("mmtmp7",&mmtmp7);
  // convert back to integer
  result = _mm_cvtsi64_si32(mmtmp7);

  _mm_empty();
  _m_empty();

  return(result);

#elif defined(__arm__)
  int16x4_t *x_128=(int16x4_t*)x;
  int16x4_t *y_128=(int16x4_t*)y;
  int32x4_t tmp_re,tmp_im;
  int32x4_t tmp_re1,tmp_im1;
  int32x4_t re_cumul,im_cumul;
  int32x2_t re_cumul2,im_cumul2;
  int32x4_t shift = vdupq_n_s32(-output_shift); 
  int32x2x2_t result2;
  int16_t conjug[4]__attribute__((aligned(16))) = {-1,1,-1,1} ;

  re_cumul = vdupq_n_s32(0);
  im_cumul = vdupq_n_s32(0); 

  for (n=0; n<(N>>2); n++) {

    tmp_re  = vmull_s16(*x_128++, *y_128++);
    //tmp_re = [Re(x[0])Re(y[0]) Im(x[0])Im(y[0]) Re(x[1])Re(y[1]) Im(x[1])Im(y[1])] 
    tmp_re1 = vmull_s16(*x_128++, *y_128++);
    //tmp_re1 = [Re(x1[1])Re(x2[1]) Im(x1[1])Im(x2[1]) Re(x1[1])Re(x2[2]) Im(x1[1])Im(x2[2])] 
    tmp_re  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_re),vget_high_s32(tmp_re)),
                           vpadd_s32(vget_low_s32(tmp_re1),vget_high_s32(tmp_re1)));
    //tmp_re = [Re(ch[0])Re(rx[0])+Im(ch[0])Im(ch[0]) Re(ch[1])Re(rx[1])+Im(ch[1])Im(ch[1]) Re(ch[2])Re(rx[2])+Im(ch[2]) Im(ch[2]) Re(ch[3])Re(rx[3])+Im(ch[3])Im(ch[3])] 

    tmp_im  = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im = [-Im(ch[0])Re(rx[0]) Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1]) Re(ch[1])Im(rx[1])]
    tmp_im1 = vmull_s16(vrev32_s16(vmul_s16(*x_128++,*(int16x4_t*)conjug)),*y_128++);
    //tmp_im1 = [-Im(ch[2])Re(rx[2]) Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3]) Re(ch[3])Im(rx[3])]
    tmp_im  = vcombine_s32(vpadd_s32(vget_low_s32(tmp_im),vget_high_s32(tmp_im)),
                           vpadd_s32(vget_low_s32(tmp_im1),vget_high_s32(tmp_im1)));
    //tmp_im = [-Im(ch[0])Re(rx[0])+Re(ch[0])Im(rx[0]) -Im(ch[1])Re(rx[1])+Re(ch[1])Im(rx[1]) -Im(ch[2])Re(rx[2])+Re(ch[2])Im(rx[2]) -Im(ch[3])Re(rx[3])+Re(ch[3])Im(rx[3])]

    re_cumul = vqaddq_s32(re_cumul,vqshlq_s32(tmp_re,shift));
    im_cumul = vqaddq_s32(im_cumul,vqshlq_s32(tmp_im,shift));
  }
  
  re_cumul2 = vpadd_s32(vget_low_s32(re_cumul),vget_high_s32(re_cumul));
  im_cumul2 = vpadd_s32(vget_low_s32(im_cumul),vget_high_s32(im_cumul));
  re_cumul2 = vpadd_s32(re_cumul2,re_cumul2);
  im_cumul2 = vpadd_s32(im_cumul2,im_cumul2);
  result2   = vzip_s32(re_cumul2,im_cumul2);
  return(vget_lane_s32(result2.val[0],0));
#endif
}
 __m64 interpolvline_1(	unsigned char* image,	int PicWidthInPix){
	
	 
	 
	__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;

	__m64 ret;

	xmm7 = _mm_setzero_si128();

	xmm0 = _mm_movpi64_epi64(*((__m64*)(image - 2*PicWidthInPix)));
	xmm0 = _mm_unpacklo_epi8(xmm0,xmm7);
	xmm1 = _mm_movpi64_epi64(*((__m64*)(image - 1*PicWidthInPix)));
	xmm1 = _mm_unpacklo_epi8(xmm1,xmm7);
	xmm2 = _mm_movpi64_epi64(*((__m64*)(image - 0*PicWidthInPix)));
	xmm2 = _mm_unpacklo_epi8(xmm2,xmm7);
	xmm3 = _mm_movpi64_epi64(*((__m64*)(image + 1*PicWidthInPix)));
	xmm3 = _mm_unpacklo_epi8(xmm3,xmm7);
	xmm4 = _mm_movpi64_epi64(*((__m64*)(image + 2*PicWidthInPix)));
	xmm4 = _mm_unpacklo_epi8(xmm4,xmm7);
	xmm5 = _mm_movpi64_epi64(*((__m64*)(image + 3*PicWidthInPix)));
	xmm5 = _mm_unpacklo_epi8(xmm5,xmm7);

// filter on 8 values
	xmm6 = _mm_add_epi16(xmm2,xmm3);
	xmm6 = _mm_slli_epi16(xmm6,2);
	xmm6 = _mm_sub_epi16(xmm6,xmm1);
	xmm6 = _mm_sub_epi16(xmm6,xmm4);

	xmm1 = _mm_set_epi32(0x00050005,0x00050005,0x00050005,0x00050005);
	xmm6 = _mm_mullo_epi16(xmm6,xmm1);
	xmm6 = _mm_add_epi16(xmm6,xmm0);
	xmm6 = _mm_add_epi16(xmm6,xmm5);
	xmm6 = _mm_add_epi16(xmm6,_mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010));
	xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values
	xmm6 = _mm_srli_epi16(xmm6,5);

	xmm2 = _mm_packus_epi16(xmm2,xmm7);
	xmm3 = _mm_packus_epi16(xmm3,xmm7);
	xmm6 = _mm_packus_epi16(xmm6,xmm7);

	xmm5 = _mm_unpacklo_epi8(xmm2,xmm6);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm3);
	xmm6 = _mm_avg_epu8(xmm4,xmm5);

	xmm6 = _mm_slli_epi16(xmm6,8);
	xmm6 = _mm_srli_epi16(xmm6,8);
	xmm6 = _mm_packus_epi16(xmm6,xmm7);
	
	ret = _mm_movepi64_pi64(xmm6);
	_mm_empty(); 

	return(ret);
}
 __m64 interpolhline_1(unsigned char* image){

	__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
	unsigned char* imagetmp = image - 2;
	__m64 ret;

	xmm7 = _mm_setzero_si128();
	xmm6 = _mm_loadu_si128(((__m128i*)imagetmp));


	xmm0 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm1 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm2 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm3 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm5 = _mm_unpacklo_epi8(xmm6,xmm7);

// filter on 8 values
	xmm6 = _mm_add_epi16(xmm2,xmm3);//(C + D)
	xmm6 = _mm_slli_epi16(xmm6,2);//(C + D) << 2
	xmm6 = _mm_sub_epi16(xmm6,xmm1);//((C + D) << 2) - B
	xmm6 = _mm_sub_epi16(xmm6,xmm4);//((C + D) << 2) - B - E

	xmm1 = _mm_set_epi32(0x00050005,0x00050005,0x00050005,0x00050005);
	xmm6 = _mm_mullo_epi16(xmm6,xmm1);//(((C + D) << 2) - B - E) * 5
	xmm6 = _mm_add_epi16(xmm6,xmm0);//((((C + D) << 2) - B - E) * 5) + A
	xmm6 = _mm_add_epi16(xmm6,xmm5);//((((C + D) << 2) - B - E) * 5) + A + F
	xmm6 = _mm_add_epi16(xmm6,_mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010));//((((C + D) << 2) - B - E) * 5) + A + F + 16
	xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values Clip255_16
	xmm6 = _mm_srli_epi16(xmm6,5); // result0 >> 5

	xmm2 = _mm_packus_epi16(xmm2,xmm7);
	xmm3 = _mm_packus_epi16(xmm3,xmm7);
	xmm6 = _mm_packus_epi16(xmm6,xmm7);

	xmm5 = _mm_unpacklo_epi8(xmm2,xmm6);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm3);
	xmm6 = _mm_avg_epu8(xmm4,xmm5);

	xmm6 = _mm_slli_epi16(xmm6,8);
	xmm6 = _mm_srli_epi16(xmm6,8);
	xmm6 = _mm_packus_epi16(xmm6,xmm7);

	ret = _mm_movepi64_pi64(xmm6);
	_mm_empty(); 

	return(ret);
}
void luma_sample_interp_8x8_3_3_SSE2(unsigned char* sortie, unsigned char* image,const short PicWidthInPix,const short OutStride){


	unsigned char* tmpimg = image + 1;
	unsigned char* tmpimg2 = image + PicWidthInPix;
	__m64 *ptr_out = (__m64 *)sortie;

	int i;
	for(i = 0; i < 8 * PicWidthInPix; i += PicWidthInPix, ptr_out += (OutStride >> 3))
	{
		__m128i verthalf = _mm_movpi64_epi64(interpolvline_2(tmpimg + i,PicWidthInPix));
		__m128i horizhalf = _mm_movpi64_epi64(interpolhline_2(tmpimg2 + i));
		*ptr_out = _mm_movepi64_pi64(_mm_avg_epu8(verthalf,horizhalf));
	}

	_mm_empty(); 
}
 __m64 interpolhline_2(unsigned char* image){

	__m128i xmm0,xmm1,xmm2,xmm3,xmm4,xmm5,xmm6,xmm7;
	unsigned char* imagetmp = image - 2;
	__m64 ret;

	xmm7 = _mm_setzero_si128();
	xmm6 =  _mm_loadu_si128(((__m128i*)imagetmp));


	xmm0 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm1 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm2 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm3 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm7);
	xmm6 = _mm_srli_si128(xmm6,1);
	xmm5 = _mm_unpacklo_epi8(xmm6,xmm7);

// filter on 8 values
	xmm6 = _mm_add_epi16(xmm2,xmm3);
	xmm6 = _mm_slli_epi16(xmm6,2);
	xmm6 = _mm_sub_epi16(xmm6,xmm1);
	xmm6 = _mm_sub_epi16(xmm6,xmm4);

	xmm1 = _mm_set_epi32(0x00050005,0x00050005,0x00050005,0x00050005);
	xmm6 = _mm_mullo_epi16(xmm6,xmm1);
	xmm6 = _mm_add_epi16(xmm6,xmm0);
	xmm6 = _mm_add_epi16(xmm6,xmm5);
	xmm6 = _mm_add_epi16(xmm6,_mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010));
	xmm6 = _mm_max_epi16(xmm6, xmm7); // preventing negative values
	xmm6 = _mm_srli_epi16(xmm6,5);

	xmm6 = _mm_packus_epi16(xmm6,xmm7);
	
	ret = _mm_movepi64_pi64(xmm6);
	_mm_empty(); 

	return(ret);
}
void luma_sample_interp_8x8_3_2_SSE2(unsigned char* sortie, unsigned char* image,const short PicWidthInPix,const short OutStride){
	
	__m64 *ptr_out = (__m64 *)sortie;
	unsigned char* tmpimg = image + 1;
	__m128i temp[13];
	int i;
	for(i = 0; i < 13;i++)	{
		_mm_store_si128(temp + i, interpolhline128(image + PicWidthInPix * (i - 2)));
	}

	for(i = 0; i < 8;i++, ptr_out += (OutStride >> 3))
	{

		__m128i verthalf = _mm_movpi64_epi64(interpolvline_2(tmpimg + i*PicWidthInPix,PicWidthInPix));
		__m128i centerpix = _mm_movpi64_epi64(interpolvline128_2(temp+i));

		*ptr_out = _mm_movepi64_pi64(_mm_avg_epu8(verthalf,centerpix));
	}

	_mm_empty(); 
}
 __m64 interpolvline128_3(__m128i* temp){



	__m128i xmm6;

	__m64 ret;

	__m128i xmm7 = _mm_setzero_si128();

	__m128i xmm0 = _mm_load_si128(temp++);
	__m128i xmm1 = _mm_load_si128(temp++);
	__m128i xmm2 = _mm_load_si128(temp++);
	__m128i xmm3 = _mm_load_si128(temp++);
	__m128i xmm4 = _mm_load_si128(temp++);
	__m128i xmm5 = _mm_load_si128(temp);

	xmm1 = _mm_add_epi16(xmm1,xmm4);
	xmm0 = _mm_add_epi16(xmm0,xmm5);

	xmm6 = _mm_set_epi32(0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB,0xFFFBFFFB);

	xmm4 = _mm_mullo_epi16(xmm1, xmm6);
	xmm5 = _mm_mulhi_epi16(xmm1, xmm6);

	xmm1 = _mm_unpacklo_epi16(xmm4, xmm5);
	xmm6 = _mm_unpackhi_epi16(xmm4, xmm5);

	xmm7 = _mm_set_epi32(0x00140014,0x00140014,0x00140014,0x00140014);
	xmm5 = _mm_add_epi16(xmm2,xmm3);

	xmm4 = _mm_mullo_epi16(xmm5, xmm7);
	xmm5 = _mm_mulhi_epi16(xmm5, xmm7);

	xmm7 = _mm_unpacklo_epi16(xmm4, xmm5);
	xmm4 = _mm_unpackhi_epi16(xmm4, xmm5);

	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm6); 

	xmm6 = _mm_set_epi32(0x00010001,0x00010001,0x00010001,0x00010001);
	xmm6 = _mm_mulhi_epi16(xmm0, xmm6);

	xmm1 = _mm_unpacklo_epi16(xmm0, xmm6);
	xmm6 = _mm_unpackhi_epi16(xmm0, xmm6);

	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm6); 
	
	xmm1 = _mm_set_epi32(0x00000200,0x00000200,0x00000200,0x00000200);
	
	xmm7 = _mm_add_epi32(xmm7,xmm1);
	xmm4 = _mm_add_epi32(xmm4,xmm1);
	
	xmm5 = _mm_setzero_si128();

	xmm7 = _mm_srli_epi32(xmm7, 10);
	xmm7 = _mm_max_epi16(xmm7, xmm5); // preventing negative values
	xmm7 = _mm_slli_epi32(xmm7,16);
	xmm7 = _mm_srli_epi32(xmm7,16);

	xmm4 = _mm_srli_epi32(xmm4, 10);

	xmm4 = _mm_max_epi16(xmm4, xmm5); // preventing negative values
	xmm4 = _mm_slli_epi32(xmm4,16);
	xmm4 = _mm_srli_epi32(xmm4,16);

	xmm6 = _mm_packs_epi32(xmm7, xmm4);
	
	xmm1 = _mm_set_epi32(0x00100010,0x00100010,0x00100010,0x00100010);
	xmm2 = _mm_add_epi16(xmm2,xmm1);
	xmm2 = _mm_max_epi16(xmm2, xmm5); // preventing negative values
	xmm2 = _mm_srli_epi16(xmm2,5);

	
	xmm3 = _mm_add_epi16(xmm3,xmm1);
	xmm3 = _mm_max_epi16(xmm3, xmm5); // preventing negative values
	xmm3 = _mm_srli_epi16(xmm3,5);

	xmm2 = _mm_packus_epi16(xmm2,xmm5);
	xmm3 = _mm_packus_epi16(xmm3,xmm5);
	xmm6 = _mm_packus_epi16(xmm6,xmm5);

	xmm7 = _mm_unpacklo_epi8(xmm2,xmm6);
	xmm4 = _mm_unpacklo_epi8(xmm6,xmm3);
	xmm6 = _mm_avg_epu8(xmm4,xmm7);

	xmm6 = _mm_srli_epi16(xmm6,8);
	xmm6 = _mm_packus_epi16(xmm6,xmm5);
	ret = _mm_movepi64_pi64(xmm6);

	_mm_empty(); 

	return(ret);
}