/*
 * Produces an 8x8 block of bytes in sortie. Each output row is the
 * byte-wise rounded average (_mm_avg_epu8) of
 *   - the 8 bytes returned by interpolvline_2() for the source row,
 *     taken one pixel to the right of image, and
 *   - the 8 bytes returned by interpolhline_2() for the source row,
 *     taken one line below image.
 *
 * sortie        destination; 8 bytes are written per row.
 * image         source pixel at the block's top-left corner.
 * PicWidthInPix distance in bytes between two source rows.
 * OutStride     distance in bytes between two destination rows.
 *
 * Always processes 8 rows, like luma_sample_interp_8x8_3_2_SSE2.
 * The destination is stepped in bytes, so OutStride does not have to be a
 * multiple of 8, and the row is written with an unaligned-safe 64-bit store.
 */
void luma_sample_interp_8x8_3_3_SSE2(unsigned char* sortie, unsigned char* image,const short PicWidthInPix,const short OutStride){

	unsigned char *vert_src = image + 1;              /* one pixel to the right */
	unsigned char *horiz_src = image + PicWidthInPix; /* one line below */
	unsigned char *dst = sortie;
	int row;

	for (row = 0; row < 8; row++) {
		__m128i verthalf = _mm_movpi64_epi64(interpolvline_2(vert_src, PicWidthInPix));
		__m128i horizhalf = _mm_movpi64_epi64(interpolhline_2(horiz_src));

		/* Low 64 bits hold the 8 averaged pixels of this row. */
		_mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(verthalf, horizhalf));

		vert_src += PicWidthInPix;
		horiz_src += PicWidthInPix;
		dst += OutStride;
	}

	/* The helpers hand back __m64 values, so clear the MMX state before returning. */
	_mm_empty();
}
/*
 * Produces an 8x8 block of bytes in sortie. Each output row is the
 * byte-wise rounded average (_mm_avg_epu8) of
 *   - the 8 bytes returned by interpolvline_2() for the source row,
 *     taken one pixel to the right of image, and
 *   - the 8 bytes returned by interpolvline128_2() applied to a table of
 *     interpolhline128() results for source rows -2 .. +10.
 *
 * sortie        destination; 8 bytes are written per row.
 * image         source pixel at the block's top-left corner.
 * PicWidthInPix distance in bytes between two source rows.
 * OutStride     distance in bytes between two destination rows.
 *
 * The destination is stepped in bytes, so OutStride does not have to be a
 * multiple of 8, and the row is written with an unaligned-safe 64-bit store.
 */
void luma_sample_interp_8x8_3_2_SSE2(unsigned char* sortie, unsigned char* image,const short PicWidthInPix,const short OutStride){

	enum { BLOCK_ROWS = 8, FILTERED_ROWS = 13, ROWS_ABOVE = 2 };

	unsigned char *dst = sortie;
	unsigned char *vert_src = image + 1; /* one pixel to the right */
	__m128i temp[FILTERED_ROWS];
	int i;

	/* Horizontal pass over rows -2 .. +10, kept at 16-bit lane width for the
	 * second pass (presumably unrounded intermediates - confirm against
	 * interpolhline128). */
	for (i = 0; i < FILTERED_ROWS; i++) {
		_mm_store_si128(temp + i, interpolhline128(image + PicWidthInPix * (i - ROWS_ABOVE)));
	}

	for (i = 0; i < BLOCK_ROWS; i++) {
		__m128i verthalf = _mm_movpi64_epi64(interpolvline_2(vert_src, PicWidthInPix));
		__m128i centerpix = _mm_movpi64_epi64(interpolvline128_2(temp + i));

		/* Low 64 bits hold the 8 averaged pixels of this row. */
		_mm_storel_epi64((__m128i *)dst, _mm_avg_epu8(verthalf, centerpix));

		vert_src += PicWidthInPix;
		dst += OutStride;
	}

	/* The helpers hand back __m64 values, so clear the MMX state before returning. */
	_mm_empty();
}
 /*
  * Vertical quarter-sample interpolation of 8 horizontally adjacent pixels.
  *
  * For each of the 8 columns starting at image, the 6-tap kernel
  * (1, -5, 20, 20, -5, 1) is applied to rows -2 .. +3, the sum is rounded
  * with +16, shifted right by 5 and clamped to 0..255. The result is then
  * averaged (rounding up, _mm_avg_epu8) with the pixel of row 0.
  *
  * image         address of the first of the 8 pixels in row 0; rows
  *               -2 .. +3 must be readable for 8 bytes each.
  * PicWidthInPix distance in bytes between two rows.
  *
  * Returns the 8 result bytes packed in an __m64.
  *
  * Rows are read with _mm_loadl_epi64, so image needs no particular alignment.
  */
 __m64 interpolvline_1(unsigned char* image, int PicWidthInPix){

	const __m128i zero = _mm_setzero_si128();
	__m128i row_m2, row_m1, row_0, row_p1, row_p2, row_p3;
	__m128i acc, half, current;
	__m64 ret;

	/* Load 8 bytes per row and widen them to 16-bit lanes. */
	row_m2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(image - 2*PicWidthInPix)), zero);
	row_m1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(image - 1*PicWidthInPix)), zero);
	row_0  = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(image)), zero);
	row_p1 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(image + 1*PicWidthInPix)), zero);
	row_p2 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(image + 2*PicWidthInPix)), zero);
	row_p3 = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i *)(image + 3*PicWidthInPix)), zero);

	/* ((r0 + r1) * 4 - r-1 - r2) * 5 + r-2 + r3  ==  1,-5,20,20,-5,1 */
	acc = _mm_add_epi16(row_0, row_p1);
	acc = _mm_slli_epi16(acc, 2);
	acc = _mm_sub_epi16(acc, row_m1);
	acc = _mm_sub_epi16(acc, row_p2);
	acc = _mm_mullo_epi16(acc, _mm_set1_epi16(5));
	acc = _mm_add_epi16(acc, row_m2);
	acc = _mm_add_epi16(acc, row_p3);

	/* Round, drop negative sums, scale back; packus clamps the top at 255. */
	acc = _mm_add_epi16(acc, _mm_set1_epi16(16));
	acc = _mm_max_epi16(acc, zero);
	acc = _mm_srli_epi16(acc, 5);
	half = _mm_packus_epi16(acc, zero);

	/* Quarter sample: average of the half-sample value and the row-0 pixel. */
	current = _mm_packus_epi16(row_0, zero);
	ret = _mm_movepi64_pi64(_mm_avg_epu8(half, current));

	/* As in the original, the MMX state is cleared before returning. */
	_mm_empty();

	return ret;
}
/* Example no. 4 (0) */
/*
 * Builds one doubled output row for lines_scale2().
 *
 * left/right  centre row shifted so that lane k holds the left/right neighbour.
 * veto        lanes where no replacement may happen.
 * edge        the row above (or below) the centre row.
 * centre2x    the 8 centre pixels, each duplicated.
 *
 * Lanes 4..11 of the inputs are the 8 pixels being processed. The even output
 * byte of a pixel takes the edge pixel when left == edge, the odd output byte
 * when right == edge; everything else keeps the centre pixel.
 */
static __m128i sc2_select_row(__m128i left, __m128i right, __m128i veto, __m128i edge, __m128i centre2x)
{
   const __m128i zero = _mm_setzero_si128();
   __m128i pick_left, pick_right, pick, edge2x;

   pick_left = _mm_cmpeq_epi8(left, edge);
   pick_left = _mm_andnot_si128(veto, pick_left);
   pick_left = _mm_srli_si128(pick_left, 4);
   pick_left = _mm_unpacklo_epi8(pick_left, zero);    /* mask in even bytes */

   pick_right = _mm_cmpeq_epi8(right, edge);
   pick_right = _mm_andnot_si128(veto, pick_right);
   pick_right = _mm_srli_si128(pick_right, 4);
   pick_right = _mm_unpacklo_epi8(zero, pick_right);  /* mask in odd bytes */

   pick = _mm_or_si128(pick_left, pick_right);

   edge2x = _mm_srli_si128(edge, 4);
   edge2x = _mm_unpacklo_epi8(edge2x, edge2x);

   return _mm_or_si128(_mm_and_si128(pick, edge2x), _mm_andnot_si128(pick, centre2x));
}

/*
 * Doubles a scanline of 8-bit pixels horizontally and vertically, writing the
 * upper output row to dst1 and the lower one to dst2.
 *
 * With B/H the pixels above/below, D/F the pixels left/right and E the centre
 * pixel, each source pixel becomes a 2x2 cell:
 *   if B != H and D != F:  top    = (D == B ? B : E), (F == B ? B : E)
 *                          bottom = (D == H ? H : E), (F == H ? H : E)
 *   otherwise:             all four are E
 * (this is the Scale2x rule).
 *
 * src   indexed as a ring of 8 rows spaced sc2lines_width bytes apart; rows
 *       (y-1)&7, y&7 and (y+1)&7 are used as above / centre / below.
 * y     centre row number.
 * dst1, dst2  output rows; 16 bytes are written per 8 input pixels with
 *       aligned stores, so dst1 and dst2 must be 16-byte aligned.
 * nPix  number of input pixels; processed in groups of 8.
 *
 * When a group has a B != H pixel, 16 bytes starting 4 bytes before the group
 * are read from each of the three rows, so those bytes must be readable.
 *
 * Only SSE2 registers are used, so no _mm_empty() is required afterwards.
 */
void lines_scale2(const unsigned char *src, unsigned y, unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
   const unsigned char
      *u = src + ((y-1) & 7)*sc2lines_width,
      *m = src + ((y+0) & 7)*sc2lines_width,
      *l = src + ((y+1) & 7)*sc2lines_width;

   for (unsigned i = 0; i < nPix; i += 8) {

      /* Quick test on the 8 pixels of this group: is above == below everywhere? */
      __m128i up8 = _mm_loadl_epi64((const __m128i*)(u+i));
      __m128i down8 = _mm_loadl_epi64((const __m128i*)(l+i));
      int same = _mm_movemask_epi8(_mm_cmpeq_epi8(up8, down8)) & 0xFF;

      if (same != 0xFF) {

         /* Lanes 4..11 are pixels i..i+7; the extra lanes supply neighbours. */
         __m128i mid = _mm_loadu_si128((const __m128i*)(m+i-4));
         __m128i up = _mm_loadu_si128((const __m128i*)(u+i-4));
         __m128i down = _mm_loadu_si128((const __m128i*)(l+i-4));

         __m128i left = _mm_slli_si128(mid,1);   /* lane k = pixel k-1 */
         __m128i right = _mm_srli_si128(mid,1);  /* lane k = pixel k+1 */

         /* No replacement where D == F or B == H. */
         __m128i veto = _mm_or_si128(_mm_cmpeq_epi8(left,right), _mm_cmpeq_epi8(up,down));

         __m128i centre2x = _mm_srli_si128(mid,4);
         centre2x = _mm_unpacklo_epi8(centre2x,centre2x);

         _mm_store_si128((__m128i*)(dst1 + 2*i), sc2_select_row(left, right, veto, up, centre2x));
         _mm_store_si128((__m128i*)(dst2 + 2*i), sc2_select_row(left, right, veto, down, centre2x));

      } else {

         /* Nothing can change: plain pixel doubling of the centre row. */
         __m128i v1 = _mm_loadl_epi64((const __m128i*)(m+i));
         v1 = _mm_unpacklo_epi8(v1,v1);
         _mm_store_si128((__m128i*)(dst1 + 2*i), v1);
         _mm_store_si128((__m128i*)(dst2 + 2*i), v1);
      }
   }
}