Ejemplo n.º 1
0
/**
 * Adds the per-channel offsets in m_offset to the image data using 64-bit
 * MMX operations (8 bytes per step, laid out as R,G,B,A,R,G,B,A).
 *
 * When m_saturate is set the bytes are added with unsigned saturation
 * (_mm_adds_pu8); otherwise a plain wrapping byte add (_mm_add_pi8) is used.
 *
 * NOTE(review): the step count is (ysize * xsize) >> 1, so when the pixel
 * count is odd the final pixel is left unmodified.
 *
 * @param image image whose data buffer is modified in place.
 */
void pix_offset :: processRGBAMMX(imageStruct &image)
{
  const char R = m_offset[chRed];
  const char G = m_offset[chGreen];
  const char B = m_offset[chBlue];
  const char A = m_offset[chAlpha];

  // The `register` storage class was removed in C++17, so plain locals are
  // used here.
  int pixsize = (image.ysize * image.xsize)>>1;

  const __m64 offset_64 = _mm_setr_pi8(R, G, B, A, R, G, B, A);
  __m64*data_p= (__m64*)image.data;
  _mm_empty();

  if(m_saturate) {
    // Saturating variant: each byte clamps at 255 instead of wrapping.
    while(pixsize--) {
      *data_p = _mm_adds_pu8(*data_p, offset_64);
      data_p++;
    }
  } else {
    // Wrapping variant: bytes overflow modulo 256.
    while(pixsize--) {
      *data_p = _mm_add_pi8(*data_p, offset_64);
      data_p++;
    }
  }
  // Leave the MMX state empty before returning.
  _mm_empty();
}
Ejemplo n.º 2
0
/**
 * Maps a key to a bucket index using a small MMX mixing loop.
 *
 * Keys shorter than 8 bytes are delegated to fnv_data2bucket(). Longer keys
 * are consumed in 8-byte groups: from each group the bytes at offsets 0 and 4
 * are read as `char`, widened to 32 bits, added to the running state with
 * 16-bit lane adds, and the state is then shifted left by 3 in each 32-bit
 * lane. Trailing bytes (size % 8) do not contribute to the result.
 *
 * NOTE(review): only two bytes of every 8-byte group are read; this looks
 * like an intended 32-bit word load, but it is left unchanged so existing
 * hash values stay stable.
 *
 * @param key        pointer to the key bytes.
 * @param size       key length in bytes.
 * @param NoOfItems  number of buckets.
 * @return bucket index; on the MMX path 0 is returned when NoOfItems is 0
 *         (previously a division by zero).
 */
unsigned int mmx_hash_bucket_data(unsigned char *key, int size, int NoOfItems)
{
    if (size < 8) return(fnv_data2bucket(key, size, NoOfItems));

    // Read through `char` (as the original did) so the widening of each byte
    // is unchanged; the explicit cast makes the conversion valid C++.
    const char *p = reinterpret_cast<const char *>(key);
    const char *const end = p + size;

    _mm_empty();                            // emms
    __m64 v1 = _mm_set1_pi32(FNV_INIT_VAL);

    while ((end - p) > 7)
    {
        const __m64 v2 = _mm_setr_pi32(*p, *(p + 4));
        v1 = _mm_add_pi16(v1, v2);
        v1 = _mm_slli_pi32(v1, 3);
        p += 8;
    }

    const int val = _mm_cvtsi64_si32(v1);
    _mm_empty();                            // emms

    // Fold negative values as `1 - val`, but in unsigned arithmetic so that
    // INT_MIN and -INT_MAX no longer trigger signed overflow.
    const unsigned int folded = (val < 0)
        ? 1u - static_cast<unsigned int>(val)
        : static_cast<unsigned int>(val);

    // |NoOfItems| reproduces the result of the former signed `%` for every
    // input on which that was well defined.
    const unsigned int buckets = (NoOfItems < 0)
        ? 0u - static_cast<unsigned int>(NoOfItems)
        : static_cast<unsigned int>(NoOfItems);
    if (buckets == 0) return 0;

    return folded % buckets;
}
Ejemplo n.º 3
0
/**
 * Blends `overlay` onto `base` through `mask`, plane by plane (Y, U, V).
 *
 * The implementation is picked from the CPU flags reported by env: SSE2
 * first, then MMX (only when X86_32 is defined), then plain C. When opacity
 * equals 256 the variants without an opacity argument are used; otherwise
 * the *_opacity variants receive the current opacity.
 */
void OL_BlendImage::BlendImageMask(Image444* base, Image444* overlay, Image444* mask) {
  BYTE* dstY = base->GetPtr(PLANAR_Y);
  BYTE* dstU = base->GetPtr(PLANAR_U);
  BYTE* dstV = base->GetPtr(PLANAR_V);

  BYTE* srcY = overlay->GetPtr(PLANAR_Y);
  BYTE* srcU = overlay->GetPtr(PLANAR_U);
  BYTE* srcV = overlay->GetPtr(PLANAR_V);

  BYTE* mskY = mask->GetPtr(PLANAR_Y);
  BYTE* mskU = mask->GetPtr(PLANAR_U);
  BYTE* mskV = mask->GetPtr(PLANAR_V);

  int width = base->w();
  int height = base->h();

  // ---- Full opacity: mask-only blend -------------------------------------
  if (opacity == 256) {
    if (env->GetCPUFlags() & CPUF_SSE2) {
      overlay_blend_sse2_plane_masked(dstY, srcY, mskY, base->pitch, overlay->pitch, mask->pitch, width, height);
      overlay_blend_sse2_plane_masked(dstU, srcU, mskU, base->pitch, overlay->pitch, mask->pitch, width, height);
      overlay_blend_sse2_plane_masked(dstV, srcV, mskV, base->pitch, overlay->pitch, mask->pitch, width, height);
      return;
    }
#ifdef X86_32
    if (env->GetCPUFlags() & CPUF_MMX) {
      overlay_blend_mmx_plane_masked(dstY, srcY, mskY, base->pitch, overlay->pitch, mask->pitch, width, height);
      overlay_blend_mmx_plane_masked(dstU, srcU, mskU, base->pitch, overlay->pitch, mask->pitch, width, height);
      overlay_blend_mmx_plane_masked(dstV, srcV, mskV, base->pitch, overlay->pitch, mask->pitch, width, height);
      _mm_empty();  // clear the MMX state after the MMX kernels
      return;
    }
#endif
    overlay_blend_c_plane_masked(dstY, srcY, mskY, base->pitch, overlay->pitch, mask->pitch, width, height);
    overlay_blend_c_plane_masked(dstU, srcU, mskU, base->pitch, overlay->pitch, mask->pitch, width, height);
    overlay_blend_c_plane_masked(dstV, srcV, mskV, base->pitch, overlay->pitch, mask->pitch, width, height);
    return;
  }

  // ---- Partial opacity: mask combined with opacity -----------------------
  if (env->GetCPUFlags() & CPUF_SSE2) {
    overlay_blend_sse2_plane_masked_opacity(dstY, srcY, mskY, base->pitch, overlay->pitch, mask->pitch, width, height, opacity);
    overlay_blend_sse2_plane_masked_opacity(dstU, srcU, mskU, base->pitch, overlay->pitch, mask->pitch, width, height, opacity);
    overlay_blend_sse2_plane_masked_opacity(dstV, srcV, mskV, base->pitch, overlay->pitch, mask->pitch, width, height, opacity);
    return;
  }
#ifdef X86_32
  if (env->GetCPUFlags() & CPUF_MMX) {
    overlay_blend_mmx_plane_masked_opacity(dstY, srcY, mskY, base->pitch, overlay->pitch, mask->pitch, width, height, opacity);
    overlay_blend_mmx_plane_masked_opacity(dstU, srcU, mskU, base->pitch, overlay->pitch, mask->pitch, width, height, opacity);
    overlay_blend_mmx_plane_masked_opacity(dstV, srcV, mskV, base->pitch, overlay->pitch, mask->pitch, width, height, opacity);
    _mm_empty();  // clear the MMX state after the MMX kernels
    return;
  }
#endif
  overlay_blend_c_plane_masked_opacity(dstY, srcY, mskY, base->pitch, overlay->pitch, mask->pitch, width, height, opacity);
  overlay_blend_c_plane_masked_opacity(dstU, srcU, mskU, base->pitch, overlay->pitch, mask->pitch, width, height, opacity);
  overlay_blend_c_plane_masked_opacity(dstV, srcV, mskV, base->pitch, overlay->pitch, mask->pitch, width, height, opacity);
}
Ejemplo n.º 4
0
// Copies nBytes bytes from Src to Dest with 32-bit x86 inline assembly.
//
// The bulk is moved in 64-byte chunks through the eight MMX registers
// (mm0..mm7); the remaining (nBytes & 63) bytes are copied with `rep movsb`.
// The MMX state is emptied with _mm_empty() both before and after the copy.
//
// NOTE(review): `rep movsb` copies forward only when the direction flag is
// clear -- presumably guaranteed by the calling convention; confirm.
extern "C" void __cdecl memcpyMMX(void *Dest, void *Src, size_t nBytes)
{
	_mm_empty();

    __asm 
    {
        // esi = source cursor, edi = destination cursor, ecx = chunk count
        mov     esi, dword ptr[Src]
        mov     edi, dword ptr[Dest]
        mov     ecx, nBytes
        shr     ecx, 6                      // nBytes / 64
		cmp     ecx, 0
		je      Myloop 
align 8
CopyLoop:
        // Load 64 bytes into mm0..mm7, then store them to the destination.
        movq    mm0, qword ptr[esi]
        movq    mm1, qword ptr[esi+8*1]
        movq    mm2, qword ptr[esi+8*2]
        movq    mm3, qword ptr[esi+8*3]
        movq    mm4, qword ptr[esi+8*4]
        movq    mm5, qword ptr[esi+8*5]
        movq    mm6, qword ptr[esi+8*6]
        movq    mm7, qword ptr[esi+8*7]
        movq    qword ptr[edi], mm0
        movq    qword ptr[edi+8*1], mm1
        movq    qword ptr[edi+8*2], mm2
        movq    qword ptr[edi+8*3], mm3
        movq    qword ptr[edi+8*4], mm4
        movq    qword ptr[edi+8*5], mm5
        movq    qword ptr[edi+8*6], mm6
        movq    qword ptr[edi+8*7], mm7
        add     esi, 64
        add     edi, 64
        // `loop` decrements ecx and repeats while it is non-zero.
        loop CopyLoop
//		emms
align 8
Myloop:
        // Tail: ecx = number of bytes left over after the 64-byte chunks.
        mov     ecx, nBytes
        and     ecx, 63
        cmp     ecx, 0
        je EndCopyLoop
;align 8
;CopyLoop2:
        // This byte copy does not advance esi/edi; the same byte is copied
        // again by the `rep movsb` below, so the pair is redundant.
        mov dl, byte ptr[esi] 
        mov byte ptr[edi], dl
;        inc esi
;        inc edi
;        dec ecx
;        jne CopyLoop2
		 rep movsb
EndCopyLoop:
    }

	_mm_empty();
}
Ejemplo n.º 5
0
/**
 * Adds a fixed per-byte offset pattern (U, Y, V, Y, U, Y, V, Y) to the image
 * data using wrapping 8-bit MMX adds, 8 bytes per step.
 *
 * The step count is (ysize * xsize) >> 2; any bytes beyond that many 8-byte
 * groups are left unmodified.
 *
 * NOTE(review): U, Y and V are not declared in this function -- presumably
 * members or values defined elsewhere in the class; confirm.
 *
 * @param image image whose data buffer is modified in place.
 */
void pix_offset :: processYUVMMX(imageStruct &image)
{
  // The `register` storage class was removed in C++17, so plain locals are
  // used here.
  int pixsize = (image.ysize * image.xsize)>>2;

  const __m64 offset_64 = _mm_setr_pi8(U, Y, V, Y, U, Y, V, Y);
  __m64*data_p= (__m64*)image.data;
  _mm_empty();

  while(pixsize--) {
    *data_p = _mm_add_pi8(*data_p, offset_64);
    data_p++;
  }
  // Leave the MMX state empty before returning.
  _mm_empty();
}
Ejemplo n.º 6
0
/**
 * Multiplies `image` by `right` byte-wise and stores the result in `image`:
 * every byte becomes (left * right) >> 8.
 *
 * The buffers are walked from the last 8-byte group down to the first. The
 * group count is rounded up, so a buffer whose size is not a multiple of 8
 * bytes is accessed up to the next 8-byte boundary.
 */
void pix_multiply :: processRGBA_MMX(imageStruct &image, imageStruct &right)
{
    const int byteCount = image.xsize * image.ysize * image.csize;
    __m64*dst = (__m64*)image.data;
    __m64*src = (__m64*)right.data;

    // Number of 64-bit groups, rounded up.
    int group = byteCount/sizeof(__m64)+(byteCount%sizeof(__m64)!=0);

    const __m64 zero = _mm_setzero_si64();
    while(group--)    {
        const __m64 lhs = dst[group];
        const __m64 rhs = src[group];

        // Widen the 8 bytes of each operand to two vectors of 16-bit lanes.
        const __m64 lhsLo = _mm_unpacklo_pi8(lhs, zero);
        const __m64 rhsLo = _mm_unpacklo_pi8(rhs, zero);
        const __m64 lhsHi = _mm_unpackhi_pi8(lhs, zero);
        const __m64 rhsHi = _mm_unpackhi_pi8(rhs, zero);

        // 16-bit products scaled back to 8 bits by dropping the low byte.
        const __m64 prodLo = _mm_srli_pi16(_mm_mullo_pi16(lhsLo, rhsLo), 8);
        const __m64 prodHi = _mm_srli_pi16(_mm_mullo_pi16(lhsHi, rhsHi), 8);

        dst[group]=_mm_packs_pu16(prodLo, prodHi);
    }
    _mm_empty();
}
Ejemplo n.º 7
0
/* Converts `samples` RGBA pixels from 16-bit unsigned components to float.
 *
 * When both pointers are 16-byte aligned, pairs of pixels (eight components)
 * are converted per step with SSE2 and scaled by u16_float; whatever is left
 * (everything, if the pointers are unaligned) is converted with scalar code
 * that multiplies by 1/65535.  Returns `samples`.
 */
static long
conv_rgba16_rgbaF (const uint16_t *src, float *dst, long samples)
{
  long done = 0;   /* 128-bit source vectors converted so far */

  if (((uintptr_t)src % 16) == 0 && ((uintptr_t)dst % 16) == 0)
    {
      const long     pairs = ((samples / 2) * 2) / 2;
      const __m128i *in    = (const __m128i*) src;
            __v4sf  *out   = (__v4sf*) dst;

      while (done < pairs)
        {
          const __m128i packed = in[done];

          /* Zero-extend the eight 16-bit values to two vectors of ints */
          const __m128i lo = _mm_unpacklo_epi16 (packed, (__m128i)_mm_setzero_ps());
          const __m128i hi = _mm_unpackhi_epi16 (packed, (__m128i)_mm_setzero_ps());

          /* int -> float */
          const __m128  flo = _mm_cvtepi32_ps (lo);
          const __m128  fhi = _mm_cvtepi32_ps (hi);

          const __v4sf first  = flo * u16_float;
          const __v4sf second = fhi * u16_float;

          out[2 * done + 0] = first;
          out[2 * done + 1] = second;

          done++;
        }
      _mm_empty();
    }

  /* Scalar tail: each vector step above covered 2 pixels * 4 components. */
  {
    const long total = 4 * samples;
    long       k;

    for (k = done * (2 * 4); k != total; k++)
      dst[k] = src[k] * (1.f / 65535);
  }

  return samples;
}
Ejemplo n.º 8
0
/*
 * Chi-squared distance of two float vectors, four elements per step:
 *
 *     sum_i (x[i] - y[i])^2 / (x[i] + FLT_MIN + y[i])
 *
 * FLT_MIN is added to the denominator so that it is never exactly zero for
 * non-negative inputs.  Despite the "aligned" in the name, the data is read
 * with unaligned loads (_mm_loadu_ps).  The 1-3 elements left after the
 * vector loop are handled by chi2_baseline_float().
 */
static inline float chi2_intrinsic_aligned_float(int n, const float* x, const float* y) {
    float result=0;
    const __m128 eps = _mm_set1_ps(FLT_MIN);
    __m128 chi2 = _mm_setzero_ps();

    for (; n>3; n-=4, x+=4, y+=4) {
        const __m128 a = _mm_loadu_ps(x);
        const __m128 b = _mm_loadu_ps(y);
        const __m128 a_plus_eps = _mm_add_ps(a,eps);
        const __m128 a_plus_b_plus_eps = _mm_add_ps(a_plus_eps,b);
        const __m128 a_minus_b = _mm_sub_ps(a,b);
        const __m128 a_minus_b_sq = _mm_mul_ps(a_minus_b, a_minus_b);
        const __m128 prod = _mm_div_ps(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_ps(chi2, prod);
    }

    // Horizontal sum of the four lanes: fold halves, then fold neighbours.
    // (With SSE3 hadd_ps could be used, but the difference is negligible.)
    const __m128 shuffle1 = _mm_shuffle_ps(chi2, chi2, _MM_SHUFFLE(1,0,3,2));
    const __m128 sum1 = _mm_add_ps(chi2, shuffle1);
    const __m128 shuffle2 = _mm_shuffle_ps(sum1, sum1, _MM_SHUFFLE(2,3,0,1));
    const __m128 sum2 = _mm_add_ps(sum1, shuffle2);

    _mm_store_ss(&result,sum2);
    _mm_empty();

    if (n)
        result += chi2_baseline_float(n, x, y);	// remaining 1-3 entries
    return result;
}
Ejemplo n.º 9
0
/**
 * Column pass of the transform: each pair of source rows (2k, 2k+1) produces
 * destination row k and destination row k + h/2.
 *
 * The first (w / 8) * 8 columns are processed eight bytes at a time through
 * addsub(); the remaining columns use scalar code that stores (a + b) / 2 in
 * the upper half of the rows and (a - b) / 2 in the lower half. If h is odd
 * the last source row is not read.
 *
 * NOTE(review): addsub() is defined elsewhere; presumably it computes the
 * same half-sum / half-difference as the scalar tail -- confirm.
 */
void Haar::transcols(char** dest, char** sour, unsigned int w, unsigned int h) const
{
        const unsigned int half = h / 2;
        const unsigned int blocks = w / 8;

        for (unsigned int row = 0; row < half; ++row) {
                __m64 *lowOut = (__m64 *) dest[row];
                __m64 *highOut = (__m64 *) dest[row + half];
                __m64 *evenRow = (__m64 *) sour[2 * row];
                __m64 *oddRow = (__m64 *) sour[2 * row + 1];

                for (unsigned int blk = 0; blk < blocks; ++blk) {
                        addsub(evenRow[blk], oddRow[blk], lowOut + blk, highOut + blk);
                }
        }
        _mm_empty();

        // Columns that do not fill a complete 8-byte block.
        const unsigned int tailStart = w - (w % 8);
        for (unsigned int col = tailStart; col < w; ++col) {
                for (unsigned int row = 0; row < half; ++row) {
                        dest[row][col] = char(((int)sour[2 * row][col] + (int)sour[2 * row + 1][col]) / 2);
                        dest[row + half][col] = char(((int)sour[2 * row][col] - (int)sour[2 * row + 1][col]) / 2);
                }
        }
}
Ejemplo n.º 10
0
/**
 * MMX dot product over NLMS_LEN 16-bit elements of a[] and b[].
 *
 * Only the high 16 bits of every product are accumulated (_mm_mulhi_pi16);
 * the low-half accumulation is disabled in the code, so the low accumulator
 * stays zero and the result equals 65536 times the sum of the four high-word
 * lane totals.
 *
 * NOTE(review): the lane accumulators are 16-bit and wrap on overflow
 * (_mm_add_pi16).
 *
 * @param a first operand, at least NLMS_LEN elements.
 * @param b second operand, at least NLMS_LEN elements.
 * @return the approximate dot product described above.
 */
long dotp(short a[], short b[])
{
  int i;
  short suml[4];   // don't init sum from C - this confuses the GCC!
  short sumh[4];

  /* mmx - Intel Pentium-MMX and above */

  // Start both accumulators at zero explicitly; the previous
  // `psubw(x, x)` idiom read uninitialized variables.
  __m64 lowAcc  = _mm_setzero_si64();
  __m64 highAcc = _mm_setzero_si64();

  for (i = 0; i < NLMS_LEN; i += 4, a += 4, b += 4) {
    const __m64 va = _m_from_WORDs(a);
    const __m64 vb = _m_from_WORDs(b);

    // Low halves of the products are intentionally not accumulated:
    //   lowAcc = _mm_add_pi16(lowAcc, _mm_mullo_pi16(va, vb));
    highAcc = _mm_add_pi16(highAcc, _mm_mulhi_pi16(va, vb));
  }
  _m_from_WORDs(suml) = lowAcc;
  _m_from_WORDs(sumh) = highAcc;
  _mm_empty();

  // Scale in `long` so the multiplication by 65536 is not done in `int`.
  return suml[0] + suml[1] + suml[2] + suml[3]
   + 65536L * (sumh[0] + sumh[1] + sumh[2] + sumh[3]);
}
Ejemplo n.º 11
0
void multadd_complex_vector_real_scalar(int16_t *x,
                                        int16_t alpha,
                                        int16_t *y,
                                        uint8_t zero_flag,
                                        uint32_t N)
{

  simd_q15_t alpha_128,*x_128=(simd_q15_t *)x,*y_128=(simd_q15_t*)y;
  int n;

  alpha_128 = set1_int16(alpha);

  if (zero_flag == 1)
    for (n=0; n<N>>2; n++) {
      y_128[n] = mulhi_int16(x_128[n],alpha_128);
    }

  else
    for (n=0; n<N>>2; n++) {
      y_128[n] = adds_int16(y_128[n],mulhi_int16(x_128[n],alpha_128));
    }

  _mm_empty();
  _m_empty();

}
Ejemplo n.º 12
0
int complex_conjugate(int16_t *x1,
                      int16_t *y,
                      uint32_t N)

{
  uint32_t i;                 // loop counter

  simd_q15_t *x1_128;
  simd_q15_t *y_128;
  int16_t x2[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1}; 
  simd_q15_t *x2_128 = (simd_q15_t*)&x2[0];
  x1_128 = (simd_q15_t *)&x1[0];
  y_128 = (simd_q15_t *)&y[0];


  // we compute 4 cpx multiply for each loop
  for(i=0; i<(N>>3); i++) {
    y_128[0] = mullo_int16(x1_128[0],*x2_128);
    y_128[1] = mullo_int16(x1_128[1],*x2_128);
    y_128[2] = mullo_int16(x1_128[2],*x2_128);
    y_128[3] = mullo_int16(x1_128[3],*x2_128);


    x1_128+=4;
    y_128 +=4;
  }


  _mm_empty();
  _m_empty();

  return(0);
}
/**
 * Feeds `len` bytes of `data` into the hash context.
 *
 * While the context's internal buffer is empty, whole 64-byte blocks are
 * passed to stage2() straight from the caller's memory. Remaining input is
 * appended to ctx->buffer, which is passed to stage2() and reset each time it
 * reaches 64 bytes.
 *
 * NOTE(review): the fast path reinterprets `data` as `uint512_u`; if stage2()
 * requires aligned input, callers must supply suitably aligned data --
 * confirm against stage2().
 *
 * @param ctx  hash context to update.
 * @param data input bytes.
 * @param len  number of input bytes.
 */
void GOST34112012Update_sse41(GOST34112012Context* ctx, const unsigned char* data, std::size_t len)
{
	// Fast path. The block pointer is derived from the *current* `data` on
	// every iteration; deriving it once before the loop would feed the first
	// block to stage2() over and over.
	while (len > 63 && ctx->bufsize == 0) {
		stage2(ctx, reinterpret_cast<const union uint512_u*>(data));

		data += 64;
		len  -= 64;
	}

	// Buffered path: top up the internal buffer and flush it when full.
	while (len) {
		std::size_t chunksize = 64 - ctx->bufsize;
		if (chunksize > len) {
			chunksize = len;
		}

		std::memcpy(&ctx->buffer.BYTE[ctx->bufsize], data, chunksize);

		ctx->bufsize += chunksize;
		len          -= chunksize;
		data         += chunksize;

		if (ctx->bufsize == 64) {
			stage2(ctx, &ctx->buffer);
			ctx->bufsize = 0;
		}
	}

	_mm_empty();
}
Ejemplo n.º 14
0
/**
 * Handles one incoming media sample while holding m_csReceive.
 *
 * Samples whose stream id is not AM_STREAM_MEDIA are delivered unchanged
 * through the output pin. Otherwise a media type attached to the sample, if
 * any, is applied to the input pin and the sample is passed to Transform().
 *
 * @return the result of Deliver() for non-media samples, the failure code of
 *         Transform() if it fails, otherwise S_OK.
 */
HRESULT CBaseVideoFilter::Receive(IMediaSample* pIn)
{
#ifndef _WIN64
    // TODOX64 : fixme!
    _mm_empty(); // just for safety
#endif

    CAutoLock cAutoLock(&m_csReceive);

    AM_SAMPLE2_PROPERTIES* const sampleProps = m_pInput->SampleProps();
    const bool isMediaStream = (sampleProps->dwStreamId == AM_STREAM_MEDIA);
    if (!isMediaStream) {
        return m_pOutput->Deliver(pIn);
    }

    // Apply a media type carried by the sample, then release it.
    AM_MEDIA_TYPE* attachedType;
    if (SUCCEEDED(pIn->GetMediaType(&attachedType)) && attachedType) {
        CMediaType mt(*attachedType);
        m_pInput->SetMediaType(&mt);
        DeleteMediaType(attachedType);
    }

    const HRESULT hr = Transform(pIn);
    return FAILED(hr) ? hr : S_OK;
}
Ejemplo n.º 15
0
/* Combines unpack and accumulate: out[i] += (float)in[i] for i in [0, n).
 *
 * With FOLD_USE_INTRINSICS the work is done four elements at a time with
 * MMX/SSE conversions (16 per outer step, with a prefetch per step), followed
 * by a 4-wide loop and a scalar tail. Without it a plain C loop is used.
 *
 * The four input bytes of each group are gathered with _mm_setr_pi8 rather
 * than by dereferencing `in` as a 64-bit value: a 64-bit load touches eight
 * bytes and would read up to four bytes past the end of `in` on the final
 * group.
 */
void vector_accumulate_8bit(float *out, const char *in, int n) {
#ifdef FOLD_USE_INTRINSICS
    const int n16 = n & -16;  /* n rounded down to a multiple of 16 */
    const int n4  = n & -4;   /* n rounded down to a multiple of 4  */
    __m128 in_, out_, tmp_;
    float ftmp;
    int ii, k;

    for (ii = 0 ; ii < n16 ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in  + 64, 0, 0);

        /* Four groups of four elements per outer step. */
        for (k = 0 ; k < 4 ; k++) {
            out_ = _MM_LOAD_PS(out);
            in_ = _mm_cvtpi8_ps(_mm_setr_pi8(in[0], in[1], in[2], in[3], 0, 0, 0, 0));
            tmp_ = _mm_add_ps(out_, in_);
            _MM_STORE_PS(out, tmp_);
            in  += 4;
            out += 4;
        }
    }
    for (; ii < n4 ; ii += 4) {
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(_mm_setr_pi8(in[0], in[1], in[2], in[3], 0, 0, 0, 0));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in  += 4;
        out += 4;
    }
    for (; ii < n ; ii++) {  /* Cast these without intrinsics */
        ftmp = (float)(*in);
        out_ = _mm_load_ss(out);
        in_ = _mm_load_ss(&ftmp);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in  += 1;
        out += 1;
    }
    /* _mm_cvtpi8_ps uses MMX registers; clear that state before returning. */
    _mm_empty();
#else
    int i;
    for (i=0; i<n; i++) { out[i] += (float)in[i]; }
#endif
}
Ejemplo n.º 16
0
///////////////////////////////////////////////transforms/////////////////////////////////////////////////////////////////////
void Haar::transrows(char** dest, char** sour, unsigned int w, unsigned int h) const
{
        unsigned int w2 = w / 2;

        __m64 m00FF;
        m00FF.m64_u64 = 0x00FF00FF00FF00FF;

        for (unsigned int y = 0; y < h; y++) {

                __m64 *mlo = (__m64 *) & dest[y][0];
                __m64 *mhi = (__m64 *) & dest[y][w2];
                __m64 *msour = (__m64 *) & sour[y][0];

                for (unsigned int k = 0; k < w2 / 8; k++) {   //k<w2/8   k=8*k

                        __m64 even = _mm_packs_pu16(_mm_and_si64(*msour, m00FF), _mm_and_si64(*(msour + 1), m00FF));       //even coeffs
                        __m64 odd = _mm_packs_pu16(_mm_srli_pi16(*msour, 8), _mm_srli_pi16(*(msour + 1), 8));              //odd coeffs

                        addsub(even, odd, mlo++, mhi++);
                        msour += 2;
                }

                if (w2 % 8) {
                        for (unsigned int k = w2 - (w2 % 8); k < w2; k++) {
                                dest[y][k] = char(((int)sour[y][2*k] + (int)sour[y][2*k+1]) / 2);
                                dest[y][k+w2] = char(((int)sour[y][2*k] - (int)sour[y][2*k+1]) / 2);
                        }
                }
        }
        _mm_empty();
}
Ejemplo n.º 17
0
/**
 * Bookkeeping for one encoded frame: sets params.outlength, forwards the
 * frame to the muxer (if one is set), updates the key-frame spacing, total
 * size and frame counters, records statistics and, if a graph window handle
 * is set, posts a frame message to it.
 *
 * @return ICERR_OK in all cases.
 */
STDMETHODIMP TffdshowEnc::deliverEncodedSample(const TmediaSample &sample, TencFrameParams &params)
{
    _mm_empty();

    // The output length is only reported when storeAVI is set.
    if (coSettings->storeAVI) {
        params.outlength = params.length;
    } else {
        params.outlength = 0;
    }

    if (mux) {
        mux->writeFrame(sample, params.length, params);
    }

    // A key frame restarts the spacing counter (it is incremented below).
    if (params.keyframe) {
        keyspacing = 0;
    }

    totalsize += params.length;

    const bool debugRequested = outputdebug || outputdebugfile;
    if (debugRequested) {
        dbgWrite(_l("1st-pass: size:%d total-kbytes:%d %s quant:%d %s kblocks:%d mblocks:%d\n"), params.length, int(totalsize / 1024), FRAME_TYPE::name(params.frametype), params.quant, encQuantTypes[params.quanttype], params.kblks, params.mblks);
    }

    ++params.framenum;
    ++keyspacing;

    encStats.add(params);

    // Quantizer is packed above bit 20, frame type in the low bits.
    if (h_graph) {
        PostMessage(h_graph, TencStats::MSG_FF_FRAME, params.length, (params.quant << 20) | params.frametype);
    }

    return ICERR_OK;
}
Ejemplo n.º 18
0
/*
 * Chi-squared distance of two double vectors, two elements per step:
 *
 *     sum_i (x[i] - y[i])^2 / (x[i] + y[i] + DBL_MIN)
 *
 * DBL_MIN is added to the denominator so that it is never exactly zero for
 * non-negative inputs.  Data is read with unaligned loads.  A single
 * left-over element is handled by chi2_baseline_double().
 */
static inline double chi2_intrinsic_double(int n, const double* x, const double* y) {
    double result=0;
    const __m128d eps = _mm_set1_pd(DBL_MIN);
    __m128d chi2 = _mm_setzero_pd();

    for ( ; n>1; n-=2, x+=2, y+=2) {
        const __m128d a = _mm_loadu_pd(x);
        const __m128d b = _mm_loadu_pd(y);
        const __m128d a_plus_b = _mm_add_pd(a,b);
        const __m128d a_plus_b_plus_eps = _mm_add_pd(a_plus_b,eps);
        const __m128d a_minus_b = _mm_sub_pd(a,b);
        const __m128d a_minus_b_sq = _mm_mul_pd(a_minus_b, a_minus_b);
        const __m128d quotient = _mm_div_pd(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_pd(chi2, quotient);
    }

    // Horizontal sum of the two lanes.
    // (With SSE3 hadd_pd could be used, but the difference is negligible.)
    const __m128d shuffle = _mm_shuffle_pd(chi2, chi2, _MM_SHUFFLE2(0,1));
    const __m128d sum = _mm_add_pd(chi2, shuffle);

    _mm_store_sd(&result,sum);
    _mm_empty();
    if (n)
        result += chi2_baseline_double(n, x, y); // remaining entries
    return result;
}
Ejemplo n.º 19
0
/**
 * Replaces `image` with a byte-wise absolute difference of `image` and
 * `right`, after biasing the even-indexed bytes: 0x40 is added (saturating)
 * to the left operand and subtracted (saturating) from the right operand.
 *
 * The buffers are walked from the last 8-byte group down to the first; the
 * group count is rounded up to whole 8-byte groups.
 */
void pix_diff :: processYUV_MMX (imageStruct &image, imageStruct &right){
  const int byteCount = image.xsize * image.ysize * image.csize;
  __m64*dst = (__m64*)image.data;
  __m64*src = (__m64*)right.data;

  int group = byteCount/sizeof(__m64)+(byteCount%sizeof(__m64)!=0);

  // 0x40 in every even-indexed byte, 0 elsewhere.
  const __m64 bias = _mm_setr_pi8(0x40, 0x00, 0x40, 0x00,
                                  0x40, 0x00, 0x40, 0x00);

  while (group--) {
    const __m64 lhs = _mm_adds_pu8(dst[group], bias);
    const __m64 rhs = _mm_subs_pu8(src[group], bias);

    // |lhs - rhs|: one of the two saturating differences is zero,
    // so OR-ing them yields the absolute difference.
    const __m64 lhsMinusRhs = _mm_subs_pu8(lhs, rhs);
    const __m64 rhsMinusLhs = _mm_subs_pu8(rhs, lhs);

    dst[group] = _mm_or_si64(lhsMinusRhs, rhsMinusLhs);
  }
  _mm_empty();
}
Ejemplo n.º 20
0
/*
 * Copies nb_rb * 3 128-bit vectors from rxdataF_comp[0], starting at the
 * offset of `symbol` (symbol * N_RB_DL * 12 ints), to the running output
 * cursor llr128U.
 *
 * The cursor is reset to ulsch_llr when symbol == 0 and otherwise continues
 * from where the previous call stopped.
 *
 * Returns 0 on success, -1 if the output cursor is null.
 */
int ulsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms,
		   int **rxdataF_comp,
		   short *ulsch_llr,
		   unsigned char symbol,
		   unsigned short nb_rb) {

  __m128i *src = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  const int vectors = nb_rb*3;
  int k;

  /* The first symbol (re)starts the output stream. */
  if (symbol == 0) {
    llr128U = (__m128i*)ulsch_llr;
  }

  if (llr128U == 0) {
    msg("ulsch_qpsk_llr: llr is null, symbol %d, llr128=%p\n",symbol, llr128U);
    return(-1);
  }

  for (k = 0; k < vectors; k++) {
    llr128U[0] = src[k];
    llr128U++;
  }

  _mm_empty();
  _m_empty();

  return(0);

}
Ejemplo n.º 21
0
/**
 * Subtracts `right` from `image` byte-wise and stores the result in `image`.
 *
 * Bytes are widened to 16-bit lanes, 0x80 is added to every even-indexed
 * lane of the left operand, the right operand is subtracted with unsigned
 * saturation, and the lanes are packed back to bytes with unsigned
 * saturation.
 *
 * The buffers are walked from the last 8-byte group down to the first; the
 * group count is rounded up to whole 8-byte groups.
 */
void pix_subtract :: processYUV_MMX (imageStruct &image, imageStruct &right){
  const int byteCount = image.xsize * image.ysize * image.csize;
  __m64*dst = (__m64*)image.data;
  __m64*src = (__m64*)right.data;

  int group = byteCount/sizeof(__m64)+(byteCount%sizeof(__m64)!=0);

  const __m64 zero = _mm_setzero_si64();
  // 0x80 in every even-indexed 16-bit lane.
  const __m64 bias = _mm_setr_pi16(0x80, 0x00, 0x80, 0x00);

  while (group--) {
    const __m64 lhs = dst[group];
    const __m64 rhs = src[group];

    // Widen to 16-bit lanes (low half and high half of the 8 bytes).
    const __m64 lhsLo = _mm_unpacklo_pi8 (lhs, zero);
    const __m64 rhsLo = _mm_unpacklo_pi8 (rhs, zero);
    const __m64 lhsHi = _mm_unpackhi_pi8 (lhs, zero);
    const __m64 rhsHi = _mm_unpackhi_pi8 (rhs, zero);

    // (left + bias) - right, with unsigned saturation at each step.
    const __m64 diffLo = _mm_subs_pu16(_mm_adds_pu16(lhsLo, bias), rhsLo);
    const __m64 diffHi = _mm_subs_pu16(_mm_adds_pu16(lhsHi, bias), rhsHi);

    dst[group] = _mm_packs_pu16(diffLo, diffHi);
  }
  _mm_empty();
}
Ejemplo n.º 22
0
// Fills numBytes bytes at pDest with the repeating 32-bit pattern `value`,
// using MMX non-temporal stores (movntq) from 32-bit x86 inline assembly.
//
// Preconditions (asserted): numBytes is a multiple of 16 and pDest is
// 8-byte aligned. Nothing is written when numBytes is 0.
//
// NOTE(review): `movq mm0, value` loads 64 bits from a 32-bit parameter; the
// following punpckldq keeps only the low 32 bits, duplicated into both
// halves of mm0.
// NOTE(review): the pointer is cast to uint32_t, so this is 32-bit only.
void memset32(void *pDest, uint32_t value, size_t numBytes)
{
	// numBytes must be a multiple of 16 -- use memset() for general purpose
	VIZ_ASSERT(!(numBytes & 15));

	// an 8-byte boundary guarantees correctly aligned writes
	VIZ_ASSERT(!(reinterpret_cast<uint32_t>(pDest) & 7));

	if (numBytes>0)
	{
		__asm
		{
			// edi = destination cursor, mm0 = pattern in both 32-bit halves,
			// ecx = number of 16-byte steps
			mov       edi, pDest
			movq      mm0, value
			punpckldq mm0, mm0
			mov       ecx, numBytes
			shr       ecx, 4
		_loop:
			// two 8-byte non-temporal stores per iteration
			movntq    [edi], mm0
			movntq    [edi+8], mm0
			add       edi, 16
			dec       ecx
			jnz       _loop
		}		

		// Clear the MMX state after the MMX instructions above.
		_mm_empty();
	}
	// NOTE(review): the closing brace of this function is missing at this
	// point in the file.
Ejemplo n.º 23
0
// Test post-process step: adds 50 (wrapping byte add, paddb) to every byte
// of the frame rows assigned to this thread, 8 bytes per inner step.
//
// Rows are interleaved between threads: this thread handles rows
// threadIndex, threadIndex + threadCount, ... The `depth` parameter is not
// used.
//
// NOTE(review): mm2 is loaded once before the loops and relied upon inside
// them; this assumes nothing between the asm blocks touches MMX registers.
// NOTE(review): `row += 2` advances 8 bytes only if PURESOFTBGRA is 4 bytes
// wide -- confirm.
void PP_Test::process(int threadIndex, int threadCount, PuresoftFBO* frame, PuresoftFBO* depth)
{
	// buffer entry for this thread
	uintptr_t frameBuffer = (uintptr_t)frame->getBuffer();
	int scanline = frame->getScanline();
	frameBuffer += threadIndex * scanline;

	// mm2 = eight bytes of value 50, the per-byte increment
	const unsigned char f[] = {50,50,50,50,50,50,50,50};
	__asm{
		lea eax,f
		movq mm2,[eax]
	}

	for(int y = threadIndex; 
		y < frame->getHeight(); 
		y += threadCount)
	{
		PURESOFTBGRA* row = (PURESOFTBGRA*)frameBuffer;
		for(int x = 0; x < frame->getWidth(); x+=2)
		{
			// Load 8 bytes at `row`, add mm2 byte-wise, store them back with a
			// non-temporal store. mm1 is set to 1 here but is not read again
			// in this function.
			__asm{
				mov eax,1
				movd mm1,eax
				mov edx,row
				movq mm0,[edx]
				paddb mm0,mm2
				movntq [edx],mm0
			}
			row+=2;
		}

		// skip ahead to this thread's next row
		frameBuffer += scanline * threadCount;
	}
	// Clear the MMX state after the MMX instructions above.
	_mm_empty();
}
Ejemplo n.º 24
0
// Computes the average channel level: the sum of squares of all 16-bit
// samples in dl_ch over N_RB_DL blocks of three 128-bit vectors, divided by
// N_RB_DL * 12.
//
// The accumulation is carried out in avg128F, which is declared outside this
// function; its four 32-bit lanes are summed at the end.
int dl_channel_level(s16 *dl_ch,
		     LTE_DL_FRAME_PARMS *frame_parms) {

  __m128i *block = (__m128i *)dl_ch;
  s16 rb;
  int k;
  int *lanes;
  int avg;

  // clear the accumulator
  avg128F = _mm_xor_si128(avg128F,avg128F);

  for (rb=0;rb<frame_parms->N_RB_DL;rb++) {

    // madd squares the samples and adds adjacent pairs into 32-bit lanes.
    for (k=0;k<3;k++) {
      avg128F = _mm_add_epi32(avg128F,_mm_madd_epi16(block[k],block[k]));
    }

    block+=3;

  }

  // horizontal sum of the four 32-bit lanes, then normalise
  lanes = (int*)&avg128F;
  avg = (lanes[0] +
	 lanes[1] +
	 lanes[2] +
	 lanes[3])/(frame_parms->N_RB_DL*12);

  _mm_empty();
  _m_empty();

  return(avg);
}
Ejemplo n.º 25
0
// Replaces the even-indexed bytes of every row of `src` with the bytes at
// the same positions in `luma`, leaving the odd-indexed bytes of `src`
// untouched (in YUY2 the even bytes carry luma and the odd bytes chroma).
//
// `width` is measured in bytes: the first width / 8 * 8 bytes of a row are
// processed eight at a time with MMX, the rest two bytes per step in C.
static void replace_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch, int luma_pitch,int width, int height)
{
  const int vector_width = width / 8 * 8;
  const __m64 keep_luma = _mm_set1_pi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  const __m64 keep_chroma = _mm_set1_pi16(0xFF00);
#pragma warning(pop)

  BYTE *dst_row = src;
  const BYTE *luma_row = luma;

  for (int row = 0; row < height; ++row) {
    int col = 0;

    // vector part: merge chroma of the destination with luma of the source
    while (col < vector_width) {
      const __m64 dst_pixels = *reinterpret_cast<const __m64*>(dst_row+col);
      const __m64 luma_pixels = *reinterpret_cast<const __m64*>(luma_row+col);

      const __m64 chroma_part = _mm_and_si64(dst_pixels, keep_chroma);
      const __m64 luma_part = _mm_and_si64(luma_pixels, keep_luma);

      *reinterpret_cast<__m64*>(dst_row+col) = _mm_or_si64(chroma_part, luma_part);
      col += 8;
    }

    // scalar tail: copy every second byte
    for (col = vector_width; col < width; col += 2) {
      dst_row[col] = luma_row[col];
    }

    dst_row += pitch;
    luma_row += luma_pitch;
  }
  _mm_empty();
}
Ejemplo n.º 26
0
// Runs the transform on a block of bytes: the source bytes are converted to
// floats in fSrc, the FFTW r2r plan `dctplan` is executed from fSrc into
// fSrcDCT, and the result is converted back to bytes in dctp.
//
// srcp / src_pitch describe the input block, dctp / dct_pitch the output.
void DCTFFTW::DCTBytes2D(const unsigned char *srcp, int src_pitch, unsigned char *dctp, int dct_pitch)
{
	// Presumably clears MMX state left by earlier code before the
	// floating-point work below -- confirm against callers.
	_mm_empty ();
	Bytes2Float (srcp, src_pitch, fSrc);
	fftwf_execute_r2r_addr(dctplan, fSrc, fSrcDCT);
	Float2Bytes (dctp, dct_pitch, fSrcDCT);

}
/* Exercise the 64-bit (__m64) form: apply _mm_maddubs_pi16 to the 64-bit
   values at i1 and i2 and store the 64-bit result at r.  */
static void
ssse3_test_pmaddubsw (int *i1, int *i2, int *r)
{
  const __m64 lhs = *(__m64 *) i1;
  const __m64 rhs = *(__m64 *) i2;
  __m64 *out = (__m64 *) r;

  *out = _mm_maddubs_pi16 (lhs, rhs);
  _mm_empty ();
}
Ejemplo n.º 28
0
/* Exercise the 64-bit (__m64) form: apply _mm_shuffle_pi8 to the 64-bit
   values at i1 and i2 and store the 64-bit result at r.  */
static void
ssse3_test_pshufb (int *i1, int *i2, int *r)
{
    const __m64 lhs = *(__m64 *) i1;
    const __m64 rhs = *(__m64 *) i2;
    __m64 *out = (__m64 *) r;

    *out = _mm_shuffle_pi8 (lhs, rhs);
    _mm_empty ();
}
Ejemplo n.º 29
0
// Processes the image in vertical strips: calc_lbp_16_strip() is called for
// x = 0, 14, 28, ... while x < src->width, then the MMX state is cleared.
//
// NOTE(review): strips advance by 14 columns although the helper is named
// "16_strip" -- presumably adjacent strips overlap by one border column on
// each side; confirm against calc_lbp_16_strip().
void calc_LBP11_sse(IplImage * src, IplImage * dst)
{
    int stripStart = 0;
    while (stripStart < src->width)
    {
        calc_lbp_16_strip(src, dst, stripStart);
        stripStart += 14;
    }
    _mm_empty();
}
Ejemplo n.º 30
0
/* Exercise the 64-bit (__m64) form: apply _mm_hadd_pi32 to the 64-bit
   values at i1 and i2 and store the 64-bit result at r.  */
static void
ssse3_test_phaddd (int *i1, int *i2, int *r)
{
  const __m64 lhs = *(__m64 *) i1;
  const __m64 rhs = *(__m64 *) i2;
  __m64 *out = (__m64 *) r;

  *out = _mm_hadd_pi32 (lhs, rhs);
  _mm_empty();
}