void pix_offset :: processRGBAMMX(imageStruct &image)
{
  char R = m_offset[chRed];
  char G = m_offset[chGreen];
  char B = m_offset[chBlue];
  char A = m_offset[chAlpha];

  register int pixsize = (image.ysize * image.xsize)>>1;
  register __m64 offset_64 = _mm_setr_pi8(R, G, B, A, R, G, B, A);
  register __m64 *data_p = (__m64*)image.data;

  _mm_empty();
  if(m_saturate) {
    while(pixsize--) {
      data_p[0] = _mm_adds_pu8(data_p[0], offset_64);
      data_p++;
    }
  } else {
    while(pixsize--) {
      data_p[0] = _mm_add_pi8(data_p[0], offset_64);
      data_p++;
    }
  }
  _mm_empty();
}
unsigned int mmx_hash_bucket_data(unsigned char *key, int size, int NoOfItems)
{
  char *p, *end;
  __m64 v1, v2, s;
  int val;

  if (size < 8) return(fnv_data2bucket(key, size, NoOfItems));

  p = key;
  end = key + size;
  _mm_empty();  // emms
  v1 = _mm_set1_pi32(FNV_INIT_VAL);

  while ((end-p) > 7)
  {
    v2 = _mm_setr_pi32(*p, *(p+4));
    v1 = _mm_add_pi16(v1, v2);
    v1 = _mm_slli_pi32(v1, 3);
    p += 8;
  }

  val = _mm_cvtsi64_si32(v1);
  _mm_empty();  // emms

  if (val < 0) val = 1 - val;
  val = val % NoOfItems;

  return(val);
}
void OL_BlendImage::BlendImageMask(Image444* base, Image444* overlay, Image444* mask)
{
  BYTE* baseY = base->GetPtr(PLANAR_Y);
  BYTE* baseU = base->GetPtr(PLANAR_U);
  BYTE* baseV = base->GetPtr(PLANAR_V);

  BYTE* ovY = overlay->GetPtr(PLANAR_Y);
  BYTE* ovU = overlay->GetPtr(PLANAR_U);
  BYTE* ovV = overlay->GetPtr(PLANAR_V);

  BYTE* maskY = mask->GetPtr(PLANAR_Y);
  BYTE* maskU = mask->GetPtr(PLANAR_U);
  BYTE* maskV = mask->GetPtr(PLANAR_V);

  int w = base->w();
  int h = base->h();

  if (opacity == 256) {
    if (env->GetCPUFlags() & CPUF_SSE2) {
      overlay_blend_sse2_plane_masked(baseY, ovY, maskY, base->pitch, overlay->pitch, mask->pitch, w, h);
      overlay_blend_sse2_plane_masked(baseU, ovU, maskU, base->pitch, overlay->pitch, mask->pitch, w, h);
      overlay_blend_sse2_plane_masked(baseV, ovV, maskV, base->pitch, overlay->pitch, mask->pitch, w, h);
    } else
#ifdef X86_32
    if (env->GetCPUFlags() & CPUF_MMX) {
      overlay_blend_mmx_plane_masked(baseY, ovY, maskY, base->pitch, overlay->pitch, mask->pitch, w, h);
      overlay_blend_mmx_plane_masked(baseU, ovU, maskU, base->pitch, overlay->pitch, mask->pitch, w, h);
      overlay_blend_mmx_plane_masked(baseV, ovV, maskV, base->pitch, overlay->pitch, mask->pitch, w, h);
      _mm_empty();
    } else
#endif
    {
      overlay_blend_c_plane_masked(baseY, ovY, maskY, base->pitch, overlay->pitch, mask->pitch, w, h);
      overlay_blend_c_plane_masked(baseU, ovU, maskU, base->pitch, overlay->pitch, mask->pitch, w, h);
      overlay_blend_c_plane_masked(baseV, ovV, maskV, base->pitch, overlay->pitch, mask->pitch, w, h);
    }
  } else {
    if (env->GetCPUFlags() & CPUF_SSE2) {
      overlay_blend_sse2_plane_masked_opacity(baseY, ovY, maskY, base->pitch, overlay->pitch, mask->pitch, w, h, opacity);
      overlay_blend_sse2_plane_masked_opacity(baseU, ovU, maskU, base->pitch, overlay->pitch, mask->pitch, w, h, opacity);
      overlay_blend_sse2_plane_masked_opacity(baseV, ovV, maskV, base->pitch, overlay->pitch, mask->pitch, w, h, opacity);
    } else
#ifdef X86_32
    if (env->GetCPUFlags() & CPUF_MMX) {
      overlay_blend_mmx_plane_masked_opacity(baseY, ovY, maskY, base->pitch, overlay->pitch, mask->pitch, w, h, opacity);
      overlay_blend_mmx_plane_masked_opacity(baseU, ovU, maskU, base->pitch, overlay->pitch, mask->pitch, w, h, opacity);
      overlay_blend_mmx_plane_masked_opacity(baseV, ovV, maskV, base->pitch, overlay->pitch, mask->pitch, w, h, opacity);
      _mm_empty();
    } else
#endif
    {
      overlay_blend_c_plane_masked_opacity(baseY, ovY, maskY, base->pitch, overlay->pitch, mask->pitch, w, h, opacity);
      overlay_blend_c_plane_masked_opacity(baseU, ovU, maskU, base->pitch, overlay->pitch, mask->pitch, w, h, opacity);
      overlay_blend_c_plane_masked_opacity(baseV, ovV, maskV, base->pitch, overlay->pitch, mask->pitch, w, h, opacity);
    }
  }
}
extern "C" void __cdecl memcpyMMX(void *Dest, void *Src, size_t nBytes) { _mm_empty(); __asm { mov esi, dword ptr[Src] mov edi, dword ptr[Dest] mov ecx, nBytes shr ecx, 6 // nBytes / 64 cmp ecx, 0 je Myloop align 8 CopyLoop: movq mm0, qword ptr[esi] movq mm1, qword ptr[esi+8*1] movq mm2, qword ptr[esi+8*2] movq mm3, qword ptr[esi+8*3] movq mm4, qword ptr[esi+8*4] movq mm5, qword ptr[esi+8*5] movq mm6, qword ptr[esi+8*6] movq mm7, qword ptr[esi+8*7] movq qword ptr[edi], mm0 movq qword ptr[edi+8*1], mm1 movq qword ptr[edi+8*2], mm2 movq qword ptr[edi+8*3], mm3 movq qword ptr[edi+8*4], mm4 movq qword ptr[edi+8*5], mm5 movq qword ptr[edi+8*6], mm6 movq qword ptr[edi+8*7], mm7 add esi, 64 add edi, 64 loop CopyLoop // emms align 8 Myloop: mov ecx, nBytes and ecx, 63 cmp ecx, 0 je EndCopyLoop ;align 8 ;CopyLoop2: mov dl, byte ptr[esi] mov byte ptr[edi], dl ; inc esi ; inc edi ; dec ecx ; jne CopyLoop2 rep movsb EndCopyLoop: } _mm_empty(); }
void pix_offset :: processYUVMMX(imageStruct &image)
{
  // YUV422: 8 bytes cover 4 pixels, hence the >>2
  register int pixsize = (image.ysize * image.xsize)>>2;

  // U, Y, V are the per-channel offsets, set up elsewhere
  // (presumably from m_offset, as in the RGBA variant above)
  register __m64 offset_64 = _mm_setr_pi8(U, Y, V, Y, U, Y, V, Y);
  register __m64 *data_p = (__m64*)image.data;

  _mm_empty();
  while(pixsize--) {
    data_p[0] = _mm_add_pi8(data_p[0], offset_64);
    data_p++;
  }
  _mm_empty();
}
void pix_multiply :: processRGBA_MMX(imageStruct &image, imageStruct &right)
{
  int datasize = image.xsize * image.ysize * image.csize;
  __m64 *leftPix  = (__m64*)image.data;
  __m64 *rightPix = (__m64*)right.data;
  datasize = datasize/sizeof(__m64) + (datasize%sizeof(__m64) != 0);

  __m64 l0, r0, l1, r1;
  __m64 null64 = _mm_setzero_si64();

  while(datasize--) {
    l1 = leftPix [datasize];
    r1 = rightPix[datasize];

    l0 = _mm_unpacklo_pi8(l1, null64);
    r0 = _mm_unpacklo_pi8(r1, null64);
    l1 = _mm_unpackhi_pi8(l1, null64);
    r1 = _mm_unpackhi_pi8(r1, null64);

    l0 = _mm_mullo_pi16(l0, r0);
    l1 = _mm_mullo_pi16(l1, r1);

    l0 = _mm_srli_pi16(l0, 8);
    l1 = _mm_srli_pi16(l1, 8);

    leftPix[datasize] = _mm_packs_pu16(l0, l1);
  }
  _mm_empty();
}
static long conv_rgba16_rgbaF (const uint16_t *src, float *dst, long samples)
{
  long i = 0;

  if (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0)
    {
      long n = (samples / 2) * 2;
      const __m128i *s = (const __m128i*) src;
            __v4sf  *d = (__v4sf*) dst;

      for (; i < n / 2; i++)
        {
          /* Expand shorts to ints by loading zero in the high bits */
          const __m128i t0 = _mm_unpacklo_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());
          const __m128i t1 = _mm_unpackhi_epi16 (s[i + 0], (__m128i)_mm_setzero_ps());

          /* Convert to float */
          const __m128 u0 = _mm_cvtepi32_ps (t0);
          const __m128 u1 = _mm_cvtepi32_ps (t1);

          const __v4sf rgba0 = u0 * u16_float;
          const __v4sf rgba1 = u1 * u16_float;

          d[2 * i + 0] = rgba0;
          d[2 * i + 1] = rgba1;
        }
      _mm_empty();
    }

  for (i *= 2 * 4; i != 4 * samples; i++)
    dst[i] = src[i] * (1.f / 65535);

  return samples;
}
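The conversion above multiplies by a file-scope constant u16_float that is not part of the snippet; given the scalar tail uses 1.f / 65535, it is presumably a 4-wide vector of that scale factor. A minimal reconstruction (an assumption, not the original definition) would be:

/* Assumed definition of the missing scale constant: four copies of 1/65535,
   matching the scalar fallback loop above. */
static const __v4sf u16_float = { 1.0f / 65535, 1.0f / 65535,
                                  1.0f / 65535, 1.0f / 65535 };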
/* use compiler intrinsics for 4x parallel processing */
static inline float chi2_intrinsic_aligned_float(int n, const float* x, const float* y)
{
    float result = 0;
    const __m128 eps  = _mm_set1_ps(FLT_MIN);
    const __m128 zero = _mm_setzero_ps();
    __m128 chi2 = _mm_setzero_ps();

    for (; n > 3; n -= 4) {
        const __m128 a = _mm_loadu_ps(x);
        const __m128 b = _mm_loadu_ps(y);
        const __m128 a_plus_eps        = _mm_add_ps(a, eps);
        const __m128 a_plus_b_plus_eps = _mm_add_ps(a_plus_eps, b);
        const __m128 a_minus_b         = _mm_sub_ps(a, b);
        const __m128 a_minus_b_sq      = _mm_mul_ps(a_minus_b, a_minus_b);
        const __m128 prod              = _mm_div_ps(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_ps(chi2, prod);
        x += 4;
        y += 4;
    }

    const __m128 shuffle1 = _mm_shuffle_ps(chi2, chi2, _MM_SHUFFLE(1,0,3,2));
    const __m128 sum1     = _mm_add_ps(chi2, shuffle1);
    const __m128 shuffle2 = _mm_shuffle_ps(sum1, sum1, _MM_SHUFFLE(2,3,0,1));
    const __m128 sum2     = _mm_add_ps(sum1, shuffle2);
    // with SSE3, we could use hadd_ps, but the difference is negligible

    _mm_store_ss(&result, sum2);
    _mm_empty();
    if (n)
        result += chi2_baseline_float(n, x, y);  // remaining 1-3 entries
    return result;
}
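The comment above notes that SSE3's hadd_ps could replace the two shuffle/add steps of the horizontal reduction. A minimal sketch of that alternative, assuming SSE3 is available (this helper is illustrative, not part of the original source):

#include <pmmintrin.h>  /* SSE3 */

/* Horizontal sum of the four floats in v using haddps. */
static inline float hsum_ps_sse3(__m128 v)
{
    __m128 t = _mm_hadd_ps(v, v);   /* (v0+v1, v2+v3, v0+v1, v2+v3) */
    t = _mm_hadd_ps(t, t);          /* every lane now holds v0+v1+v2+v3 */
    return _mm_cvtss_f32(t);
}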
void Haar::transcols(char** dest, char** sour, unsigned int w, unsigned int h) const
{
    unsigned int h2 = h / 2;
    for (unsigned int k = 0; k < h2; k++) {
        __m64 *mlo  = (__m64 *) & dest[k][0];
        __m64 *mhi  = (__m64 *) & dest[k+h2][0];
        __m64 *even = (__m64 *) & sour[2*k][0];
        __m64 *odd  = (__m64 *) & sour[2*k+1][0];
        for (unsigned int x = 0; x < w / 8; x++) {
            addsub(*even, *odd, mlo, mhi);
            even++;
            odd++;
            mlo++;
            mhi++;
        }
    }
    _mm_empty();

    //odd remainder
    for (unsigned int x = w - (w % 8); x < w; x++) {
        for (unsigned int k = 0; k < h2; k++) {
            dest[k][x]    = char(((int)sour[2*k][x] + (int)sour[2*k+1][x]) / 2);
            dest[k+h2][x] = char(((int)sour[2*k][x] - (int)sour[2*k+1][x]) / 2);
        }
    }
}
long dotp(short a[], short b[])
{
    int i;
    __m64 mm0, mm1, mm2, mm3, mm4;
    short suml[4];  // don't init sum from C - this confuses the GCC!
    short sumh[4];

    /* mmx - Intel Pentium-MMX and above */
    mm2 = _m_psubw(mm2, mm2);  // set mm2 to 0
    mm4 = _m_psubw(mm4, mm4);
    for (i = 0; i < NLMS_LEN; i += 4, a += 4, b += 4) {
        mm0 = _m_from_WORDs(a);
        mm3 = mm0;
        mm1 = _m_from_WORDs(b);
        /* Intel notation: first operand is destination */
        /* GNU as notation: first operand is source */
        mm0 = _mm_mullo_pi16 (mm0, mm1);   // low 16 bits of each product
        mm3 = _mm_mulhi_pi16 (mm3, mm1);   // high 16 bits of each product
        mm2 = _mm_add_pi16(mm2, mm0);
        mm4 = _mm_add_pi16(mm4, mm3);
    }
    _m_from_WORDs(suml) = mm2;
    _m_from_WORDs(sumh) = mm4;
    _mm_empty();
    // recombine the low and high partial sums into a 32-bit result
    return suml[0] + suml[1] + suml[2] + suml[3]
         + 65536 * (sumh[0] + sumh[1] + sumh[2] + sumh[3]);
}
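For reference, the dot product that the loop above vectorizes can be written in plain C as below. The MMX version accumulates the low and high halves of each product in 16-bit lanes, which can wrap, so the two forms are not bit-exact in general (this scalar helper is illustrative, not part of the original source):

/* Scalar reference dot product over NLMS_LEN 16-bit samples. */
long dotp_scalar(const short a[], const short b[])
{
    long sum = 0;
    int i;
    for (i = 0; i < NLMS_LEN; i++)
        sum += (long)a[i] * (long)b[i];
    return sum;
}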
void multadd_complex_vector_real_scalar(int16_t *x, int16_t alpha, int16_t *y, uint8_t zero_flag, uint32_t N)
{
  simd_q15_t alpha_128, *x_128 = (simd_q15_t *)x, *y_128 = (simd_q15_t *)y;
  int n;

  alpha_128 = set1_int16(alpha);

  if (zero_flag == 1)
    for (n = 0; n < N>>2; n++) {
      y_128[n] = mulhi_int16(x_128[n], alpha_128);
    }
  else
    for (n = 0; n < N>>2; n++) {
      y_128[n] = adds_int16(y_128[n], mulhi_int16(x_128[n], alpha_128));
    }

  _mm_empty();
  _m_empty();
}
int complex_conjugate(int16_t *x1, int16_t *y, uint32_t N)
{
  uint32_t i;                 // loop counter
  simd_q15_t *x1_128;
  simd_q15_t *y_128;
  int16_t x2[8] __attribute__((aligned(16))) = {1,-1,1,-1,1,-1,1,-1};
  simd_q15_t *x2_128 = (simd_q15_t*)&x2[0];

  x1_128 = (simd_q15_t *)&x1[0];
  y_128  = (simd_q15_t *)&y[0];

  // we compute 4 cpx multiply for each loop
  for (i = 0; i < (N>>3); i++) {
    y_128[0] = mullo_int16(x1_128[0], *x2_128);
    y_128[1] = mullo_int16(x1_128[1], *x2_128);
    y_128[2] = mullo_int16(x1_128[2], *x2_128);
    y_128[3] = mullo_int16(x1_128[3], *x2_128);

    x1_128 += 4;
    y_128  += 4;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
void GOST34112012Update_sse41(GOST34112012Context* ctx, const unsigned char* data, std::size_t len)
{
    std::size_t chunksize;
    const union uint512_u* d = reinterpret_cast<const union uint512_u*>(data);

    // fast path: hash whole 64-byte blocks straight from the input,
    // advancing the block pointer together with data
    while (len > 63 && ctx->bufsize == 0) {
        stage2(ctx, d);
        d++;
        data += 64;
        len -= 64;
    }

    while (len) {
        chunksize = 64 - ctx->bufsize;
        if (chunksize > len) {
            chunksize = len;
        }

        std::memcpy(&ctx->buffer.BYTE[ctx->bufsize], data, chunksize);

        ctx->bufsize += chunksize;
        len -= chunksize;
        data += chunksize;

        if (ctx->bufsize == 64) {
            stage2(ctx, &ctx->buffer);
            ctx->bufsize = 0;
        }
    }
    _mm_empty();
}
HRESULT CBaseVideoFilter::Receive(IMediaSample* pIn)
{
#ifndef _WIN64
    // TODOX64 : fixme!
    _mm_empty(); // just for safety
#endif
    CAutoLock cAutoLock(&m_csReceive);

    HRESULT hr;

    AM_SAMPLE2_PROPERTIES* const pProps = m_pInput->SampleProps();
    if (pProps->dwStreamId != AM_STREAM_MEDIA) {
        return m_pOutput->Deliver(pIn);
    }

    AM_MEDIA_TYPE* pmt;
    if (SUCCEEDED(pIn->GetMediaType(&pmt)) && pmt) {
        CMediaType mt(*pmt);
        m_pInput->SetMediaType(&mt);
        DeleteMediaType(pmt);
    }

    if (FAILED(hr = Transform(pIn))) {
        return hr;
    }

    return S_OK;
}
/* Combines unpack and accumulate */
void vector_accumulate_8bit(float *out, const char *in, int n)
{
#ifdef FOLD_USE_INTRINSICS
    __m128 in_, out_, tmp_;
    float ftmp;
    int ii;
    for (ii = 0 ; ii < (n & -16) ; ii += 16) {
        __builtin_prefetch(out + 64, 1, 0);
        __builtin_prefetch(in + 64, 0, 0);

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;

        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
    }
    for (; ii < (n & -4) ; ii += 4) {
        out_ = _MM_LOAD_PS(out);
        in_ = _mm_cvtpi8_ps(*((__m64 *)in));
        tmp_ = _mm_add_ps(out_, in_);
        _MM_STORE_PS(out, tmp_);
        in += 4;
        out += 4;
    }
    for (; ii < n ; ii++) {
        // Cast these without intrinsics
        ftmp = (float)(*in);
        out_ = _mm_load_ss(out);
        in_ = _mm_load_ss(&ftmp);
        tmp_ = _mm_add_ss(out_, in_);
        _mm_store_ss(out, tmp_);
        in += 1;
        out += 1;
    }
    _mm_empty();
#else
    int i;
    for (i = 0; i < n; i++) {
        out[i] += (float)in[i];
    }
#endif
}
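_MM_LOAD_PS and _MM_STORE_PS above are project macros that are not shown here; a plausible stand-in that makes the snippet compile (an assumption — the original presumably selects aligned or unaligned forms at build time) is:

#include <xmmintrin.h>

/* Assumed fallback definitions: the unaligned forms are always safe,
   at some cost on older CPUs even when the pointers happen to be aligned. */
#ifndef _MM_LOAD_PS
#define _MM_LOAD_PS  _mm_loadu_ps
#define _MM_STORE_PS _mm_storeu_ps
#endif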
///////////////////////////////////////////////transforms/////////////////////////////////////////////////////////////////////
void Haar::transrows(char** dest, char** sour, unsigned int w, unsigned int h) const
{
    unsigned int w2 = w / 2;
    __m64 m00FF;
    m00FF.m64_u64 = 0x00FF00FF00FF00FF;
    for (unsigned int y = 0; y < h; y++) {
        __m64 *mlo   = (__m64 *) & dest[y][0];
        __m64 *mhi   = (__m64 *) & dest[y][w2];
        __m64 *msour = (__m64 *) & sour[y][0];
        for (unsigned int k = 0; k < w2 / 8; k++) {   //k<w2/8 k=8*k
            __m64 even = _mm_packs_pu16(_mm_and_si64(*msour, m00FF),
                                        _mm_and_si64(*(msour + 1), m00FF));    //even coeffs
            __m64 odd  = _mm_packs_pu16(_mm_srli_pi16(*msour, 8),
                                        _mm_srli_pi16(*(msour + 1), 8));       //odd coeffs
            addsub(even, odd, mlo++, mhi++);
            msour += 2;
        }
        if (w2 % 8) {
            for (unsigned int k = w2 - (w2 % 8); k < w2; k++) {
                dest[y][k]    = char(((int)sour[y][2*k] + (int)sour[y][2*k+1]) / 2);
                dest[y][k+w2] = char(((int)sour[y][2*k] - (int)sour[y][2*k+1]) / 2);
            }
        }
    }
    _mm_empty();
}
STDMETHODIMP TffdshowEnc::deliverEncodedSample(const TmediaSample &sample, TencFrameParams &params)
{
    _mm_empty();
    params.outlength = coSettings->storeAVI ? params.length : 0;
    if (mux) {
        mux->writeFrame(sample, params.length, params);
    }
    if (params.keyframe) {
        keyspacing = 0;
    }
    totalsize += params.length;
    if (outputdebug || outputdebugfile) {
        dbgWrite(_l("1st-pass: size:%d total-kbytes:%d %s quant:%d %s kblocks:%d mblocks:%d\n"),
                 params.length, int(totalsize / 1024),
                 FRAME_TYPE::name(params.frametype), params.quant,
                 encQuantTypes[params.quanttype], params.kblks, params.mblks);
    }
    params.framenum++;
    keyspacing++;
    encStats.add(params);
    if (h_graph) {
        PostMessage(h_graph, TencStats::MSG_FF_FRAME, params.length, (params.quant << 20) | params.frametype);
    }
    return ICERR_OK;
}
/* use compiler intrinsics for 2x parallel processing */
static inline double chi2_intrinsic_double(int n, const double* x, const double* y)
{
    double result = 0;
    const __m128d eps  = _mm_set1_pd(DBL_MIN);
    const __m128d zero = _mm_setzero_pd();
    __m128d chi2 = _mm_setzero_pd();

    for ( ; n > 1; n -= 2) {
        const __m128d a = _mm_loadu_pd(x);
        const __m128d b = _mm_loadu_pd(y);
        x += 2;
        y += 2;
        const __m128d a_plus_b          = _mm_add_pd(a, b);
        const __m128d a_plus_b_plus_eps = _mm_add_pd(a_plus_b, eps);
        const __m128d a_minus_b         = _mm_sub_pd(a, b);
        const __m128d a_minus_b_sq      = _mm_mul_pd(a_minus_b, a_minus_b);
        const __m128d quotient          = _mm_div_pd(a_minus_b_sq, a_plus_b_plus_eps);
        chi2 = _mm_add_pd(chi2, quotient);
    }

    const __m128d shuffle = _mm_shuffle_pd(chi2, chi2, _MM_SHUFFLE2(0,1));
    const __m128d sum     = _mm_add_pd(chi2, shuffle);
    // with SSE3, we could use hadd_pd, but the difference is negligible

    _mm_store_sd(&result, sum);
    _mm_empty();
    if (n)
        result += chi2_baseline_double(n, x, y);  // remaining entries
    return result;
}
void pix_diff :: processYUV_MMX (imageStruct &image, imageStruct &right)
{
  int datasize = image.xsize * image.ysize * image.csize;
  __m64 *leftPix  = (__m64*)image.data;
  __m64 *rightPix = (__m64*)right.data;
  datasize = datasize/sizeof(__m64) + (datasize%sizeof(__m64) != 0);

  __m64 mask = _mm_setr_pi8(0x40, 0x00, 0x40, 0x00,
                            0x40, 0x00, 0x40, 0x00);
  __m64 l, r, b;

  while (datasize--) {
    l = leftPix [datasize];
    r = rightPix[datasize];

    l = _mm_adds_pu8(l, mask);
    r = _mm_subs_pu8(r, mask);

    b = l;
    b = _mm_subs_pu8(b, r);
    r = _mm_subs_pu8(r, l);
    b = _mm_or_si64(b, r);

    leftPix[datasize] = b;
  }
  _mm_empty();
}
int ulsch_qpsk_llr(LTE_DL_FRAME_PARMS *frame_parms,
                   int **rxdataF_comp,
                   short *ulsch_llr,
                   unsigned char symbol,
                   unsigned short nb_rb)
{
  __m128i *rxF = (__m128i*)&rxdataF_comp[0][(symbol*frame_parms->N_RB_DL*12)];
  int i;

  if (symbol == 0)
    llr128U = (__m128i*)ulsch_llr;

  if (!llr128U) {
    msg("ulsch_qpsk_llr: llr is null, symbol %d, llr128=%p\n", symbol, llr128U);
    return(-1);
  }

  //  printf("qpsk llr for symbol %d (pos %d), llr offset %d\n",symbol,(symbol*frame_parms->N_RB_DL*12),llr128-(__m128i*)ulsch_llr);

  for (i=0; i<(nb_rb*3); i++) {
    *llr128U = *rxF;
    rxF++;
    llr128U++;
  }

  _mm_empty();
  _m_empty();

  return(0);
}
void pix_subtract :: processYUV_MMX (imageStruct &image, imageStruct &right)
{
  int datasize = image.xsize * image.ysize * image.csize;
  __m64 *leftPix  = (__m64*)image.data;
  __m64 *rightPix = (__m64*)right.data;
  datasize = datasize/sizeof(__m64) + (datasize%sizeof(__m64) != 0);

  __m64 null64 = _mm_setzero_si64();
  __m64 offset = _mm_setr_pi16(0x80, 0x00, 0x80, 0x00);
  __m64 l0, l1, r0, r1;

  while (datasize--) {
    l1 = leftPix [datasize];
    r1 = rightPix[datasize];

    l0 = _mm_unpacklo_pi8 (l1, null64);
    r0 = _mm_unpacklo_pi8 (r1, null64);
    l1 = _mm_unpackhi_pi8 (l1, null64);
    r1 = _mm_unpackhi_pi8 (r1, null64);

    l0 = _mm_adds_pu16(l0, offset);
    l1 = _mm_adds_pu16(l1, offset);

    l0 = _mm_subs_pu16(l0, r0);
    l1 = _mm_subs_pu16(l1, r1);

    leftPix[datasize] = _mm_packs_pu16(l0, l1);
  }
  _mm_empty();
}
void memset32(void *pDest, uint32_t value, size_t numBytes)
{
    // numBytes must be a multiple of 16 -- use memset() for general purpose
    VIZ_ASSERT(!(numBytes & 15));

    // an 8-byte boundary guarantees correctly aligned writes
    VIZ_ASSERT(!(reinterpret_cast<uint32_t>(pDest) & 7));

    if (numBytes > 0)
    {
        __asm
        {
            mov         edi, pDest
            movd        mm0, value          // load the 32-bit pattern...
            punpckldq   mm0, mm0            // ...and duplicate it to 64 bits
            mov         ecx, numBytes
            shr         ecx, 4
    _loop:
            movntq      [edi], mm0
            movntq      [edi+8], mm0
            add         edi, 16
            dec         ecx
            jnz         _loop
        }

        _mm_empty();
    }
}
void PP_Test::process(int threadIndex, int threadCount, PuresoftFBO* frame, PuresoftFBO* depth)
{
    // buffer entry for this thread
    uintptr_t frameBuffer = (uintptr_t)frame->getBuffer();
    int scanline = frame->getScanline();
    frameBuffer += threadIndex * scanline;

    const unsigned char f[] = {50,50,50,50,50,50,50,50};

    __asm{
        lea eax,f
        movq mm2,[eax]
    }

    for(int y = threadIndex; y < frame->getHeight(); y += threadCount)
    {
        PURESOFTBGRA* row = (PURESOFTBGRA*)frameBuffer;

        for(int x = 0; x < frame->getWidth(); x += 2)
        {
            __asm{
                mov eax,1
                movd mm1,eax
                mov edx,row
                movq mm0,[edx]
                paddb mm0,mm2
                movntq [edx],mm0
            }
            row += 2;
        }

        frameBuffer += scanline * threadCount;
    }

    _mm_empty();
}
//compute average channel_level on each (TX,RX) antenna pair
int dl_channel_level(s16 *dl_ch, LTE_DL_FRAME_PARMS *frame_parms)
{
  s16 rb;
  __m128i *dl_ch128;
  int avg;

  //clear average level
  avg128F = _mm_xor_si128(avg128F, avg128F);
  dl_ch128 = (__m128i *)dl_ch;

  for (rb=0; rb<frame_parms->N_RB_DL; rb++) {
    avg128F = _mm_add_epi32(avg128F, _mm_madd_epi16(dl_ch128[0], dl_ch128[0]));
    avg128F = _mm_add_epi32(avg128F, _mm_madd_epi16(dl_ch128[1], dl_ch128[1]));
    avg128F = _mm_add_epi32(avg128F, _mm_madd_epi16(dl_ch128[2], dl_ch128[2]));
    dl_ch128 += 3;
  }

  avg = (((int*)&avg128F)[0] +
         ((int*)&avg128F)[1] +
         ((int*)&avg128F)[2] +
         ((int*)&avg128F)[3]) / (frame_parms->N_RB_DL*12);

  _mm_empty();
  _m_empty();

  return(avg);
}
static void replace_luma_yuy2_mmx(BYTE *src, const BYTE *luma, int pitch, int luma_pitch, int width, int height)
{
  int mod8_width = width / 8 * 8;

  __m64 luma_mask = _mm_set1_pi16(0x00FF);
#pragma warning(push)
#pragma warning(disable: 4309)
  __m64 chroma_mask = _mm_set1_pi16(0xFF00);
#pragma warning(pop)

  for (int y = 0; y < height; y++) {
    for (int x = 0; x < mod8_width; x += 8) {
      __m64 s = *reinterpret_cast<const __m64*>(src+x);
      __m64 l = *reinterpret_cast<const __m64*>(luma+x);

      __m64 s_chroma = _mm_and_si64(s, chroma_mask);
      __m64 l_luma   = _mm_and_si64(l, luma_mask);

      __m64 result = _mm_or_si64(s_chroma, l_luma);

      *reinterpret_cast<__m64*>(src+x) = result;
    }

    for (int x = mod8_width; x < width; x += 2) {
      src[x] = luma[x];
    }

    src += pitch;
    luma += luma_pitch;
  }
  _mm_empty();
}
void DCTFFTW::DCTBytes2D(const unsigned char *srcp, int src_pitch, unsigned char *dctp, int dct_pitch)
{
    _mm_empty();
    Bytes2Float(srcp, src_pitch, fSrc);
    fftwf_execute_r2r_addr(dctplan, fSrc, fSrcDCT);
    Float2Bytes(dctp, dct_pitch, fSrcDCT);
}
/* Test the 64-bit form */
static void
ssse3_test_pmaddubsw (int *i1, int *i2, int *r)
{
  __m64 t1 = *(__m64 *) i1;
  __m64 t2 = *(__m64 *) i2;
  *(__m64 *) r = _mm_maddubs_pi16 (t1, t2);
  _mm_empty ();
}
/* Test the 64-bit form */
static void
ssse3_test_pshufb (int *i1, int *i2, int *r)
{
  __m64 t1 = *(__m64 *) i1;
  __m64 t2 = *(__m64 *) i2;
  *(__m64 *) r = _mm_shuffle_pi8 (t1, t2);
  _mm_empty ();
}
void calc_LBP11_sse(IplImage * src, IplImage * dst)
{
    for (int x = 0; x < src->width; x += 14) {
        calc_lbp_16_strip(src, dst, x);
    }
    _mm_empty();
}
/* Test the 64-bit form */
static void
ssse3_test_phaddd (int *i1, int *i2, int *r)
{
  __m64 t1 = *(__m64 *) i1;
  __m64 t2 = *(__m64 *) i2;
  *(__m64 *) r = _mm_hadd_pi32 (t1, t2);
  _mm_empty ();
}