static int forward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *opt_sc) { register __m128 mpv, dpv, ipv; /* previous row values */ register __m128 sv; /* temp storage of 1 curr row value in progress */ register __m128 dcv; /* delayed storage of D(i,q+1) */ register __m128 xEv; /* E state: keeps max for Mk->E as we go */ register __m128 xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ __m128 zerov; /* splatted 0.0's in a vector */ float xN, xE, xB, xC, xJ; /* special states' scores */ int i; /* counter over sequence positions 1..L */ int q; /* counter over quads 0..nq-1 */ int j; /* counter over DD iterations (4 is full serialization) */ int Q = p7O_NQF(om->M); /* segment length: # of vectors */ __m128 *dpc = ox->dpf[0]; /* current row, for use in {MDI}MO(dpp,q) access macro */ __m128 *dpp; /* previous row, for use in {MDI}MO(dpp,q) access macro */ __m128 *rp; /* will point at om->rfv[x] for residue x[i] */ __m128 *tp; /* will point into (and step thru) om->tfv */ /* Initialization. */ ox->M = om->M; ox->L = L; ox->has_own_scales = TRUE; /* all forward matrices control their own scalefactors */ zerov = _mm_setzero_ps(); for (q = 0; q < Q; q++) MMO(dpc,q) = IMO(dpc,q) = DMO(dpc,q) = zerov; xE = ox->xmx[p7X_E] = 0.; xN = ox->xmx[p7X_N] = 1.; xJ = ox->xmx[p7X_J] = 0.; xB = ox->xmx[p7X_B] = om->xf[p7O_N][p7O_MOVE]; xC = ox->xmx[p7X_C] = 0.; ox->xmx[p7X_SCALE] = 1.0; ox->totscale = 0.0; #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, 0, 9, 5, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=0, width=8, precision=5*/ #endif for (i = 1; i <= L; i++) { dpp = dpc; dpc = ox->dpf[do_full * i]; /* avoid conditional, use do_full as kronecker delta */ rp = om->rfv[dsq[i]]; tp = om->tfv; dcv = _mm_setzero_ps(); xEv = _mm_setzero_ps(); xBv = _mm_set1_ps(xB); /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12. Shift zeros on. */ mpv = esl_sse_rightshift_ps(MMO(dpp,Q-1), zerov); dpv = esl_sse_rightshift_ps(DMO(dpp,Q-1), zerov); ipv = esl_sse_rightshift_ps(IMO(dpp,Q-1), zerov); for (q = 0; q < Q; q++) { /* Calculate new MMO(i,q); don't store it yet, hold it in sv. */ sv = _mm_mul_ps(xBv, *tp); tp++; sv = _mm_add_ps(sv, _mm_mul_ps(mpv, *tp)); tp++; sv = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++; sv = _mm_add_ps(sv, _mm_mul_ps(dpv, *tp)); tp++; sv = _mm_mul_ps(sv, *rp); rp++; xEv = _mm_add_ps(xEv, sv); /* Load {MDI}(i-1,q) into mpv, dpv, ipv; * {MDI}MX(q) is then the current, not the prev row */ mpv = MMO(dpp,q); dpv = DMO(dpp,q); ipv = IMO(dpp,q); /* Do the delayed stores of {MD}(i,q) now that memory is usable */ MMO(dpc,q) = sv; DMO(dpc,q) = dcv; /* Calculate the next D(i,q+1) partially: M->D only; * delay storage, holding it in dcv */ dcv = _mm_mul_ps(sv, *tp); tp++; /* Calculate and store I(i,q); assumes odds ratio for emission is 1.0 */ sv = _mm_mul_ps(mpv, *tp); tp++; IMO(dpc,q) = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++; } /* Now the DD paths. We would rather not serialize them but * in an accurate Forward calculation, we have few options. 
*/ /* dcv has carried through from end of q loop above; store it * in first pass, we add M->D and D->D path into DMX */ /* We're almost certainly obligated to do at least one complete * DD path to be sure: */ dcv = esl_sse_rightshift_ps(dcv, zerov); DMO(dpc,0) = zerov; tp = om->tfv + 7*Q; /* set tp to start of the DD's */ for (q = 0; q < Q; q++) { DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q)); dcv = _mm_mul_ps(DMO(dpc,q), *tp); tp++; /* extend DMO(q), so we include M->D and D->D paths */ } /* Now, on small models, it seems best (empirically) to just go * ahead and serialize. On large models, we can do a bit better, * by testing for when dcv (DD path) accrued to DMO(q) is below * machine epsilon for all q, in which case we know DMO(q) are all * at their final values. The tradeoff point is (empirically) somewhere around M=100, * at least on my desktop. We don't worry about the conditional here; * it's outside any inner loops. */ if (om->M < 100) { /* Fully serialized version */ for (j = 1; j < 4; j++) { dcv = esl_sse_rightshift_ps(dcv, zerov); tp = om->tfv + 7*Q; /* set tp to start of the DD's */ for (q = 0; q < Q; q++) { /* note, extend dcv, not DMO(q); only adding DD paths now */ DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q)); dcv = _mm_mul_ps(dcv, *tp); tp++; } } } else { /* Slightly parallelized version, but which incurs some overhead */ for (j = 1; j < 4; j++) { register __m128 cv; /* keeps track of whether any DD's change DMO(q) */ dcv = esl_sse_rightshift_ps(dcv, zerov); tp = om->tfv + 7*Q; /* set tp to start of the DD's */ cv = zerov; for (q = 0; q < Q; q++) { /* using cmpgt below tests if DD changed any DMO(q) *without* conditional branch */ sv = _mm_add_ps(dcv, DMO(dpc,q)); cv = _mm_or_ps(cv, _mm_cmpgt_ps(sv, DMO(dpc,q))); DMO(dpc,q) = sv; /* store new DMO(q) */ dcv = _mm_mul_ps(dcv, *tp); tp++; /* note, extend dcv, not DMO(q) */ } if (! _mm_movemask_ps(cv)) break; /* DD's didn't change any DMO(q)? Then done, break out. */ } } /* Add D's to xEv */ for (q = 0; q < Q; q++) xEv = _mm_add_ps(DMO(dpc,q), xEv); /* Finally the "special" states, which start from Mk->E (->C, ->J->B) */ /* The following incantation is a horizontal sum of xEv's elements */ /* These must follow DD calculations, because D's contribute to E in Forward * (as opposed to Viterbi) */ xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(0, 3, 2, 1))); xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(1, 0, 3, 2))); _mm_store_ss(&xE, xEv); xN = xN * om->xf[p7O_N][p7O_LOOP]; xC = (xC * om->xf[p7O_C][p7O_LOOP]) + (xE * om->xf[p7O_E][p7O_MOVE]); xJ = (xJ * om->xf[p7O_J][p7O_LOOP]) + (xE * om->xf[p7O_E][p7O_LOOP]); xB = (xJ * om->xf[p7O_J][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_MOVE]); /* and now xB will carry over into next i, and xC carries over after i=L */ /* Sparse rescaling. xE above threshold? Trigger a rescaling event. */ if (xE > 1.0e4) /* that's a little less than e^10, ~10% of our dynamic range */ { xN = xN / xE; xC = xC / xE; xJ = xJ / xE; xB = xB / xE; xEv = _mm_set1_ps(1.0 / xE); for (q = 0; q < Q; q++) { MMO(dpc,q) = _mm_mul_ps(MMO(dpc,q), xEv); DMO(dpc,q) = _mm_mul_ps(DMO(dpc,q), xEv); IMO(dpc,q) = _mm_mul_ps(IMO(dpc,q), xEv); } ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = xE; ox->totscale += log(xE); xE = 1.0; } else ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = 1.0; /* Storage of the specials. We could've stored these already * but using xE, etc. variables makes it easy to convert this * code to O(M) memory versions just by deleting storage steps.
*/ ox->xmx[i*p7X_NXCELLS+p7X_E] = xE; ox->xmx[i*p7X_NXCELLS+p7X_N] = xN; ox->xmx[i*p7X_NXCELLS+p7X_J] = xJ; ox->xmx[i*p7X_NXCELLS+p7X_B] = xB; ox->xmx[i*p7X_NXCELLS+p7X_C] = xC; #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, i, 9, 5, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=i, width=8, precision=5*/ #endif } /* end loop over sequence residues 1..L */ /* finally C->T, and flip total score back to log space (nats) */ /* On overflow, xC is inf or nan (nan arises because inf*0 = nan). */ /* On an underflow (which shouldn't happen), we counterintuitively return infinity: * the effect of this is to force the caller to rescore us with full range. */ if (isnan(xC)) ESL_EXCEPTION(eslERANGE, "forward score is NaN"); else if (L>0 && xC == 0.0) ESL_EXCEPTION(eslERANGE, "forward score underflow (is 0.0)"); /* if L==0, xC *should* be 0.0; J5/118 */ else if (isinf(xC) == 1) ESL_EXCEPTION(eslERANGE, "forward score overflow (is infinity)"); if (opt_sc != NULL) *opt_sc = ox->totscale + log(xC * om->xf[p7O_C][p7O_MOVE]); return eslOK; }
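/*
 * A minimal standalone sketch (not part of the engine above) of the
 * shuffle/add horizontal-sum idiom used to collapse xEv into xE. It needs
 * only SSE1, which is why the engine avoids SSE3's _mm_hadd_ps().
 */
#include <xmmintrin.h>

static inline float
hsum_ps_sse1(__m128 v)
{
  v = _mm_add_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1))); /* rotate lanes by one and add */
  v = _mm_add_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2))); /* swap 64-bit halves and add  */
  float r;                                                          /* every lane now holds the total */
  _mm_store_ss(&r, v);
  return r;
}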
void BrushToolEdit::drawSmoothen(const QPoint &pt, float amount) { Terrain *tip = tool->tip(pt).data(); // compute affected rectangle QRect dirtyRect(pt, tip->size()); dirtyRect = dirtyRect.intersected(QRect(QPoint(0, 0), terrain->size())); if (!dirtyRect.isValid()) { return; } edit->beginEdit(dirtyRect, terrain); QSize tSize = terrain->size(); QSize tipSize = tip->size(); QSize blurBufferSize(dirtyRect.width() + 6, dirtyRect.height() + 6); TemporaryBuffer<float> blurBuffer1(blurBufferSize.width() * blurBufferSize.height(), 4); TemporaryBuffer<float> blurBuffer2(blurBufferSize.width() * blurBufferSize.height(), 4); TemporaryBuffer<float> tipBuffer(blurBufferSize.width() * blurBufferSize.height(), 4); for (int y = 0; y < blurBufferSize.height(); ++y) { int cy = y + dirtyRect.top() - 3; cy = std::max(std::min(cy, tSize.height() - 1), 0); for (int x = 0; x < blurBufferSize.width(); ++x) { int cx = x + dirtyRect.left() - 3; cx = std::max(std::min(cx, tSize.width() - 1), 0); blurBuffer1[x + y * blurBufferSize.width()] = terrain->landform(cx, cy); } } for (int y = 0; y < blurBufferSize.height(); ++y) { int cy = y + dirtyRect.top() - 3; int ty = cy - pt.y(); if (ty >= 0 && ty < tipSize.height()) { for (int x = 0; x < blurBufferSize.width(); ++x) { int cx = x + dirtyRect.left() - 3; int tx = cx - pt.x(); tipBuffer[x + y * blurBufferSize.width()] = tx >= 0 && tx < tipSize.width() ? tip->landform(tx, ty) * amount : 0.f; } } else { std::fill(&tipBuffer[y * blurBufferSize.width()], &tipBuffer[(y + 1) * blurBufferSize.width()], 0.f); } } // apply horizontal blur for (int y = 0; y < blurBufferSize.height(); ++y) { float *inBuf = blurBuffer1 + y * blurBufferSize.width(); float *outBuf = blurBuffer2 + y * blurBufferSize.width(); float *varBuf = tipBuffer + y * blurBufferSize.width(); for (int x = 3; x < blurBufferSize.width() - 3; ++x) { float variance = varBuf[x]; __m128 kernel = globalGaussianKernelTable.fetch(variance); // sample input __m128 p1 = _mm_loadu_ps(inBuf + x - 3); __m128 p2 = _mm_loadu_ps(inBuf + x); p1 = _mm_shuffle_ps(p1, p1, _MM_SHUFFLE(0, 1, 2, 3)); auto p = _mm_add_ps(p1, p2); // apply kernel p = _mm_mul_ps(p, kernel); p = _mm_hadd_ps(p, p); p = _mm_hadd_ps(p, p); // write _mm_store_ss(outBuf + x, p); } } // apply vertical blur for (int y = 3; y < blurBufferSize.height() - 3; ++y) { float *inBuf = blurBuffer2 + y * blurBufferSize.width(); float *outBuf = blurBuffer1 + y * blurBufferSize.width(); float *varBuf = tipBuffer + y * blurBufferSize.width(); for (int x = 0; x < blurBufferSize.width() - 3; x += 4) { // fetch kernel __m128 kernel1 = globalGaussianKernelTable.fetch(varBuf[x]); __m128 kernel2 = globalGaussianKernelTable.fetch(varBuf[x + 1]); __m128 kernel3 = globalGaussianKernelTable.fetch(varBuf[x + 2]); __m128 kernel4 = globalGaussianKernelTable.fetch(varBuf[x + 3]); _MM_TRANSPOSE4_PS(kernel1, kernel2, kernel3, kernel4); // load input __m128 p1 = _mm_loadu_ps(inBuf + x); p1 = _mm_add_ps(p1, p1); __m128 p2 = _mm_loadu_ps(inBuf + x - blurBufferSize.width()); p2 = _mm_add_ps(p2, _mm_loadu_ps(inBuf + x + blurBufferSize.width())); __m128 p3 = _mm_loadu_ps(inBuf + x - blurBufferSize.width() * 2); p3 = _mm_add_ps(p3, _mm_loadu_ps(inBuf + x + blurBufferSize.width() * 2)); __m128 p4 = _mm_loadu_ps(inBuf + x - blurBufferSize.width() * 3); p4 = _mm_add_ps(p4, _mm_loadu_ps(inBuf + x + blurBufferSize.width() * 3)); // apply kernel p1 = _mm_mul_ps(p1, kernel1); p2 = _mm_mul_ps(p2, kernel2); p3 = _mm_mul_ps(p3, kernel3); p4 = _mm_mul_ps(p4, kernel4); p1 = _mm_add_ps(p1, p2); p3 = 
_mm_add_ps(p3, p4); auto p = _mm_add_ps(p1, p3); // store _mm_storeu_ps(outBuf + x, p); } } for (int y = 0; y < dirtyRect.height(); ++y) { float *inBuf = blurBuffer1 + (y + 3) * blurBufferSize.width() + 3; for (int x = 0; x < dirtyRect.width(); ++x) { int cx = x + dirtyRect.left(); int cy = y + dirtyRect.top(); terrain->landform(cx, cy) = inBuf[x]; } } edit->endEdit(terrain); }
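// Scalar reference (illustrative only, not from the original source) for one
// output sample of the horizontal pass above: the reversed load plus p1 + p2
// evaluates a symmetric 7-tap kernel, with k[0] applied to a doubled centre
// tap. Handy for unit-testing the SSE path.
static float blur7Scalar(const float *in, const float k[4], int x)
{
    float acc = k[0] * (in[x] + in[x]); // centre tap counted twice, matching p1 + p2
    for (int t = 1; t <= 3; ++t)
        acc += k[t] * (in[x - t] + in[x + t]);
    return acc;
}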
void decomp_gamma0_minus( spinor_array src, halfspinor_array dst) { /* c <-> color, s <-> spin */ /* Space for upper components */ __m128 xmm0; __m128 xmm1; __m128 xmm2; /* Space for lower components */ __m128 xmm3; __m128 xmm4; __m128 xmm5; /* Swap upper and lower components */ /* Compiler should spill, or use 64 bit extras */ __m128 xmm6; __m128 xmm7; __m128 xmm8; /* Swap upper and lower components */ /* Compiler should spill, or use 64 bit extras */ __m128 xmm9; __m128 xmm10; __m128 xmm11; xmm0 = _mm_load_ps(&src[0][0][0]); xmm2 = _mm_load_ps(&src[0][2][0]); xmm6 = _mm_load_ps(&src[1][1][0]); xmm3 = _mm_load_ps(&src[2][0][0]); xmm5 = _mm_load_ps(&src[2][2][0]); xmm7 = _mm_load_ps(&src[3][1][0]); xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero xmm4 = _mm_xor_ps(xmm4,xmm4); xmm1 = _mm_movelh_ps(xmm1,xmm6); xmm4 = _mm_movelh_ps(xmm4,xmm7); xmm1 = _mm_movehl_ps(xmm1, xmm0); xmm4 = _mm_movehl_ps(xmm4, xmm3); xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4); xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4); xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4); xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4); /* Swap the lower components and multiply by -i*/ xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0x1b); xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0x1b); xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0x1b); xmm9 = _mm_xor_ps(xmm6, signs24.vector); xmm10 = _mm_xor_ps(xmm7, signs24.vector); xmm11 = _mm_xor_ps(xmm8, signs24.vector); /* Add */ xmm0 = _mm_add_ps(xmm0, xmm9); xmm1 = _mm_add_ps(xmm1, xmm10); xmm2 = _mm_add_ps(xmm2, xmm11); /* Store */ _mm_store_ps(&dst[0][0][0],xmm0); _mm_store_ps(&dst[1][0][0],xmm1); _mm_store_ps(&dst[2][0][0],xmm2); }
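/*
 * Hedged sketch of the sign-flip trick above: signs24 is assumed to carry
 * the IEEE-754 sign bit in elements 2 and 4 (1-based), so the XOR negates
 * those two floats; combined with the 0x1b lane reversal this implements
 * the "multiply by -i" on each complex pair. The layout of the original
 * signs24 constant is an assumption inferred from its name and use.
 */
#include <xmmintrin.h>

static const union { unsigned int u[4]; __m128 v; } signs24_sketch =
  { { 0x00000000u, 0x80000000u, 0x00000000u, 0x80000000u } };

/* _mm_xor_ps(x, signs24_sketch.v) turns { x0, x1, x2, x3 } into { x0, -x1, x2, -x3 } */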
__m128 addsubps(__m128 x, __m128 y){
  __m128 a = _mm_add_ps(x,y);
  __m128 b = _mm_sub_ps(x,y);
  a = _mm_shuffle_ps(a,b,_MM_SHUFFLE(0,2,1,3));    /* {a3, a1, b2, b0} */
  return _mm_shuffle_ps(a,a,_MM_SHUFFLE(0,2,1,3)); /* {b0, a1, b2, a3} = {x0-y0, x1+y1, x2-y2, x3+y3} */
}
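/* On SSE3 targets the two-shuffle emulation above collapses to a single
 * instruction; a hedged drop-in equivalent: */
#include <pmmintrin.h>

__m128 addsubps_sse3(__m128 x, __m128 y){ return _mm_addsub_ps(x, y); } /* {x0-y0, x1+y1, x2-y2, x3+y3} */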
struct call<tag::sort_(tag::simd_<tag::type32_, tag::sse_> ), tag::cpu_, Dummy> : callable
{
  template<class Sig> struct result;
  template<class This,class A0>
  struct result<This(A0)> : meta::strip<A0>{};

  NT2_FUNCTOR_CALL(1)
  {
    typedef typename meta::as_real<A0>::type flt;
    A0 a = {a0};
    A0 b = {NT2_CAST(A0, _mm_movehl_ps(NT2_CAST(flt, a0), NT2_CAST(flt, a0)))};
    comp(a, b);
    a = NT2_CAST(A0, _mm_movelh_ps(NT2_CAST(flt, a), NT2_CAST(flt, b)));
    b = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, a), NT2_CAST(flt, b), NT2_SH(1, 3, 1, 3)));
    comp(a, b);
    A0 c = {NT2_CAST(A0, _mm_movelh_ps(NT2_CAST(flt, b), NT2_CAST(flt, b)))};
    A0 d = {a};
    comp(c, d);
    a = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, c), NT2_CAST(flt, a), NT2_SH(3, 2, 0, 0)));
    b = NT2_CAST(A0, _mm_movehl_ps(NT2_CAST(flt, b), NT2_CAST(flt, d)));
    b = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, a), NT2_CAST(flt, b), NT2_SH(3, 1, 0, 2)));
    return b;
  }

  private :
  template < class T > static inline void comp(T & a, T & b)
  {
    T c = nt2::min(a, b);
    b   = nt2::max(a, b);
    a   = c;
  }
};
// Shuffle together two vector's components template <Byte X, Byte Y, Byte Z, Byte W> inline Vector VFunction Shuffle(const Vector& vectorA, const Vector& vectorB) { return _mm_shuffle_ps(vectorA, vectorB, _MM_SHUFFLE(W, Z, Y, X)); }
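// Usage sketch (hedged: assumes Vector wraps a raw __m128 and Byte is an
// 8-bit unsigned type, as the template above implies): pick the low halves
// of two vectors.
inline Vector VFunction LowHalves(const Vector& vectorA, const Vector& vectorB)
{
    return Shuffle<0, 1, 0, 1>(vectorA, vectorB); // { a[0], a[1], b[0], b[1] }
}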
void x86_sse_find_peaks(float *buf, unsigned nframes, float *min, float *max) { __m128 current_max, current_min, work; // Load max and min values into all four slots of the XMM registers current_min = _mm_set1_ps(*min); current_max = _mm_set1_ps(*max); // Work input until "buf" reaches 16 byte alignment while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) { // Load the next float into the work buffer work = _mm_set1_ps(*buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf++; nframes--; } // use 64 byte prefetch for quadruple quads while (nframes >= 16) { __builtin_prefetch(buf+64,0,0); work = _mm_load_ps(buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf+=4; work = _mm_load_ps(buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf+=4; work = _mm_load_ps(buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf+=4; work = _mm_load_ps(buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf+=4; nframes-=16; } // work through aligned buffers while (nframes >= 4) { work = _mm_load_ps(buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf+=4; nframes-=4; } // work through the rest < 4 samples while ( nframes > 0) { // Load the next float into the work buffer work = _mm_set1_ps(*buf); current_min = _mm_min_ps(current_min, work); current_max = _mm_max_ps(current_max, work); buf++; nframes--; } // Find min & max value in current_max through shuffle tricks work = current_min; work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1)); work = _mm_min_ps (work, current_min); current_min = work; work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2)); work = _mm_min_ps (work, current_min); _mm_store_ss(min, work); work = current_max; work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1)); work = _mm_max_ps (work, current_max); current_max = work; work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2)); work = _mm_max_ps (work, current_max); _mm_store_ss(max, work); }
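/* Hedged usage sketch: the routine only tightens *min/*max (it loads them as
 * its starting bounds), so the caller must seed them, e.g. with +/-FLT_MAX
 * or the first sample. */
#include <float.h>

static void find_peaks_example(float *buf, unsigned nframes)
{
	float mn = FLT_MAX, mx = -FLT_MAX;
	x86_sse_find_peaks(buf, nframes, &mn, &mx);
	/* mn/mx now bound every sample in buf[0..nframes) */
}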
/* merge "s+s" elements and return sorted result in "dest" array TODO(d'b): replace magic numbers with macro */ inline void bitonic_merge_kernel16n(float *dest, float *a, uint32_t sa, float *b /* must not be reversed*/, uint32_t sb) { __m128 ma[4]; __m128 mb[4]; __m128 lo[4]; __m128 hi[4]; #define LOAD16(arg) \ mb[3] = _mm_load_ps(arg); \ mb[2] = _mm_load_ps(arg + 4); \ mb[1] = _mm_load_ps(arg + 8); \ mb[0] = _mm_load_ps(arg + 12); arg+=16 float *last_a = a + sa; float *last_b = b + sb; float *last_dest = dest + sa + sb; ma[0] = _mm_load_ps(a); a+=4; ma[1] = _mm_load_ps(a); a+=4; ma[2] = _mm_load_ps(a); a+=4; ma[3] = _mm_load_ps(a); a+=4; for(; dest < (last_dest - 16); dest += 16) { /* Load either a or b */ if(a < last_a) { if(b < last_b) { if(*((uint32_t*)a) < *((uint32_t*)b)) { LOAD16(a); } else { LOAD16(b); } } else { LOAD16(a); } } else { LOAD16(b); } /* Reverse *b */ mb[0] = _mm_shuffle_ps(mb[0], mb[0], 0x1b); mb[1] = _mm_shuffle_ps(mb[1], mb[1], 0x1b); mb[2] = _mm_shuffle_ps(mb[2], mb[2], 0x1b); mb[3] = _mm_shuffle_ps(mb[3], mb[3], 0x1b); lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[0]), _mm_castps_si128(mb[0]))); hi[0] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[0]), _mm_castps_si128(mb[0]))); lo[1] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[1]), _mm_castps_si128(mb[1]))); hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[1]), _mm_castps_si128(mb[1]))); lo[2] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[2]), _mm_castps_si128(mb[2]))); hi[2] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[2]), _mm_castps_si128(mb[2]))); lo[3] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[3]), _mm_castps_si128(mb[3]))); hi[3] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[3]), _mm_castps_si128(mb[3]))); _mm_store_ps(&dest[0], lo[0]); _mm_store_ps(&dest[4], lo[1]); _mm_store_ps(&dest[8], lo[2]); _mm_store_ps(&dest[12], lo[3]); _mm_store_ps(&dest[16], hi[2]); _mm_store_ps(&dest[20], hi[3]); _mm_store_ps(&dest[24], hi[0]); _mm_store_ps(&dest[28], hi[1]); bitonic_merge_kernel8core(dest, dest + 8); bitonic_merge_kernel8core(dest + 16, dest + 24); ma[0] = _mm_load_ps(&dest[16]); ma[1] = _mm_load_ps(&dest[20]); ma[2] = _mm_load_ps(&dest[24]); ma[3] = _mm_load_ps(&dest[28]); } }
// use MMX/SSE extensions // // (a + jb)(c + jd) = (ac - bd) + j(ad + bc) // // mm_x = { x[0].real, x[0].imag, x[1].real, x[1].imag } // mm_hi = { h[0].real, h[0].real, h[1].real, h[1].real } // mm_hq = { h[0].imag, h[0].imag, h[1].imag, h[1].imag } // // mm_y0 = mm_x * mm_hi // = { x[0].real * h[0].real, // x[0].imag * h[0].real, // x[1].real * h[1].real, // x[1].imag * h[1].real }; // // mm_y1 = mm_x * mm_hq // = { x[0].real * h[0].imag, // x[0].imag * h[0].imag, // x[1].real * h[1].imag, // x[1].imag * h[1].imag }; // void dotprod_cccf_execute_mmx(dotprod_cccf _q, float complex * _x, float complex * _y) { // type cast input as floating point array float * x = (float*) _x; // double effective length unsigned int n = 2*_q->n; // temporary buffers __m128 v; // input vector __m128 hi; // coefficients vector (real) __m128 hq; // coefficients vector (imag) __m128 ci; // output multiplication (v * hi) __m128 cq; // output multiplication (v * hq) // aligned output array float w[4] __attribute__((aligned(16))) = {0,0,0,0}; #if HAVE_PMMINTRIN_H // SSE3 __m128 s; // dot product __m128 sum = _mm_setzero_ps(); // load zeros into sum register #else // no SSE3 float wi[4] __attribute__((aligned(16))); float wq[4] __attribute__((aligned(16))); #endif // t = 4*(floor(_n/4)) unsigned int t = (n >> 2) << 2; // unsigned int i; for (i=0; i<t; i+=4) { // load inputs into register (unaligned) // {x[0].real, x[0].imag, x[1].real, x[1].imag} v = _mm_loadu_ps(&x[i]); // load coefficients into register (aligned) hi = _mm_load_ps(&_q->hi[i]); hq = _mm_load_ps(&_q->hq[i]); // compute parallel multiplications ci = _mm_mul_ps(v, hi); cq = _mm_mul_ps(v, hq); // shuffle values cq = _mm_shuffle_ps( cq, cq, _MM_SHUFFLE(2,3,0,1) ); #if HAVE_PMMINTRIN_H // SSE3: combine using addsub_ps() s = _mm_addsub_ps( ci, cq ); // accumulate sum = _mm_add_ps(sum, s); #else // no SSE3: combine using slow method // FIXME: implement slow method // unload values _mm_store_ps(wi, ci); _mm_store_ps(wq, cq); // accumulate w[0] += wi[0] - wq[0]; w[1] += wi[1] + wq[1]; w[2] += wi[2] - wq[2]; w[3] += wi[3] + wq[3]; #endif } #if HAVE_PMMINTRIN_H // unload packed array _mm_store_ps(w, sum); #endif // add in-phase and quadrature components w[0] += w[2]; // I w[1] += w[3]; // Q //float complex total = *((float complex*)w); float complex total = w[0] + w[1] * _Complex_I; // cleanup for (i=t/2; i<_q->n; i++) total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I ); // set return value *_y = total; }
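/*
 * Standalone sketch (illustrative only) of the SSE3 complex-multiply core
 * used above: two complex products per vector via shuffle + addsub.
 *   x  = { a0.re, a0.im, a1.re, a1.im }
 *   hi = { h0.re, h0.re, h1.re, h1.re },  hq = { h0.im, h0.im, h1.im, h1.im }
 */
#include <pmmintrin.h>

static inline __m128 cmul2(__m128 x, __m128 hi, __m128 hq)
{
    __m128 ci = _mm_mul_ps(x, hi);                        /* re*h.re, im*h.re */
    __m128 cq = _mm_mul_ps(x, hq);                        /* re*h.im, im*h.im */
    cq = _mm_shuffle_ps(cq, cq, _MM_SHUFFLE(2, 3, 0, 1)); /* swap re/im within pairs */
    return _mm_addsub_ps(ci, cq); /* { re*h.re - im*h.im, im*h.re + re*h.im, ... } */
}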
/* elements are given in 2 arrays (4 and 4), result will be returned in the same arrays with a straight order */ inline void bitonic_sort_kernel4(float *a, float *b) { __m128 ma; __m128 mb; __m128 map; __m128 mbp; __m128 lo; __m128 hi; /* load 8 elements to sse registers */ ma = _mm_load_ps(a); mb = _mm_load_ps(b); /* In-Register sort */ map = _mm_shuffle_ps(ma, mb, _MM_SHUFFLE(2, 0, 2, 0)); /* 0x88: */ mbp = _mm_shuffle_ps(ma, mb, _MM_SHUFFLE(3, 1, 3, 1)); /* 0xdd: */ lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); map = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8: */ mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 3, 1)); /* 0x8d: */ lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); map = _mm_shuffle_ps(lo, lo, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8: */ mbp = _mm_shuffle_ps(hi, hi, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72: */ lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1, 0, 0, 1)); /* 0x41: */ mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 2, 2, 3)); /* 0xeb: */ lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 2, 1, 0)); /* 0xe4: */ mbp = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1, 0, 3, 2)); /* 0x4e: */ lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8: */ mbp = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 3, 1)); /* 0x8d: */ lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp))); map = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)); /* 0x88: */ mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)); /* 0xdd: */ map = _mm_shuffle_ps(map, map, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72: */ mbp = _mm_shuffle_ps(mbp, mbp, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72: */ /* unload sorted elements to memory */ _mm_store_ps(a, map); _mm_store_ps(b, mbp); CHECK_RAWS(a, b, 4); }
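/*
 * Hedged cross-check for the kernel above (assumes the sorted result lands
 * low half in a[], high half in b[], i.e. "straight order"): qsort the same
 * eight values and compare. Valid for non-negative floats, whose IEEE bit
 * patterns order like unsigned ints -- the reason _mm_min_epu32 and
 * _mm_max_epu32 can be used at all.
 */
#include <stdlib.h>
#include <string.h>

static int cmp_float(const void *x, const void *y)
{
  float fx = *(const float *)x, fy = *(const float *)y;
  return (fx > fy) - (fx < fy);
}

/* returns 1 if bitonic_sort_kernel4() agrees with qsort on (a, b) */
static int check_kernel4(float *a, float *b) /* both 16-byte aligned, 4 floats each */
{
  float ref[8];
  memcpy(ref, a, 4 * sizeof(float));
  memcpy(ref + 4, b, 4 * sizeof(float));
  qsort(ref, 8, sizeof(float), cmp_float);
  bitonic_sort_kernel4(a, b);
  return memcmp(a, ref, 4 * sizeof(float)) == 0 &&
         memcmp(b, ref + 4, 4 * sizeof(float)) == 0;
}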
/* merge 2 sorted arrays (8 elements each) to 1 sorted array return result (16 elements) in the same arrays TODO(d'b): replace magic numbers with macro */ inline void bitonic_merge_kernel8core(float *a, float *b /* must be reversed*/) { __m128 map[2]; __m128 mbp[2]; __m128 lo[2]; __m128 hi[2]; map[0] = _mm_load_ps(a); mbp[0] = _mm_load_ps(b); map[1] = _mm_load_ps(a + 4); mbp[1] = _mm_load_ps(b + 4); lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(mbp[0]))); hi[0] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(mbp[0]))); lo[1] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[1]), _mm_castps_si128(mbp[1]))); hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[1]), _mm_castps_si128(mbp[1]))); map[0] = lo[0]; map[1] = lo[1]; mbp[0] = hi[0]; mbp[1] = hi[1]; /* L1 processing */ lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1]))); lo[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1]))); hi[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1]))); hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1]))); map[0] = _mm_shuffle_ps(lo[0], lo[1], 0xe4); map[1] = _mm_shuffle_ps(lo[0], lo[1], 0x4e); mbp[0] = _mm_shuffle_ps(hi[0], hi[1], 0xe4); mbp[1] = _mm_shuffle_ps(hi[0], hi[1], 0x4e); /* L2 processing */ lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1]))); lo[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1]))); hi[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1]))); hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1]))); map[0] = _mm_shuffle_ps(lo[0], lo[1], 0xd8); map[1] = _mm_shuffle_ps(lo[0], lo[1], 0x8d); mbp[0] = _mm_shuffle_ps(hi[0], hi[1], 0xd8); mbp[1] = _mm_shuffle_ps(hi[0], hi[1], 0x8d); /* L3 processing */ lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1]))); lo[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1]))); hi[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1]))); hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1]))); map[0] = _mm_shuffle_ps(lo[1], lo[0], 0x88); map[1] = _mm_shuffle_ps(lo[1], lo[0], 0xdd); mbp[0] = _mm_shuffle_ps(hi[1], hi[0], 0x88); mbp[1] = _mm_shuffle_ps(hi[1], hi[0], 0xdd); map[0] = _mm_shuffle_ps(map[0], map[0], 0x72); map[1] = _mm_shuffle_ps(map[1], map[1], 0x72); mbp[0] = _mm_shuffle_ps(mbp[0], mbp[0], 0x72); mbp[1] = _mm_shuffle_ps(mbp[1], mbp[1], 0x72); _mm_store_ps(&a[0], map[0]); _mm_store_ps(&a[4], map[1]); _mm_store_ps(&b[0], mbp[0]); _mm_store_ps(&b[4], mbp[1]); CHECK_RAWS(a, b, 8); }
static inline int sacIsSampleDegenerate(PROSAC_HEST* p){ unsigned i0 = p->smpl[0], i1 = p->smpl[1], i2 = p->smpl[2], i3 = p->smpl[3]; /** * Pack the matches selected by the SAC algorithm. * Must be packed points[0:7] = {srcx0, srcy0, srcx1, srcy1, srcx2, srcy2, srcx3, srcy3} * points[8:15] = {dstx0, dsty0, dstx1, dsty1, dstx2, dsty2, dstx3, dsty3} * Gather 4 points into the vector */ __m128 src10 = _mm_loadl_pi(src10, (__m64*)&p->src[i0]); src10 = _mm_loadh_pi(src10, (__m64*)&p->src[i1]); __m128 src32 = _mm_loadl_pi(src32, (__m64*)&p->src[i2]); src32 = _mm_loadh_pi(src32, (__m64*)&p->src[i3]); __m128 dst10 = _mm_loadl_pi(dst10, (__m64*)&p->dst[i0]); dst10 = _mm_loadh_pi(dst10, (__m64*)&p->dst[i1]); __m128 dst32 = _mm_loadl_pi(dst32, (__m64*)&p->dst[i2]); dst32 = _mm_loadh_pi(dst32, (__m64*)&p->dst[i3]); /** * If the matches' source points have common x and y coordinates, abort. */ /** * Check: * packedPoints[0].x == packedPoints[2].x * packedPoints[0].y == packedPoints[2].y * packedPoints[1].x == packedPoints[3].x * packedPoints[1].y == packedPoints[3].y */ __m128 chkEq0 = _mm_cmpeq_ps(src10, src32); /** * Check: * packedPoints[1].x == packedPoints[2].x * packedPoints[1].y == packedPoints[2].y * packedPoints[0].x == packedPoints[3].x * packedPoints[0].y == packedPoints[3].y */ __m128 chkEq1 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src10, _MM_SHUFFLE(1, 0, 3, 2)), src32); /** * Check: * packedPoints[0].x == packedPoints[1].x * packedPoints[0].y == packedPoints[1].y * packedPoints[2].x == packedPoints[3].x * packedPoints[2].y == packedPoints[3].y */ __m128 chkEq2 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src32, _MM_SHUFFLE(1, 0, 1, 0)), _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 2, 3, 2))); /* Verify */ if(_mm_movemask_ps(_mm_or_ps(chkEq0, _mm_or_ps(chkEq1, chkEq2)))){ return 1; } /* If the matches do not satisfy the strong geometric constraint, abort. 
*/ /** * p6420x = (p6.x, p4.x, p2.x, p0.x) * p6420y = (p6.y, p4.y, p2.y, p0.y) * p7531x = (p7.x, p5.x, p3.x, p1.x) * p7531y = (p7.y, p5.y, p3.y, p1.y) * crosssd0 = p6420y - p7531y = (cross2d0, cross0d0, cross2s0, cross0s0) * crosssd1 = p7531x - p6420x = (cross2d1, cross0d1, cross2s1, cross0s1) * crosssd2 = p6420x * p7531y - p6420y * p7531x = (cross2d2, cross0d2, cross2s2, cross0s2) * * shufcrosssd0 = (cross0d0, cross2d0, cross0s0, cross2s0) * shufcrosssd1 = (cross0d1, cross2d1, cross0s1, cross2s1) * shufcrosssd2 = (cross0d2, cross2d2, cross0s2, cross2s2) * * dotsd0 = shufcrosssd0 * p6420x + * shufcrosssd1 * p6420y + * shufcrosssd2 * = (dotd0, dotd2, dots0, dots2) * dotsd1 = shufcrosssd0 * p7531x + * shufcrosssd1 * p7531y + * shufcrosssd2 * = (dotd1, dotd3, dots1, dots3) * * dots = shufps(dotsd0, dotsd1, _MM_SHUFFLE(1, 0, 1, 0)) * dotd = shufps(dotsd0, dotsd1, _MM_SHUFFLE(3, 2, 3, 2)) * movmaskps(dots ^ dotd) */ __m128 p3210x = _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p3210y = _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 1, 3, 1)); __m128 p7654x = _mm_shuffle_ps(dst10, dst32, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p7654y = _mm_shuffle_ps(dst10, dst32, _MM_SHUFFLE(3, 1, 3, 1)); __m128 p6420x = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p6420y = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(2, 0, 2, 0)); __m128 p7531x = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(3, 1, 3, 1)); __m128 p7531y = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(3, 1, 3, 1)); __m128 crosssd0 = _mm_sub_ps(p6420y, p7531y); __m128 crosssd1 = _mm_sub_ps(p7531x, p6420x); __m128 crosssd2 = _mm_sub_ps(_mm_mul_ps(p6420x, p7531y), _mm_mul_ps(p6420y, p7531x)); __m128 shufcrosssd0 = _mm_shuffle_ps(crosssd0, crosssd0, _MM_SHUFFLE(2, 3, 0, 1)); __m128 shufcrosssd1 = _mm_shuffle_ps(crosssd1, crosssd1, _MM_SHUFFLE(2, 3, 0, 1)); __m128 shufcrosssd2 = _mm_shuffle_ps(crosssd2, crosssd2, _MM_SHUFFLE(2, 3, 0, 1)); __m128 dotsd0 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p6420x), _mm_mul_ps(shufcrosssd1, p6420y)), shufcrosssd2); __m128 dotsd1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p7531x), _mm_mul_ps(shufcrosssd1, p7531y)), shufcrosssd2); __m128 dots = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(0, 1, 0, 1)); __m128 dotd = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(2, 3, 2, 3)); //if(_mm_movemask_ps(_mm_cmpge_ps(_mm_setzero_ps(), _mm_mul_ps(dots, dotd)))){ if(_mm_movemask_epi8(_mm_cmplt_epi32(_mm_xor_si128(_mm_cvtps_epi32(dots), _mm_cvtps_epi32(dotd)), _mm_setzero_si128()))){ return 1; } /* Otherwise, proceed with evaluation */ _mm_store_ps((float*)&p->pkdPts[0], src10); _mm_store_ps((float*)&p->pkdPts[2], src32); _mm_store_ps((float*)&p->pkdPts[4], dst10); _mm_store_ps((float*)&p->pkdPts[6], dst32); return 0; }
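/*
 * Hedged sketch of the loadl_pi/loadh_pi gather used above: pack two (x,y)
 * points into one __m128 with two 64-bit loads. Note the explicit zero
 * initialisation; the original's self-referential first load reads src10
 * uninitialised, which is harmless in practice but technically undefined
 * behaviour.
 */
#include <xmmintrin.h>

static inline __m128 gather2pts(const float *p0, const float *p1)
{
    __m128 v = _mm_setzero_ps();
    v = _mm_loadl_pi(v, (const __m64 *)p0); /* lanes 0,1 = p0.x, p0.y */
    v = _mm_loadh_pi(v, (const __m64 *)p1); /* lanes 2,3 = p1.x, p1.y */
    return v;
}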
static void rftfsub_128_SSE2(float *a) { const float *c = rdft_w + 32; int j1, j2, k1, k2; float wkr, wki, xr, xi, yr, yi; static const ALIGN16_BEG float ALIGN16_END k_half[4] = {0.5f, 0.5f, 0.5f, 0.5f}; const __m128 mm_half = _mm_load_ps(k_half); // Vectorized code (four at once). // Note: commented number are indexes for the first iteration of the loop. for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) { // Load 'wk'. const __m128 c_j1 = _mm_loadu_ps(&c[ j1]); // 1, 2, 3, 4, const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]); // 28, 29, 30, 31, const __m128 wkrt = _mm_sub_ps(mm_half, c_k1); // 28, 29, 30, 31, const __m128 wkr_ = _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28, const __m128 wki_ = c_j1; // 1, 2, 3, 4, // Load and shuffle 'a'. const __m128 a_j2_0 = _mm_loadu_ps(&a[0 + j2]); // 2, 3, 4, 5, const __m128 a_j2_4 = _mm_loadu_ps(&a[4 + j2]); // 6, 7, 8, 9, const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]); // 120, 121, 122, 123, const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]); // 124, 125, 126, 127, const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4, _MM_SHUFFLE(2, 0, 2 ,0)); // 2, 4, 6, 8, const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4, _MM_SHUFFLE(3, 1, 3 ,1)); // 3, 5, 7, 9, const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0, _MM_SHUFFLE(0, 2, 0 ,2)); // 126, 124, 122, 120, const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0, _MM_SHUFFLE(1, 3, 1 ,3)); // 127, 125, 123, 121, // Calculate 'x'. const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0); // 2-126, 4-124, 6-122, 8-120, const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1); // 3-127, 5-125, 7-123, 9-121, // Calculate product into 'y'. // yr = wkr * xr - wki * xi; // yi = wkr * xi + wki * xr; const __m128 a_ = _mm_mul_ps(wkr_, xr_); const __m128 b_ = _mm_mul_ps(wki_, xi_); const __m128 c_ = _mm_mul_ps(wkr_, xi_); const __m128 d_ = _mm_mul_ps(wki_, xr_); const __m128 yr_ = _mm_sub_ps(a_, b_); // 2-126, 4-124, 6-122, 8-120, const __m128 yi_ = _mm_add_ps(c_, d_); // 3-127, 5-125, 7-123, 9-121, // Update 'a'. // a[j2 + 0] -= yr; // a[j2 + 1] -= yi; // a[k2 + 0] += yr; // a[k2 + 1] -= yi; const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_); // 2, 4, 6, 8, const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_); // 3, 5, 7, 9, const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_); // 126, 124, 122, 120, const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_); // 127, 125, 123, 121, // Shuffle in right order and store. const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n); // 2, 3, 4, 5, const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n); // 6, 7, 8, 9, const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n); // 122, 123, 120, 121, const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n); // 126, 127, 124, 125, const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt, _MM_SHUFFLE(1, 0, 3 ,2)); // 120, 121, 122, 123, const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt, _MM_SHUFFLE(1, 0, 3 ,2)); // 124, 125, 126, 127, _mm_storeu_ps(&a[0 + j2], a_j2_0n); _mm_storeu_ps(&a[4 + j2], a_j2_4n); _mm_storeu_ps(&a[122 - j2], a_k2_0n); _mm_storeu_ps(&a[126 - j2], a_k2_4n); } // Scalar code for the remaining items. for (; j2 < 64; j1 += 1, j2 += 2) { k2 = 128 - j2; k1 = 32 - j1; wkr = 0.5f - c[k1]; wki = c[j1]; xr = a[j2 + 0] - a[k2 + 0]; xi = a[j2 + 1] + a[k2 + 1]; yr = wkr * xr - wki * xi; yi = wkr * xi + wki * xr; a[j2 + 0] -= yr; a[j2 + 1] -= yi; a[k2 + 0] += yr; a[k2 + 1] -= yi; } }
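/*
 * The _MM_SHUFFLE(2,0,2,0) / _MM_SHUFFLE(3,1,3,1) pairs above are the
 * standard SSE de-interleave; a standalone sketch for one block of four
 * complex values (illustrative only):
 */
#include <xmmintrin.h>

static void deinterleave4(const float *a, __m128 *re, __m128 *im)
{
  const __m128 lo = _mm_loadu_ps(a);     /* r0 i0 r1 i1 */
  const __m128 hi = _mm_loadu_ps(a + 4); /* r2 i2 r3 i3 */
  *re = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 2, 0)); /* r0 r1 r2 r3 */
  *im = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 1, 3, 1)); /* i0 i1 i2 i3 */
}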
static int backward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, const P7_OMX *fwd, P7_OMX *bck, float *opt_sc) { register __m128 mpv, ipv, dpv; /* previous row values */ register __m128 mcv, dcv; /* current row values */ register __m128 tmmv, timv, tdmv; /* tmp vars for accessing rotated transition scores */ register __m128 xBv; /* collects B->Mk components of B(i) */ register __m128 xEv; /* splatted E(i) */ __m128 zerov; /* splatted 0.0's in a vector */ float xN, xE, xB, xC, xJ; /* special states' scores */ int i; /* counter over sequence positions 0,1..L */ int q; /* counter over quads 0..Q-1 */ int Q = p7O_NQF(om->M); /* segment length: # of vectors */ int j; /* DD segment iteration counter (4 = full serialization) */ __m128 *dpc; /* current DP row */ __m128 *dpp; /* next ("previous") DP row */ __m128 *rp; /* will point into om->rfv[x] for residue x[i+1] */ __m128 *tp; /* will point into (and step thru) om->tfv transition scores */ /* initialize the L row. */ bck->M = om->M; bck->L = L; bck->has_own_scales = FALSE; /* backwards scale factors are *usually* given by <fwd> */ dpc = bck->dpf[L * do_full]; xJ = 0.0; xB = 0.0; xN = 0.0; xC = om->xf[p7O_C][p7O_MOVE]; /* C<-T */ xE = xC * om->xf[p7O_E][p7O_MOVE]; /* E<-C, no tail */ xEv = _mm_set1_ps(xE); zerov = _mm_setzero_ps(); dcv = zerov; /* solely to silence a compiler warning */ for (q = 0; q < Q; q++) MMO(dpc,q) = DMO(dpc,q) = xEv; for (q = 0; q < Q; q++) IMO(dpc,q) = zerov; /* init row L's DD paths, 1) first segment includes xE, from DMO(q) */ tp = om->tfv + 8*Q - 1; /* <*tp> now the [4 8 12 x] TDD quad */ dpv = _mm_move_ss(DMO(dpc,Q-1), zerov); /* start leftshift: [1 5 9 13] -> [x 5 9 13] */ dpv = _mm_shuffle_ps(dpv, dpv, _MM_SHUFFLE(0,3,2,1)); /* finish leftshift:[x 5 9 13] -> [5 9 13 x] */ for (q = Q-1; q >= 0; q--) { dcv = _mm_mul_ps(dpv, *tp); tp--; DMO(dpc,q) = _mm_add_ps(DMO(dpc,q), dcv); dpv = DMO(dpc,q); } /* 2) three more passes, only extending DD component (dcv only; no xE contrib from DMO(q)) */ for (j = 1; j < 4; j++) { tp = om->tfv + 8*Q - 1; /* <*tp> now the [4 8 12 x] TDD quad */ dcv = _mm_move_ss(dcv, zerov); /* start leftshift: [1 5 9 13] -> [x 5 9 13] */ dcv = _mm_shuffle_ps(dcv, dcv, _MM_SHUFFLE(0,3,2,1)); /* finish leftshift:[x 5 9 13] -> [5 9 13 x] */ for (q = Q-1; q >= 0; q--) { dcv = _mm_mul_ps(dcv, *tp); tp--; DMO(dpc,q) = _mm_add_ps(DMO(dpc,q), dcv); } } /* now MD init */ tp = om->tfv + 7*Q - 3; /* <*tp> now the [4 8 12 x] Mk->Dk+1 quad */ dcv = _mm_move_ss(DMO(dpc,0), zerov); /* start leftshift: [1 5 9 13] -> [x 5 9 13] */ dcv = _mm_shuffle_ps(dcv, dcv, _MM_SHUFFLE(0,3,2,1)); /* finish leftshift:[x 5 9 13] -> [5 9 13 x] */ for (q = Q-1; q >= 0; q--) { MMO(dpc,q) = _mm_add_ps(MMO(dpc,q), _mm_mul_ps(dcv, *tp)); tp -= 7; dcv = DMO(dpc,q); } /* Sparse rescaling: same scale factors as fwd matrix */ if (fwd->xmx[L*p7X_NXCELLS+p7X_SCALE] > 1.0) { xE = xE / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; xN = xN / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; xC = xC / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; xJ = xJ / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; xB = xB / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; xEv = _mm_set1_ps(1.0 / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]); for (q = 0; q < Q; q++) { MMO(dpc,q) = _mm_mul_ps(MMO(dpc,q), xEv); DMO(dpc,q) = _mm_mul_ps(DMO(dpc,q), xEv); IMO(dpc,q) = _mm_mul_ps(IMO(dpc,q), xEv); } } bck->xmx[L*p7X_NXCELLS+p7X_SCALE] = fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]; bck->totscale = log(bck->xmx[L*p7X_NXCELLS+p7X_SCALE]); /* Stores */ bck->xmx[L*p7X_NXCELLS+p7X_E] = xE; bck->xmx[L*p7X_NXCELLS+p7X_N] = xN; 
bck->xmx[L*p7X_NXCELLS+p7X_J] = xJ; bck->xmx[L*p7X_NXCELLS+p7X_B] = xB; bck->xmx[L*p7X_NXCELLS+p7X_C] = xC; #if p7_DEBUGGING if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, L, 9, 4, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=L, width=9, precision=4*/ #endif /* main recursion */ for (i = L-1; i >= 1; i--) /* backwards stride */ { /* phase 1. B(i) collected. Old row destroyed, new row contains * complete I(i,k), partial {MD}(i,k) w/ no {MD}->{DE} paths yet. */ dpc = bck->dpf[i * do_full]; dpp = bck->dpf[(i+1) * do_full]; rp = om->rfv[dsq[i+1]] + Q-1; /* <*rp> is now the [4 8 12 x] match emission quad */ tp = om->tfv + 7*Q - 1; /* <*tp> is now the [4 8 12 x] TII transition quad */ /* leftshift the first transition quads */ tmmv = _mm_move_ss(om->tfv[1], zerov); tmmv = _mm_shuffle_ps(tmmv, tmmv, _MM_SHUFFLE(0,3,2,1)); timv = _mm_move_ss(om->tfv[2], zerov); timv = _mm_shuffle_ps(timv, timv, _MM_SHUFFLE(0,3,2,1)); tdmv = _mm_move_ss(om->tfv[3], zerov); tdmv = _mm_shuffle_ps(tdmv, tdmv, _MM_SHUFFLE(0,3,2,1)); mpv = _mm_mul_ps(MMO(dpp,0), om->rfv[dsq[i+1]][0]); /* precalc M(i+1,k+1) * e(M_k+1, x_{i+1}) */ mpv = _mm_move_ss(mpv, zerov); mpv = _mm_shuffle_ps(mpv, mpv, _MM_SHUFFLE(0,3,2,1)); xBv = zerov; for (q = Q-1; q >= 0; q--) /* backwards stride */ { ipv = IMO(dpp,q); /* assumes emission odds ratio of 1.0; i+1's IMO(q) now free */ IMO(dpc,q) = _mm_add_ps(_mm_mul_ps(ipv, *tp), _mm_mul_ps(mpv, timv)); tp--; DMO(dpc,q) = _mm_mul_ps(mpv, tdmv); mcv = _mm_add_ps(_mm_mul_ps(ipv, *tp), _mm_mul_ps(mpv, tmmv)); tp-= 2; mpv = _mm_mul_ps(MMO(dpp,q), *rp); rp--; /* obtain mpv for next q. i+1's MMO(q) is freed */ MMO(dpc,q) = mcv; tdmv = *tp; tp--; timv = *tp; tp--; tmmv = *tp; tp--; xBv = _mm_add_ps(xBv, _mm_mul_ps(mpv, *tp)); tp--; } /* phase 2: now that we have accumulated the B->Mk transitions in xBv, we can do the specials */ /* this incantation is a horiz sum of xBv elements: (_mm_hadd_ps() would require SSE3) */ xBv = _mm_add_ps(xBv, _mm_shuffle_ps(xBv, xBv, _MM_SHUFFLE(0, 3, 2, 1))); xBv = _mm_add_ps(xBv, _mm_shuffle_ps(xBv, xBv, _MM_SHUFFLE(1, 0, 3, 2))); _mm_store_ss(&xB, xBv); xC = xC * om->xf[p7O_C][p7O_LOOP]; xJ = (xB * om->xf[p7O_J][p7O_MOVE]) + (xJ * om->xf[p7O_J][p7O_LOOP]); /* must come after xB */ xN = (xB * om->xf[p7O_N][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_LOOP]); /* must come after xB */ xE = (xC * om->xf[p7O_E][p7O_MOVE]) + (xJ * om->xf[p7O_E][p7O_LOOP]); /* must come after xJ, xC */ xEv = _mm_set1_ps(xE); /* splat */ /* phase 3: {MD}->E paths and one step of the D->D paths */ tp = om->tfv + 8*Q - 1; /* <*tp> now the [4 8 12 x] TDD quad */ dpv = _mm_add_ps(DMO(dpc,0), xEv); dpv = _mm_move_ss(dpv, zerov); dpv = _mm_shuffle_ps(dpv, dpv, _MM_SHUFFLE(0,3,2,1)); for (q = Q-1; q >= 0; q--) { dcv = _mm_mul_ps(dpv, *tp); tp--; DMO(dpc,q) = _mm_add_ps(DMO(dpc,q), _mm_add_ps(dcv, xEv)); dpv = DMO(dpc,q); MMO(dpc,q) = _mm_add_ps(MMO(dpc,q), xEv); } /* phase 4: finish extending the DD paths */ /* fully serialized for now */ for (j = 1; j < 4; j++) /* three passes: we've already done 1 segment, we need 4 total */ { dcv = _mm_move_ss(dcv, zerov); dcv = _mm_shuffle_ps(dcv, dcv, _MM_SHUFFLE(0,3,2,1)); tp = om->tfv + 8*Q - 1; /* <*tp> now the [4 8 12 x] TDD quad */ for (q = Q-1; q >= 0; q--) { dcv = _mm_mul_ps(dcv, *tp); tp--; DMO(dpc,q) = _mm_add_ps(DMO(dpc,q), dcv); } } /* phase 5: add M->D paths */ dcv = _mm_move_ss(DMO(dpc,0), zerov); dcv = _mm_shuffle_ps(dcv, dcv, _MM_SHUFFLE(0,3,2,1)); tp = om->tfv + 7*Q - 3; /* <*tp> is now the [4 8 12 x] Mk->Dk+1 quad */ for (q = Q-1; q >= 0; q--) { MMO(dpc,q) 
= _mm_add_ps(MMO(dpc,q), _mm_mul_ps(dcv, *tp)); tp -= 7; dcv = DMO(dpc,q); } /* Sparse rescaling */ /* In rare cases [J3/119] scale factors from <fwd> are * insufficient and backwards will overflow. In this case, we * switch on the fly to using our own scale factors, different * from those in <fwd>. This will complicate subsequent * posterior decoding routines. */ if (xB > 1.0e16) bck->has_own_scales = TRUE; if (bck->has_own_scales) bck->xmx[i*p7X_NXCELLS+p7X_SCALE] = (xB > 1.0e4) ? xB : 1.0; else bck->xmx[i*p7X_NXCELLS+p7X_SCALE] = fwd->xmx[i*p7X_NXCELLS+p7X_SCALE]; if (bck->xmx[i*p7X_NXCELLS+p7X_SCALE] > 1.0) { xE /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE]; xN /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE]; xJ /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE]; xB /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE]; xC /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE]; xBv = _mm_set1_ps(1.0 / bck->xmx[i*p7X_NXCELLS+p7X_SCALE]); for (q = 0; q < Q; q++) { MMO(dpc,q) = _mm_mul_ps(MMO(dpc,q), xBv); DMO(dpc,q) = _mm_mul_ps(DMO(dpc,q), xBv); IMO(dpc,q) = _mm_mul_ps(IMO(dpc,q), xBv); } bck->totscale += log(bck->xmx[i*p7X_NXCELLS+p7X_SCALE]); } /* Stores are separate only for pedagogical reasons: easy to * turn this into a more memory efficient version just by * deleting the stores. */ bck->xmx[i*p7X_NXCELLS+p7X_E] = xE; bck->xmx[i*p7X_NXCELLS+p7X_N] = xN; bck->xmx[i*p7X_NXCELLS+p7X_J] = xJ; bck->xmx[i*p7X_NXCELLS+p7X_B] = xB; bck->xmx[i*p7X_NXCELLS+p7X_C] = xC; #if p7_DEBUGGING if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, i, 9, 4, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=i, width=9, precision=4*/ #endif } /* thus ends the loop over sequence positions i */ /* Termination at i=0, where we can only reach N,B states. */ dpp = bck->dpf[1 * do_full]; tp = om->tfv; /* <*tp> is now the [1 5 9 13] TBMk transition quad */ rp = om->rfv[dsq[1]]; /* <*rp> is now the [1 5 9 13] match emission quad */ xBv = zerov; for (q = 0; q < Q; q++) { mpv = _mm_mul_ps(MMO(dpp,q), *rp); rp++; mpv = _mm_mul_ps(mpv, *tp); tp += 7; xBv = _mm_add_ps(xBv, mpv); } /* horizontal sum of xBv */ xBv = _mm_add_ps(xBv, _mm_shuffle_ps(xBv, xBv, _MM_SHUFFLE(0, 3, 2, 1))); xBv = _mm_add_ps(xBv, _mm_shuffle_ps(xBv, xBv, _MM_SHUFFLE(1, 0, 3, 2))); _mm_store_ss(&xB, xBv); xN = (xB * om->xf[p7O_N][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_LOOP]); bck->xmx[p7X_B] = xB; bck->xmx[p7X_C] = 0.0; bck->xmx[p7X_J] = 0.0; bck->xmx[p7X_N] = xN; bck->xmx[p7X_E] = 0.0; bck->xmx[p7X_SCALE] = 1.0; #if p7_DEBUGGING dpc = bck->dpf[0]; for (q = 0; q < Q; q++) /* Not strictly necessary, but if someone's looking at DP matrices, this is nice to do: */ MMO(dpc,q) = DMO(dpc,q) = IMO(dpc,q) = zerov; if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, 0, 9, 4, bck->xmx[p7X_E], bck->xmx[p7X_N], bck->xmx[p7X_J], bck->xmx[p7X_B], bck->xmx[p7X_C]); /* logify=TRUE, <rowi>=0, width=9, precision=4*/ #endif if (isnan(xN)) ESL_EXCEPTION(eslERANGE, "backward score is NaN"); else if (L>0 && xN == 0.0) ESL_EXCEPTION(eslERANGE, "backward score underflow (is 0.0)"); /* if L==0, xN *should* be 0.0 [J5/118]*/ else if (isinf(xN) == 1) ESL_EXCEPTION(eslERANGE, "backward score overflow (is infinity)"); if (opt_sc != NULL) *opt_sc = bck->totscale + log(xN); return eslOK; }
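/*
 * Hedged sketch of the left-shift idiom the backward pass uses throughout:
 * _mm_move_ss() zeroes lane 0, then a rotate-by-one shuffle shifts every
 * lane down and pulls the zero in at the top.
 */
#include <xmmintrin.h>

static inline __m128
leftshift_ps(__m128 v)
{
  v = _mm_move_ss(v, _mm_setzero_ps());                 /* { 0, v1, v2, v3 } */
  return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)); /* { v1, v2, v3, 0 } */
}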
mlib_status mlib_ImageColorConvert2_F32( const mlib_f32 *src, mlib_s32 slb, mlib_f32 *dst, mlib_s32 dlb, mlib_s32 xsize, mlib_s32 ysize, const mlib_d64 *fmat, const mlib_d64 *offset) { /* pointers for pixel and line of source */ mlib_f32 *sa, *sl; /* pointers for pixel and line of destination */ mlib_f32 *da, *dl; /* indices */ mlib_s32 i, j; /* intermediate */ __m128 p0, p1, p2, t0, t1, t2, s0, s1, q; /* packed kernel */ __m128 k0, k1, k2; /* packed offset */ __m128 off; /* load transposed kernel */ k0 = _mm_set_ps(0.0f, (mlib_f32)fmat[6], (mlib_f32)fmat[3], (mlib_f32)fmat[0]); k1 = _mm_set_ps(0.0f, (mlib_f32)fmat[7], (mlib_f32)fmat[4], (mlib_f32)fmat[1]); k2 = _mm_set_ps(0.0f, (mlib_f32)fmat[8], (mlib_f32)fmat[5], (mlib_f32)fmat[2]); /* load offset */ off = _mm_set_ps(0.0f, (mlib_f32)offset[2], (mlib_f32)offset[1], (mlib_f32)offset[0]); sa = sl = (mlib_f32 *)src; da = dl = dst; for (j = 0; j < ysize; j++) { #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ for (i = 0; i < (xsize - 1); i ++) { p0 = _mm_load1_ps(sa); sa ++; p1 = _mm_load1_ps(sa); sa ++; p2 = _mm_load1_ps(sa); sa ++; t0 = _mm_mul_ps(p0, k0); t1 = _mm_mul_ps(p1, k1); t2 = _mm_mul_ps(p2, k2); s0 = _mm_add_ps(t0, t1); s1 = _mm_add_ps(t2, off); q = _mm_add_ps(s0, s1); _mm_storeu_ps(da, q); da += 3; } /* * process the last pixel of each row separately * to avoid out of bound write */ p0 = _mm_load1_ps(sa); sa ++; p1 = _mm_load1_ps(sa); sa ++; p2 = _mm_load1_ps(sa); sa ++; t0 = _mm_mul_ps(p0, k0); t1 = _mm_mul_ps(p1, k1); t2 = _mm_mul_ps(p2, k2); s0 = _mm_add_ps(t0, t1); s1 = _mm_add_ps(t2, off); q = _mm_add_ps(s0, s1); _mm_storel_pi((__m64 *)da, q); da += 2; q = _mm_shuffle_ps(q, q, 0xaa); _mm_store_ss(da, q); /* set src pointer to next row */ sa = sl = sl + slb; /* set dst pointer to next row */ da = dl = dl + dlb; } return (MLIB_SUCCESS); }
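/* Hedged sketch of the 3-float tail store above: a 64-bit store covers lanes
 * 0-1, then lane 2 is broadcast with shuffle 0xaa and stored as a scalar, so
 * no 4th float is ever written past the end of the row. */
#include <xmmintrin.h>

static inline void store3_ps(float *p, __m128 v)
{
	_mm_storel_pi((__m64 *)p, v);                    /* p[0], p[1] */
	_mm_store_ss(p + 2, _mm_shuffle_ps(v, v, 0xaa)); /* p[2] = lane 2 */
}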
// use MMX/SSE extensions
void dotprod_cccf_execute_mmx4(dotprod_cccf _q, float complex * _x, float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    __m128 v0,  v1,  v2,  v3;   // input vectors
    __m128 hi0, hi1, hi2, hi3;  // coefficients vectors (real)
    __m128 hq0, hq1, hq2, hq3;  // coefficients vectors (imag)
    __m128 ci0, ci1, ci2, ci3;  // output multiplications (v * hi)
    __m128 cq0, cq1, cq2, cq3;  // output multiplications (v * hq)

    // load zeros into sum registers
    __m128 sumi = _mm_setzero_ps();
    __m128 sumq = _mm_setzero_ps();

    // r = 4*floor(n/16)
    unsigned int r = (n >> 4) << 2;

    unsigned int i;
    for (i=0; i<r; i+=4) {
        // load inputs into registers (unaligned)
        v0 = _mm_loadu_ps(&x[4*i+ 0]);
        v1 = _mm_loadu_ps(&x[4*i+ 4]);
        v2 = _mm_loadu_ps(&x[4*i+ 8]);
        v3 = _mm_loadu_ps(&x[4*i+12]);

        // load real coefficients into registers (aligned)
        hi0 = _mm_load_ps(&_q->hi[4*i+ 0]);
        hi1 = _mm_load_ps(&_q->hi[4*i+ 4]);
        hi2 = _mm_load_ps(&_q->hi[4*i+ 8]);
        hi3 = _mm_load_ps(&_q->hi[4*i+12]);

        // load imaginary coefficients into registers (aligned)
        hq0 = _mm_load_ps(&_q->hq[4*i+ 0]);
        hq1 = _mm_load_ps(&_q->hq[4*i+ 4]);
        hq2 = _mm_load_ps(&_q->hq[4*i+ 8]);
        hq3 = _mm_load_ps(&_q->hq[4*i+12]);

        // compute parallel multiplications (real)
        ci0 = _mm_mul_ps(v0, hi0);
        ci1 = _mm_mul_ps(v1, hi1);
        ci2 = _mm_mul_ps(v2, hi2);
        ci3 = _mm_mul_ps(v3, hi3);

        // compute parallel multiplications (imag)
        cq0 = _mm_mul_ps(v0, hq0);
        cq1 = _mm_mul_ps(v1, hq1);
        cq2 = _mm_mul_ps(v2, hq2);
        cq3 = _mm_mul_ps(v3, hq3);

        // accumulate
        sumi = _mm_add_ps(sumi, ci0);   sumq = _mm_add_ps(sumq, cq0);
        sumi = _mm_add_ps(sumi, ci1);   sumq = _mm_add_ps(sumq, cq1);
        sumi = _mm_add_ps(sumi, ci2);   sumq = _mm_add_ps(sumq, cq2);
        sumi = _mm_add_ps(sumi, ci3);   sumq = _mm_add_ps(sumq, cq3);
    }

    // shuffle values
    sumq = _mm_shuffle_ps( sumq, sumq, _MM_SHUFFLE(2,3,0,1) );

    // unload
    float wi[4] __attribute__((aligned(16)));
    float wq[4] __attribute__((aligned(16)));
    _mm_store_ps(wi, sumi);
    _mm_store_ps(wq, sumq);

    // fold down (add/sub)
    float complex total =
        ((wi[0] - wq[0]) + (wi[2] - wq[2])) +
        ((wi[1] + wq[1]) + (wi[3] + wq[3])) * _Complex_I;

    // cleanup (note: n _must_ be even)
    for (i=2*r; i<_q->n; i++)
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );

    // set return value
    *_y = total;
}
// Permute a vector's values across its components template <Byte X, Byte Y, Byte Z, Byte W> inline Vector VFunction Permute(const Vector& vector) { return _mm_shuffle_ps(vector, vector, _MM_SHUFFLE(W, Z, Y, X)); }
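// Usage sketch: reversing a vector's lanes with the template above.
inline Vector VFunction Reverse(const Vector& vector)
{
    return Permute<3, 2, 1, 0>(vector); // { v[3], v[2], v[1], v[0] }
}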
/*! * \brief Perform an horizontal sum of the given vector. * \param in The input vector type * \return the horizontal sum of the vector */ ETL_STATIC_INLINE(float) hadd(avx_simd_float in) { const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(in.value, 1), _mm256_castps256_ps128(in.value)); const __m128 x64 = _mm_add_ps(x128, _mm_movehl_ps(x128, x128)); const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55)); return _mm_cvtss_f32(x32); }
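// Hedged usage sketch (assumes avx_simd_float is constructible from a raw
// __m256, as ETL's wrapper type suggests): fold eight lanes down to one
// scalar, 8 -> 4 -> 2 -> 1.
//
//   float total = hadd(avx_simd_float(_mm256_set1_ps(1.0f)));  // 8.0f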
/* ============ idSIMD_SSE::CmpLT dst[i] |= ( src0[i] < constant ) << bitNum; ============ */ void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) { int i, cnt, pre, post; float *aligned; __m128 xmm0, xmm1; __m128i xmm0i; int cnt_l; char *src0_p; char *constant_p; char *dst_p; int mask_l; int dst_l; /* if the float array is not aligned on a 4 byte boundary */ if ( ((int) src0) & 3 ) { /* unaligned memory access */ pre = 0; cnt = count >> 2; post = count - (cnt<<2); /* __asm mov edx, cnt __asm test edx, edx __asm je doneCmp */ cnt_l = cnt; if(cnt_l != 0) { /* __asm push ebx __asm neg edx __asm mov esi, src0 __asm prefetchnta [esi+64] __asm movss xmm1, constant __asm shufps xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ) __asm mov edi, dst __asm mov cl, bitNum */ cnt_l = -cnt_l; src0_p = (char *) src0; _mm_prefetch(src0_p+64, _MM_HINT_NTA); constant_p = (char *) &constant; xmm1 = _mm_load_ss((float *)constant_p); xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )); dst_p = (char *)dst; /* __asm loopNA: */ do { /* __asm movups xmm0, [esi] __asm prefetchnta [esi+128] __asm cmpltps xmm0, xmm1 __asm movmskps eax, xmm0 \ __asm mov ah, al __asm shr ah, 1 __asm mov bx, ax __asm shl ebx, 14 __asm mov bx, ax __asm and ebx, 0x01010101 __asm shl ebx, cl __asm or ebx, dword ptr [edi] __asm mov dword ptr [edi], ebx __asm add esi, 16 __asm add edi, 4 __asm inc edx __asm jl loopNA __asm pop ebx */ xmm0 = _mm_loadu_ps((float *) src0_p); _mm_prefetch(src0_p+128, _MM_HINT_NTA); xmm0 = _mm_cmplt_ps(xmm0, xmm1); // Simplify using SSE2 xmm0i = (__m128i) xmm0; xmm0i = _mm_packs_epi32(xmm0i, xmm0i); xmm0i = _mm_packs_epi16(xmm0i, xmm0i); mask_l = _mm_cvtsi128_si32(xmm0i); // End mask_l = mask_l & 0x01010101; mask_l = mask_l << bitNum; dst_l = *((int *) dst_p); mask_l = mask_l | dst_l; *((int *) dst_p) = mask_l; src0_p = src0_p + 16; dst_p = dst_p + 4; cnt_l = cnt_l + 1; } while (cnt_l < 0); } }
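/*
 * Standalone sketch of the SSE2 mask compaction used above: two saturating
 * packs squeeze four 32-bit compare masks (0 or ~0) into the low four bytes
 * of the register, giving 0x00 or 0xFF per lane (hence the 0x01010101
 * and-mask in the loop).
 */
#include <emmintrin.h>

static inline int compact_mask4(__m128 cmp) /* cmp from _mm_cmplt_ps etc. */
{
	__m128i m = _mm_castps_si128(cmp);
	m = _mm_packs_epi32(m, m);   /* 32-bit lanes -> 16-bit */
	m = _mm_packs_epi16(m, m);   /* 16-bit lanes -> 8-bit  */
	return _mm_cvtsi128_si32(m); /* low 4 bytes, one per lane */
}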
float Matrix4_M128::Inverse(Matrix4_M128 &mOut) const { __m128 Fac0; { __m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(3, 3, 3, 3)); __m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(2, 2, 2, 2)); __m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(2, 2, 2, 2)); __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(3, 3, 3, 3)); __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); Fac0 = _mm_sub_ps(Mul00, Mul01); } __m128 Fac1; { __m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(3, 3, 3, 3)); __m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(1, 1, 1, 1)); __m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(1, 1, 1, 1)); __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(3, 3, 3, 3)); __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); Fac1 = _mm_sub_ps(Mul00, Mul01); } __m128 Fac2; { __m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(2, 2, 2, 2)); __m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(1, 1, 1, 1)); __m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(1, 1, 1, 1)); __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(2, 2, 2, 2)); __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); Fac2 = _mm_sub_ps(Mul00, Mul01); } __m128 Fac3; { __m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(3, 3, 3, 3)); __m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(0, 0, 0, 0)); __m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(0, 0, 0, 0)); __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(3, 3, 3, 3)); __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); Fac3 = _mm_sub_ps(Mul00, Mul01); } __m128 Fac4; { __m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(2, 2, 2, 2)); __m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(0, 0, 0, 0)); __m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(0, 0, 0, 0)); __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(2, 2, 2, 2)); __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); Fac4 = _mm_sub_ps(Mul00, Mul01); } __m128 Fac5; { __m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(1, 1, 1, 1)); __m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(0, 0, 0, 0)); __m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(0, 0, 0, 0)); __m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0)); __m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(1, 1, 1, 1)); __m128 Mul00 = _mm_mul_ps(Swp00, Swp01); __m128 Mul01 = _mm_mul_ps(Swp02, Swp03); Fac5 = _mm_sub_ps(Mul00, Mul01); } __m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f); __m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f); __m128 Temp0 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(0, 0, 0, 0)); __m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0)); __m128 Temp1 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(1, 1, 1, 1)); __m128 Vec1 = 
_mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0)); __m128 Temp2 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(2, 2, 2, 2)); __m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0)); __m128 Temp3 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(3, 3, 3, 3)); __m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0)); __m128 Mul00 = _mm_mul_ps(Vec1, Fac0); __m128 Mul01 = _mm_mul_ps(Vec2, Fac1); __m128 Mul02 = _mm_mul_ps(Vec3, Fac2); __m128 Sub00 = _mm_sub_ps(Mul00, Mul01); __m128 Add00 = _mm_add_ps(Sub00, Mul02); __m128 Inv0 = _mm_mul_ps(SignB, Add00); __m128 Mul03 = _mm_mul_ps(Vec0, Fac0); __m128 Mul04 = _mm_mul_ps(Vec2, Fac3); __m128 Mul05 = _mm_mul_ps(Vec3, Fac4); __m128 Sub01 = _mm_sub_ps(Mul03, Mul04); __m128 Add01 = _mm_add_ps(Sub01, Mul05); __m128 Inv1 = _mm_mul_ps(SignA, Add01); __m128 Mul06 = _mm_mul_ps(Vec0, Fac1); __m128 Mul07 = _mm_mul_ps(Vec1, Fac3); __m128 Mul08 = _mm_mul_ps(Vec3, Fac5); __m128 Sub02 = _mm_sub_ps(Mul06, Mul07); __m128 Add02 = _mm_add_ps(Sub02, Mul08); __m128 Inv2 = _mm_mul_ps(SignB, Add02); __m128 Mul09 = _mm_mul_ps(Vec0, Fac2); __m128 Mul10 = _mm_mul_ps(Vec1, Fac4); __m128 Mul11 = _mm_mul_ps(Vec2, Fac5); __m128 Sub03 = _mm_sub_ps(Mul09, Mul10); __m128 Add03 = _mm_add_ps(Sub03, Mul11); __m128 Inv3 = _mm_mul_ps(SignA, Add03); __m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0)); __m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0)); __m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0)); // Det0 = dot(C1, Row2) __m128 mul0 = _mm_mul_ps(C1, Row2); __m128 swp0 = _mm_shuffle_ps(mul0, mul0, _MM_SHUFFLE(2, 3, 0, 1)); __m128 add0 = _mm_add_ps(mul0, swp0); __m128 swp1 = _mm_shuffle_ps(add0, add0, _MM_SHUFFLE(0, 1, 2, 3)); __m128 Det0 = _mm_add_ps(add0, swp1); __m128 Rcp0 = _mm_div_ps(VecOne, Det0); mOut.C1 = _mm_mul_ps(Inv0, Rcp0); mOut.C2 = _mm_mul_ps(Inv1, Rcp0); mOut.C3 = _mm_mul_ps(Inv2, Rcp0); mOut.C4 = _mm_mul_ps(Inv3, Rcp0); float retVal; _mm_store_ss(&retVal, Det0); return retVal; }
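// Hedged usage sketch: Inverse() writes the result into mOut and returns the
// determinant, so a caller can reject near-singular matrices up front.
//
//   Matrix4_M128 inv;
//   float det = m.Inverse(inv);
//   if (fabsf(det) > 1e-6f) { /* inv is usable */ }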
double bst_compute_123_m128_unaligned8_maskstore( void*_bst_obj, double* p, double* q, size_t nn ) { segments_t* mem = (segments_t*) _bst_obj; int n, i, r, l_end, j, l_end_pre; double t, e_tmp; double* e = mem->e, *w = mem->w; int* root = mem->r; __m128d v_tmp; __m128d v00, v01, v02, v03; __m128d v10, v11, v12, v13; __m128d v20, v21, v22, v23; __m128d v30, v31, v32, v33; __m128i v_cur_roots; __m128 v_rootmask0, v_rootmask1; // initialization // mem->n = nn; n = nn; // subtractions involving n can go negative, so keep these indices signed int idx1, idx2, idx3; idx1 = IDX(n,n); e[idx1] = q[n]; idx1++; for (i = n-1; i >= 0; --i) { idx1 -= 2*(n-i)+1; idx2 = idx1 + 1; e[idx1] = q[i]; w[idx1] = q[i]; for (j = i+1; j < n+1; ++j,++idx2) { e[idx2] = INFINITY; w[idx2] = w[idx2-1] + p[j-1] + q[j]; } idx3 = idx1; for (r = i; r < n; ++r) { // idx2 = IDX(r+1, r+1); idx1 = idx3; l_end = idx2 + (n-r); // l_end points to the first entry after the current row e_tmp = e[idx1++]; // calculate until a multiple of 8 doubles is left // 8 = 4 * 2 128-bit vectors l_end_pre = idx2 + ((n-r)&7); for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) { t = e_tmp + e[idx2] + w[idx1]; if (t < e[idx1]) { e[idx1] = t; root[idx1] = r; } idx1++; } v_tmp = _mm_set_pd( e_tmp, e_tmp ); // main body: process four 2-double vectors per iteration v_cur_roots = _mm_set_epi32(r, r, r, r); for( ; idx2 < l_end; idx2 += 8 ) { v01 = _mm_loadu_pd( &w[idx1 ] ); v11 = _mm_loadu_pd( &w[idx1+2] ); v21 = _mm_loadu_pd( &w[idx1+4] ); v31 = _mm_loadu_pd( &w[idx1+6] ); v00 = _mm_loadu_pd( &e[idx2 ] ); v01 = _mm_add_pd( v01, v_tmp ); v10 = _mm_loadu_pd( &e[idx2+2] ); v11 = _mm_add_pd( v11, v_tmp ); v20 = _mm_loadu_pd( &e[idx2+4] ); v21 = _mm_add_pd( v21, v_tmp ); v30 = _mm_loadu_pd( &e[idx2+6] ); v31 = _mm_add_pd( v31, v_tmp ); v01 = _mm_add_pd( v01, v00 ); v03 = _mm_loadu_pd( &e[idx1 ] ); v11 = _mm_add_pd( v11, v10 ); v13 = _mm_loadu_pd( &e[idx1+2] ); v21 = _mm_add_pd( v21, v20 ); v23 = _mm_loadu_pd( &e[idx1+4] ); v31 = _mm_add_pd( v31, v30 ); v33 = _mm_loadu_pd( &e[idx1+6] ); v02 = _mm_cmplt_pd( v01, v03 ); v12 = _mm_cmplt_pd( v11, v13 ); v22 = _mm_cmplt_pd( v21, v23 ); v32 = _mm_cmplt_pd( v31, v33 ); /* note: the maskstore intrinsics below are AVX, not SSE2 */ _mm_maskstore_pd( &e[idx1 ], _mm_castpd_si128( v02 ), v01 ); _mm_maskstore_pd( &e[idx1+2], _mm_castpd_si128( v12 ), v11 ); _mm_maskstore_pd( &e[idx1+4], _mm_castpd_si128( v22 ), v21 ); _mm_maskstore_pd( &e[idx1+6], _mm_castpd_si128( v32 ), v31 ); /* compress each 64-bit compare mask to one 32-bit lane, low double first; the second mask must come from v22/v32 so it covers root[idx1+4..idx1+7] */ v_rootmask0 = _mm_shuffle_ps( _mm_castpd_ps( v02 ), _mm_castpd_ps( v12 ), _MM_SHUFFLE(2,0,2,0) ); v_rootmask1 = _mm_shuffle_ps( _mm_castpd_ps( v22 ), _mm_castpd_ps( v32 ), _MM_SHUFFLE(2,0,2,0) ); _mm_maskstore_ps( (float*)&root[idx1], _mm_castps_si128( v_rootmask0 ), _mm_castsi128_ps( v_cur_roots ) ); _mm_maskstore_ps( (float*)&root[idx1+4], _mm_castps_si128( v_rootmask1 ), _mm_castsi128_ps( v_cur_roots ) ); idx1 += 8; } idx3++; } } return e[IDX(0,n)]; }
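/* For reference, each maskstore iteration above performs eight copies of the
 * scalar relaxation step from the prologue loop; a plain-C restatement of one
 * iteration, with the same e/w/root/idx1/idx2/e_tmp/r as above: */
static void relax8_scalar(double *e, const double *w, int *root,
                          int idx1, int idx2, double e_tmp, int r)
{
    for (int s = 0; s < 8; ++s) {
        double t = e_tmp + e[idx2 + s] + w[idx1 + s]; /* candidate cost via root r */
        if (t < e[idx1 + s]) {                        /* keep the minimum */
            e[idx1 + s] = t;
            root[idx1 + s] = r;                       /* remember the argmin */
        }
    }
}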
void mexFunction(int nlhs, mxArray *plhs[], int nrhs, const mxArray *prhs[]) { const float * kf = coeff; float * src = _src; float * dst = _dst; int i = 0, k, nz = length; __m128 d4 = _mm_setzero_ps(); float * S; __m128 s0, s1, s2, s3, t0, t1; __m128 f; for(i = 0; i <= width - 16; i += 16 ) { s0 = d4, s1 = d4, s2 = d4, s3 = d4; for( k = 0; k < nz; k++ ) { f = _mm_load_ss(kf + k); f = _mm_shuffle_ps(f, f, 0); /* broadcast kf[k] to all four lanes */ S = src + i + k; t0 = _mm_loadu_ps(S); t1 = _mm_loadu_ps(S + 4); s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f)); t0 = _mm_loadu_ps(S + 8); t1 = _mm_loadu_ps(S + 12); s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f)); s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f)); } _mm_storeu_ps(dst + i, s0); _mm_storeu_ps(dst + i + 4, s1); _mm_storeu_ps(dst + i + 8, s2); _mm_storeu_ps(dst + i + 12, s3); } for( ; i <= width - 4; i += 4 ) { s0 = d4; for( k = 0; k < nz; k++ ) { f = _mm_load_ss(kf + k); f = _mm_shuffle_ps(f, f, 0); t0 = _mm_loadu_ps(src + k + i); s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f)); } _mm_storeu_ps(dst + i, s0); } for (; i < width; i++) { for( k = 0; k < nz; k++ ) { dst[i] += src[i + k] * kf[k]; } } return; }
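/* The _mm_load_ss + _mm_shuffle_ps(f, f, 0) pair above is the SSE1 idiom for
 * splatting one scalar across a register; it produces the same result as
 * _mm_set1_ps(kf[k]): */
static __m128 broadcast_coeff(const float *kf, int k)
{
    __m128 f = _mm_load_ss(kf + k);   /* f = { kf[k], 0, 0, 0 } */
    return _mm_shuffle_ps(f, f, 0);   /* f = { kf[k], kf[k], kf[k], kf[k] } */
}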
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data; const int ch = piece->colors; if(!isnan(d->cmatrix[0])) { //fprintf(stderr,"Using cmatrix codepath\n"); // convert to rgb using matrix #ifdef _OPENMP #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid) #endif for(int j=0; j<roi_out->height; j++) { float *in = (float*)ivoid + ch*roi_in->width *j; float *out = (float*)ovoid + ch*roi_out->width*j; const __m128 m0 = _mm_set_ps(0.0f,d->cmatrix[6],d->cmatrix[3],d->cmatrix[0]); const __m128 m1 = _mm_set_ps(0.0f,d->cmatrix[7],d->cmatrix[4],d->cmatrix[1]); const __m128 m2 = _mm_set_ps(0.0f,d->cmatrix[8],d->cmatrix[5],d->cmatrix[2]); for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch ) { const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in)); const __m128 t = _mm_add_ps(_mm_mul_ps(m0,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(0,0,0,0))),_mm_add_ps(_mm_mul_ps(m1,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(1,1,1,1))),_mm_mul_ps(m2,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(2,2,2,2))))); _mm_stream_ps(out,t); } } _mm_sfence(); // apply profile #ifdef _OPENMP #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid) #endif for(int j=0; j<roi_out->height; j++) { float *in = (float*)ivoid + ch*roi_in->width *j; float *out = (float*)ovoid + ch*roi_out->width*j; for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch ) { for(int c=0; c<3; c++) /* c: channel index, renamed so it no longer shadows the pixel index i */ if (d->lut[c][0] >= 0.0f) { out[c] = (out[c] < 1.0f) ? lerp_lut(d->lut[c], out[c]) : dt_iop_eval_exp(d->unbounded_coeffs[c], out[c]); } } } } else { float *in = (float*)ivoid; float *out = (float*)ovoid; const int rowsize=roi_out->width * 3; //fprintf(stderr,"Using xform codepath\n"); #ifdef _OPENMP #pragma omp parallel for schedule(static) default(none) shared(out, roi_out, in) #endif for (int k=0; k<roi_out->height; k++) { float Lab[rowsize]; float rgb[rowsize]; const int m=(k*(roi_out->width*ch)); for (int l=0; l<roi_out->width; l++) { int li=3*l,ii=ch*l; Lab[li+0] = in[m+ii+0]; Lab[li+1] = in[m+ii+1]; Lab[li+2] = in[m+ii+2]; } cmsDoTransform (d->xform, Lab, rgb, roi_out->width); for (int l=0; l<roi_out->width; l++) { int oi=ch*l, ri=3*l; out[m+oi+0] = rgb[ri+0]; out[m+oi+1] = rgb[ri+1]; out[m+oi+2] = rgb[ri+2]; } } } if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height); }
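/* Scalar reference for the cmatrix codepath above. m0/m1/m2 hold the columns
 * of a row-major 3x3 matrix, so lane c of the vector result is the dot product
 * of matrix row c with the XYZ triple; layout as implied by the loads: */
static void cmatrix_apply_scalar(const float cmatrix[9], const float xyz[3], float out[3])
{
    for (int c = 0; c < 3; c++)
        out[c] = cmatrix[3*c + 0] * xyz[0]
               + cmatrix[3*c + 1] * xyz[1]
               + cmatrix[3*c + 2] * xyz[2];
}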
static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf, const uint8* u_buf, const uint8* v_buf, uint8* rgb_buf, int width, int source_dx) { __m128i xmm0, xmmY1, xmmY2; __m128 xmmY; uint8 u0, u1, v0, v1, y0, y1; uint32 uv_frac, y_frac, u, v, y; int x = 0; if (source_dx >= 0x20000) { x = 32768; } while(width >= 2) { u0 = u_buf[x >> 17]; u1 = u_buf[(x >> 17) + 1]; v0 = v_buf[x >> 17]; v1 = v_buf[(x >> 17) + 1]; y0 = y_buf[x >> 16]; y1 = y_buf[(x >> 16) + 1]; uv_frac = (x & 0x1fffe); y_frac = (x & 0xffff); u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17; v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17; y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; x += source_dx; xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); y0 = y_buf[x >> 16]; y1 = y_buf[(x >> 16) + 1]; y_frac = (x & 0xffff); y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16; x += source_dx; xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY2 = _mm_adds_epi16(xmmY2, xmm0); xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2), 0x44); xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1); rgb_buf += 8; width -= 2; } if (width) { u = u_buf[x >> 17]; v = v_buf[x >> 17]; y = y_buf[x >> 16]; xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)), _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v))); xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y)); xmmY1 = _mm_adds_epi16(xmmY1, xmm0); xmmY1 = _mm_srai_epi16(xmmY1, 6); xmmY1 = _mm_packus_epi16(xmmY1, xmmY1); *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1); } }
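/* x above is a 16.16 fixed-point source position (chroma uses one extra shift
 * because u/v are subsampled 2x). Since y_frac ^ 0xffff == 0xffff - y_frac for
 * a 16-bit fraction, each product pair is an ordinary linear interpolation;
 * uint8/uint32 are the same typedefs the snippet above uses: */
static inline uint32 lerp_fixed16(uint8 a, uint8 b, uint32 frac /* 0..0xffff */)
{
    return (frac * b + (0xffff - frac) * a) >> 16;
}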
void BrushToolEdit::drawBlur(const QPoint &pt, float amount) { Terrain *tip = tool->tip(pt).data(); // compute affected rectangle QRect dirtyRect(pt, tip->size()); dirtyRect = dirtyRect.intersected(QRect(QPoint(0, 0), terrain->size())); if (!dirtyRect.isValid()) { return; } edit->beginEdit(dirtyRect, terrain); QSize tSize = terrain->size(); QSize tipSize = tip->size(); QSize blurBufferSize(dirtyRect.width() + 6, dirtyRect.height() + 6); TemporaryBuffer<__m128> blurBuffer1(blurBufferSize.width() * blurBufferSize.height(), 16); TemporaryBuffer<__m128> blurBuffer2(blurBufferSize.width() * blurBufferSize.height(), 16); TemporaryBuffer<float> tipBuffer(blurBufferSize.width() * blurBufferSize.height(), 4); for (int y = 0; y < blurBufferSize.height(); ++y) { int cy = y + dirtyRect.top() - 3; cy = std::max(std::min(cy, tSize.height() - 1), 0); for (int x = 0; x < blurBufferSize.width(); ++x) { int cx = x + dirtyRect.left() - 3; cx = std::max(std::min(cx, tSize.width() - 1), 0); quint32 color = terrain->color(cx, cy); auto colorMM = _mm_setr_epi32(color, 0, 0, 0); colorMM = _mm_unpacklo_epi8(colorMM, _mm_setzero_si128()); colorMM = _mm_unpacklo_epi16(colorMM, _mm_setzero_si128()); auto colorF = _mm_cvtepi32_ps(colorMM); _mm_store_ps(reinterpret_cast<float *>(blurBuffer1 + x + y * blurBufferSize.width()), colorF); } } for (int y = 0; y < blurBufferSize.height(); ++y) { int cy = y + dirtyRect.top() - 3; int ty = cy - pt.y(); if (ty >= 0 && ty < tipSize.height()) { for (int x = 0; x < blurBufferSize.width(); ++x) { int cx = x + dirtyRect.left() - 3; int tx = cx - pt.x(); tipBuffer[x + y * blurBufferSize.width()] = tx >= 0 && tx < tipSize.width() ? tip->landform(tx, ty) * amount : 0.f; } } else { std::fill(&tipBuffer[y * blurBufferSize.width()], &tipBuffer[(y + 1) * blurBufferSize.width()], 0.f); } } // apply horizontal blur for (int y = 0; y < blurBufferSize.height(); ++y) { __m128 *inBuf = blurBuffer1 + y * blurBufferSize.width(); __m128 *outBuf = blurBuffer2 + y * blurBufferSize.width(); float *varBuf = tipBuffer + y * blurBufferSize.width(); for (int x = 3; x < blurBufferSize.width() - 3; ++x) { float variance = varBuf[x]; __m128 kernel = globalGaussianKernelTable.fetch(variance); // sample input __m128 p1 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x)); p1 = _mm_add_ps(p1, p1); __m128 p2 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x + 1)); p2 = _mm_add_ps(p2, _mm_load_ps(reinterpret_cast<float *>(inBuf + x - 1))); __m128 p3 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x + 2)); p3 = _mm_add_ps(p3, _mm_load_ps(reinterpret_cast<float *>(inBuf + x - 2))); __m128 p4 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x + 3)); p4 = _mm_add_ps(p4, _mm_load_ps(reinterpret_cast<float *>(inBuf + x - 3))); // apply kernel p1 = _mm_mul_ps(p1, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(0, 0, 0, 0))); p2 = _mm_mul_ps(p2, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(1, 1, 1, 1))); p3 = _mm_mul_ps(p3, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(2, 2, 2, 2))); p4 = _mm_mul_ps(p4, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(3, 3, 3, 3))); p1 = _mm_add_ps(p1, p2); p3 = _mm_add_ps(p3, p4); auto p = _mm_add_ps(p1, p3); // store _mm_store_ps(reinterpret_cast<float *>(outBuf + x), p); } } // apply vertical blur for (int y = 3; y < blurBufferSize.height() - 3; ++y) { __m128 *inBuf = blurBuffer2 + y * blurBufferSize.width(); __m128 *outBuf = blurBuffer1 + y * blurBufferSize.width(); float *varBuf = tipBuffer + y * blurBufferSize.width(); for (int x = 3; x < blurBufferSize.width() - 3; x += 1) { // fetch kernel __m128 
kernel = globalGaussianKernelTable.fetch(varBuf[x]); // load input __m128 p1 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x)); p1 = _mm_add_ps(p1, p1); __m128 p2 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x - blurBufferSize.width())); p2 = _mm_add_ps(p2, _mm_load_ps(reinterpret_cast<float *>(inBuf + x + blurBufferSize.width()))); __m128 p3 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x - blurBufferSize.width() * 2)); p3 = _mm_add_ps(p3, _mm_load_ps(reinterpret_cast<float *>(inBuf + x + blurBufferSize.width() * 2))); __m128 p4 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x - blurBufferSize.width() * 3)); p4 = _mm_add_ps(p4, _mm_load_ps(reinterpret_cast<float *>(inBuf + x + blurBufferSize.width() * 3))); // apply kernel p1 = _mm_mul_ps(p1, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(0, 0, 0, 0))); p2 = _mm_mul_ps(p2, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(1, 1, 1, 1))); p3 = _mm_mul_ps(p3, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(2, 2, 2, 2))); p4 = _mm_mul_ps(p4, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(3, 3, 3, 3))); p1 = _mm_add_ps(p1, p2); p3 = _mm_add_ps(p3, p4); auto p = _mm_add_ps(p1, p3); // store _mm_store_ps(reinterpret_cast<float *>(outBuf + x), p); } } for (int y = 0; y < dirtyRect.height(); ++y) { __m128 *inBuf = blurBuffer1 + (y + 3) * blurBufferSize.width() + 3; for (int x = 0; x < dirtyRect.width(); ++x) { int cx = x + dirtyRect.left(); int cy = y + dirtyRect.top(); auto colorF = _mm_load_ps(reinterpret_cast<float *>(inBuf + x)); colorF = _mm_add_ps(colorF, _mm_set1_ps(0.5f)); colorF = _mm_add_ps(colorF, globalDitherSampler.getM128()); auto colorMM = _mm_cvttps_epi32(colorF); colorMM = _mm_packs_epi32(colorMM, colorMM); colorMM = _mm_packus_epi16(colorMM, colorMM); _mm_store_ss(reinterpret_cast<float*>(&terrain->color(cx, cy)), _mm_castsi128_ps(colorMM)); } } edit->endEdit(terrain); }
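/* Shape of the separable blur passes above, per channel: kernel lane 0 weights
 * the center sample (which the code doubles before multiplying), and lanes
 * 1..3 weight the symmetric tap pairs at distance 1..3. A scalar sketch, with
 * k[] standing in for the fetched kernel: */
static float blur_tap7(const float *in, int x, const float k[4])
{
    return 2.0f * k[0] * in[x]
         + k[1] * (in[x - 1] + in[x + 1])
         + k[2] * (in[x - 2] + in[x + 2])
         + k[3] * (in[x - 3] + in[x + 3]);
}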
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer) { unsigned i; __m128 sum; __m128 sum_l = _mm_setzero_ps(); __m128 sum_r = _mm_setzero_ps(); const float *buffer_l = resamp->buffer_l + resamp->ptr; const float *buffer_r = resamp->buffer_r + resamp->ptr; unsigned taps = resamp->taps; unsigned phase = resamp->time >> SUBPHASE_BITS; #if SINC_COEFF_LERP const float *phase_table = resamp->phase_table + phase * taps * 2; const float *delta_table = phase_table + taps; __m128 delta = _mm_set1_ps((float) (resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD); #else const float *phase_table = resamp->phase_table + phase * taps; #endif for (i = 0; i < taps; i += 4) { __m128 buf_l = _mm_loadu_ps(buffer_l + i); __m128 buf_r = _mm_loadu_ps(buffer_r + i); #if SINC_COEFF_LERP __m128 deltas = _mm_load_ps(delta_table + i); __m128 _sinc = _mm_add_ps(_mm_load_ps(phase_table + i), _mm_mul_ps(deltas, delta)); #else __m128 _sinc = _mm_load_ps(phase_table + i); #endif sum_l = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc)); sum_r = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc)); } /* Them annoying shuffles. * sum_l = { l3, l2, l1, l0 } * sum_r = { r3, r2, r1, r0 } */ sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(1, 0, 1, 0)), _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2))); /* sum = { r1, r0, l1, l0 } + { r3, r2, l3, l2 } * sum = { R1, R0, L1, L0 } */ sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum); /* sum = {R1, R1, L1, L1 } + { R1, R0, L1, L0 } * sum = { X, R, X, L } */ /* Store L */ _mm_store_ss(out_buffer + 0, sum); /* movehl { X, R, X, L } == { X, R, X, R } */ _mm_store_ss(out_buffer + 1, _mm_movehl_ps(sum, sum)); }
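/* Scalar restatement of the tap loop plus the shuffle-based horizontal sums
 * above (non-SINC_COEFF_LERP variant, so without the delta term): one sinc
 * coefficient weights the same tap position in both channels. */
static void process_sinc_scalar(const float *buffer_l, const float *buffer_r,
                                const float *phase_table, unsigned taps,
                                float *out_buffer)
{
    float sum_l = 0.0f, sum_r = 0.0f;
    for (unsigned i = 0; i < taps; i++) {
        sum_l += buffer_l[i] * phase_table[i];
        sum_r += buffer_r[i] * phase_table[i];
    }
    out_buffer[0] = sum_l; /* L */
    out_buffer[1] = sum_r; /* R */
}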
void decomp_gamma2_minus( spinor_array src, halfspinor_array dst) { /* Space for upper components */ __m128 xmm0; __m128 xmm1; __m128 xmm2; /* Space for lower components */ __m128 xmm3; __m128 xmm4; __m128 xmm5; /* Swap upper and lower components */ /* Compiler should spill, or use 64 bit extras */ __m128 xmm6; __m128 xmm7; __m128 xmm8; /* Swap upper and lower components */ /* Compiler should spill, or use 64 bit extras */ __m128 xmm9; __m128 xmm10; __m128 xmm11; xmm0 = _mm_load_ps(&src[0][0][0]); xmm2 = _mm_load_ps(&src[0][2][0]); xmm6 = _mm_load_ps(&src[1][1][0]); xmm3 = _mm_load_ps(&src[2][0][0]); xmm5 = _mm_load_ps(&src[2][2][0]); xmm7 = _mm_load_ps(&src[3][1][0]); xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero xmm4 = _mm_xor_ps(xmm4,xmm4); xmm1 = _mm_movelh_ps(xmm1,xmm6); xmm4 = _mm_movelh_ps(xmm4,xmm7); xmm1 = _mm_movehl_ps(xmm1, xmm0); xmm4 = _mm_movehl_ps(xmm4, xmm3); xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4); xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4); xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4); xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4); #if 0 /* Load up the spinors */ xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]); xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]); xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]); xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]); xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]); xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]); xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]); xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]); xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]); xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]); xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]); xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]); #endif /* Swap the lower components */ xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0xb1); xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0xb1); xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0xb1); xmm9 = _mm_xor_ps(xmm6, signs23.vector); xmm10 = _mm_xor_ps(xmm7, signs23.vector); xmm11 = _mm_xor_ps(xmm8, signs23.vector); /* Add */ xmm0 = _mm_add_ps(xmm0, xmm9); xmm1 = _mm_add_ps(xmm1, xmm10); xmm2 = _mm_add_ps(xmm2, xmm11); /* Store */ _mm_store_ps(&dst[0][0][0],xmm0); _mm_store_ps(&dst[1][0][0],xmm1); _mm_store_ps(&dst[2][0][0],xmm2); }
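/* The raw shuffle immediates above decode as follows (lanes listed low to
 * high): 0xb1 swaps neighbours within each complex pair, and 0xe4 keeps the
 * low half of the first operand plus the high half of the second. The
 * self-xor is the usual register-zeroing idiom; _mm_setzero_ps() expresses
 * the same intent without reading an uninitialized variable. */
static void shuffle_imm_examples(__m128 a, __m128 b, __m128 out[2])
{
    out[0] = _mm_shuffle_ps(a, a, 0xb1); /* { a1, a0, a3, a2 } : re/im swap */
    out[1] = _mm_shuffle_ps(a, b, 0xe4); /* { a0, a1, b2, b3 } : low(a), high(b) */
}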
__m128 test (__m128 s1, __m128 s2) { return _mm_shuffle_ps (s1, s2, MASK); }
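/* MASK is the usual 8-bit selector, typically built with _MM_SHUFFLE(d,c,b,a)
 * = (d<<6)|(c<<4)|(b<<2)|a: result lanes 0..1 come from s1 at indices a and b,
 * lanes 2..3 from s2 at indices c and d. For example: */
static __m128 shuffle_low_high(__m128 s1, __m128 s2)
{
    return _mm_shuffle_ps(s1, s2, _MM_SHUFFLE(3, 2, 1, 0)); /* { s1[0], s1[1], s2[2], s2[3] } */
}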
void decomp_gamma3_minus( spinor_array src, halfspinor_array dst) { /* Space for upper components */ __m128 xmm0; __m128 xmm1; __m128 xmm2; /* Space for lower components */ __m128 xmm3; __m128 xmm4; __m128 xmm5; __m128 xmm6; __m128 xmm7; xmm0 = _mm_load_ps(&src[0][0][0]); xmm2 = _mm_load_ps(&src[0][2][0]); xmm6 = _mm_load_ps(&src[1][1][0]); xmm3 = _mm_load_ps(&src[2][0][0]); xmm5 = _mm_load_ps(&src[2][2][0]); xmm7 = _mm_load_ps(&src[3][1][0]); xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero xmm4 = _mm_xor_ps(xmm4,xmm4); xmm1 = _mm_movelh_ps(xmm1,xmm6); xmm4 = _mm_movelh_ps(xmm4,xmm7); xmm1 = _mm_movehl_ps(xmm1, xmm0); xmm4 = _mm_movehl_ps(xmm4, xmm3); xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4); xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4); xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4); xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4); #if 0 /* Load up the spinors */ xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]); xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]); xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]); xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]); xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]); xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]); xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]); xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]); xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]); xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]); xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]); xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]); #endif /* sub */ xmm0 = _mm_sub_ps(xmm0, xmm3); xmm1 = _mm_sub_ps(xmm1, xmm4); xmm2 = _mm_sub_ps(xmm2, xmm5); /* Store */ _mm_store_ps(&dst[0][0][0],xmm0); _mm_store_ps(&dst[1][0][0],xmm1); _mm_store_ps(&dst[2][0][0],xmm2); }
/** 32 point butterfly (in place, 4 register) */ static void mdct_butterfly_32_sse(FLOAT *x) { static _MM_ALIGN16 const float PFV0[4] = { -AFT_PI3_8, -AFT_PI1_8, -AFT_PI2_8, -AFT_PI2_8 }; static _MM_ALIGN16 const float PFV1[4] = { -AFT_PI1_8, AFT_PI3_8, -AFT_PI2_8, AFT_PI2_8 }; static _MM_ALIGN16 const float PFV2[4] = { -AFT_PI1_8, -AFT_PI3_8, -1.f, 1.f }; static _MM_ALIGN16 const float PFV3[4] = { -AFT_PI3_8, AFT_PI1_8, 0.f, 0.f }; static _MM_ALIGN16 const float PFV4[4] = { AFT_PI3_8, AFT_PI3_8, AFT_PI2_8, AFT_PI2_8 }; static _MM_ALIGN16 const float PFV5[4] = { -AFT_PI1_8, AFT_PI1_8, -AFT_PI2_8, AFT_PI2_8 }; static _MM_ALIGN16 const float PFV6[4] = { AFT_PI1_8, AFT_PI3_8, 1.f, 1.f }; static _MM_ALIGN16 const float PFV7[4] = { -AFT_PI3_8, AFT_PI1_8, 0.f, 0.f }; __m128 XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7; XMM0 = _mm_load_ps(x+16); XMM1 = _mm_load_ps(x+20); XMM2 = _mm_load_ps(x+24); XMM3 = _mm_load_ps(x+28); XMM4 = XMM0; XMM5 = XMM1; XMM6 = XMM2; XMM7 = XMM3; XMM0 = _mm_sub_ps(XMM0, PM128(x )); XMM1 = _mm_sub_ps(XMM1, PM128(x+ 4)); XMM2 = _mm_sub_ps(XMM2, PM128(x+ 8)); XMM3 = _mm_sub_ps(XMM3, PM128(x+12)); XMM4 = _mm_add_ps(XMM4, PM128(x )); XMM5 = _mm_add_ps(XMM5, PM128(x+ 4)); XMM6 = _mm_add_ps(XMM6, PM128(x+ 8)); XMM7 = _mm_add_ps(XMM7, PM128(x+12)); _mm_store_ps(x+16, XMM4); _mm_store_ps(x+20, XMM5); _mm_store_ps(x+24, XMM6); _mm_store_ps(x+28, XMM7); XMM4 = XMM0; XMM5 = XMM1; XMM6 = XMM2; XMM7 = XMM3; XMM0 = _mm_shuffle_ps(XMM0, XMM0, _MM_SHUFFLE(3,3,1,1)); XMM4 = _mm_shuffle_ps(XMM4, XMM4, _MM_SHUFFLE(2,2,0,0)); XMM1 = _mm_shuffle_ps(XMM1, XMM1, _MM_SHUFFLE(2,3,1,1)); XMM5 = _mm_shuffle_ps(XMM5, XMM5, _MM_SHUFFLE(2,3,0,0)); XMM2 = _mm_shuffle_ps(XMM2, XMM2, _MM_SHUFFLE(2,2,1,0)); XMM6 = _mm_shuffle_ps(XMM6, XMM6, _MM_SHUFFLE(3,3,0,1)); XMM3 = _mm_shuffle_ps(XMM3, XMM3, _MM_SHUFFLE(3,2,0,0)); XMM7 = _mm_shuffle_ps(XMM7, XMM7, _MM_SHUFFLE(3,2,1,1)); XMM0 = _mm_mul_ps(XMM0, PM128(PFV0)); XMM4 = _mm_mul_ps(XMM4, PM128(PFV1)); XMM1 = _mm_mul_ps(XMM1, PM128(PFV2)); XMM5 = _mm_mul_ps(XMM5, PM128(PFV3)); XMM2 = _mm_mul_ps(XMM2, PM128(PFV4)); XMM6 = _mm_mul_ps(XMM6, PM128(PFV5)); XMM3 = _mm_mul_ps(XMM3, PM128(PFV6)); XMM7 = _mm_mul_ps(XMM7, PM128(PFV7)); XMM0 = _mm_add_ps(XMM0, XMM4); XMM1 = _mm_add_ps(XMM1, XMM5); XMM2 = _mm_add_ps(XMM2, XMM6); XMM3 = _mm_add_ps(XMM3, XMM7); _mm_store_ps(x , XMM0); _mm_store_ps(x+ 4, XMM1); _mm_store_ps(x+ 8, XMM2); _mm_store_ps(x+12, XMM3); mdct_butterfly_16_sse(x); mdct_butterfly_16_sse(x+16); }
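/* Scalar shape of the first stage above: the upper half of x becomes pairwise
 * sums, while the differences are rotated by the twiddle factors packed into
 * PFV0..PFV7 and written back to the lower half. A sketch of the
 * sum/difference step only (the shuffle-and-multiply twiddle is omitted): */
static void butterfly32_first_stage(FLOAT *x)
{
    for (int i = 0; i < 16; i++) {
        FLOAT d = x[i + 16] - x[i];   /* difference, still needs the twiddle */
        x[i + 16] = x[i + 16] + x[i]; /* sum goes to the upper half */
        x[i] = d;                     /* placeholder: PFV rotation not applied */
    }
}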