Example #1
static int
forward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *opt_sc)
{
  register __m128 mpv, dpv, ipv;   /* previous row values                                       */
  register __m128 sv;		   /* temp storage of 1 curr row value in progress              */
  register __m128 dcv;		   /* delayed storage of D(i,q+1)                               */
  register __m128 xEv;		   /* E state: keeps max for Mk->E as we go                     */
  register __m128 xBv;		   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  __m128   zerov;		   /* splatted 0.0's in a vector                                */
  float    xN, xE, xB, xC, xJ;	   /* special states' scores                                    */
  int i;			   /* counter over sequence positions 1..L                      */
  int q;			   /* counter over quads 0..nq-1                                */
  int j;			   /* counter over DD iterations (4 is full serialization)      */
  int Q       = p7O_NQF(om->M);	   /* segment length: # of vectors                              */
  __m128 *dpc = ox->dpf[0];        /* current row, for use in {MDI}MO(dpp,q) access macro       */
  __m128 *dpp;                     /* previous row, for use in {MDI}MO(dpp,q) access macro      */
  __m128 *rp;			   /* will point at om->rfv[x] for residue x[i]                 */
  __m128 *tp;			   /* will point into (and step thru) om->tfv                   */

  /* Initialization. */
  ox->M  = om->M;
  ox->L  = L;
  ox->has_own_scales = TRUE; 	/* all forward matrices control their own scalefactors */
  zerov  = _mm_setzero_ps();
  for (q = 0; q < Q; q++)
    MMO(dpc,q) = IMO(dpc,q) = DMO(dpc,q) = zerov;
  xE    = ox->xmx[p7X_E] = 0.;
  xN    = ox->xmx[p7X_N] = 1.;
  xJ    = ox->xmx[p7X_J] = 0.;
  xB    = ox->xmx[p7X_B] = om->xf[p7O_N][p7O_MOVE];
  xC    = ox->xmx[p7X_C] = 0.;

  ox->xmx[p7X_SCALE] = 1.0;
  ox->totscale       = 0.0;

#if p7_DEBUGGING
  if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, 0, 9, 5, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=0, width=9, precision=5*/
#endif

  for (i = 1; i <= L; i++)
    {
      dpp   = dpc;                      
      dpc   = ox->dpf[do_full * i];     /* avoid conditional, use do_full as kronecker delta */
      rp    = om->rfv[dsq[i]];
      tp    = om->tfv;
      dcv   = _mm_setzero_ps();
      xEv   = _mm_setzero_ps();
      xBv   = _mm_set1_ps(xB);

      /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12.  Shift zeros on. */
      mpv   = esl_sse_rightshift_ps(MMO(dpp,Q-1), zerov);
      dpv   = esl_sse_rightshift_ps(DMO(dpp,Q-1), zerov);
      ipv   = esl_sse_rightshift_ps(IMO(dpp,Q-1), zerov);
      
      for (q = 0; q < Q; q++)
	{
	  /* Calculate new MMO(i,q); don't store it yet, hold it in sv. */
	  sv   =                _mm_mul_ps(xBv, *tp);  tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(mpv, *tp)); tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(dpv, *tp)); tp++;
	  sv   = _mm_mul_ps(sv, *rp);                  rp++;
	  xEv  = _mm_add_ps(xEv, sv);
	  
	  /* Load {MDI}(i-1,q) into mpv, dpv, ipv;
	   * {MDI}MX(q) is then the current, not the prev row
	   */
	  mpv = MMO(dpp,q);
	  dpv = DMO(dpp,q);
	  ipv = IMO(dpp,q);

	  /* Do the delayed stores of {MD}(i,q) now that memory is usable */
	  MMO(dpc,q) = sv;
	  DMO(dpc,q) = dcv;

	  /* Calculate the next D(i,q+1) partially: M->D only;
           * delay storage, holding it in dcv
	   */
	  dcv   = _mm_mul_ps(sv, *tp); tp++;

	  /* Calculate and store I(i,q); assumes odds ratio for emission is 1.0 */
	  sv         =                _mm_mul_ps(mpv, *tp);  tp++;
	  IMO(dpc,q) = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++;
	}	  

      /* Now the DD paths. We would rather not serialize them but 
       * in an accurate Forward calculation, we have few options.
       */
      /* dcv has carried through from end of q loop above; store it 
       * in first pass, we add M->D and D->D path into DMX
       */
      /* We're almost certainly obligated to do at least one complete 
       * DD path to be sure: 
       */
      dcv        = esl_sse_rightshift_ps(dcv, zerov);
      DMO(dpc,0) = zerov;
      tp         = om->tfv + 7*Q;	/* set tp to start of the DD's */
      for (q = 0; q < Q; q++) 
	{
	  DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q));	
	  dcv        = _mm_mul_ps(DMO(dpc,q), *tp); tp++; /* extend DMO(q), so we include M->D and D->D paths */
	}

      /* now. on small models, it seems best (empirically) to just go
       * ahead and serialize. on large models, we can do a bit better,
       * by testing for when dcv (DD path) accrued to DMO(q) is below
       * machine epsilon for all q, in which case we know DMO(q) are all
       * at their final values. The tradeoff point is (empirically) somewhere around M=100,
       * at least on my desktop. We don't worry about the conditional here;
       * it's outside any inner loops.
       */
      if (om->M < 100)
	{			/* Fully serialized version */
	  for (j = 1; j < 4; j++)
	    {
	      dcv = esl_sse_rightshift_ps(dcv, zerov);
	      tp  = om->tfv + 7*Q;	/* set tp to start of the DD's */
	      for (q = 0; q < Q; q++) 
		{ /* note, extend dcv, not DMO(q); only adding DD paths now */
		  DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q));	
		  dcv        = _mm_mul_ps(dcv, *tp);   tp++; 
		}	    
	    }
	} 
      else
	{			/* Slightly parallelized version, but which incurs some overhead */
	  for (j = 1; j < 4; j++)
	    {
	      register __m128 cv;	/* keeps track of whether any DD's change DMO(q) */

	      dcv = esl_sse_rightshift_ps(dcv, zerov);
	      tp  = om->tfv + 7*Q;	/* set tp to start of the DD's */
	      cv  = zerov;
	      for (q = 0; q < Q; q++) 
		{ /* using cmpgt below tests if DD changed any DMO(q) *without* conditional branch */
		  sv         = _mm_add_ps(dcv, DMO(dpc,q));	
		  cv         = _mm_or_ps(cv, _mm_cmpgt_ps(sv, DMO(dpc,q))); 
		  DMO(dpc,q) = sv;	                                    /* store new DMO(q) */
		  dcv        = _mm_mul_ps(dcv, *tp);   tp++;            /* note, extend dcv, not DMO(q) */
		}	    
	      if (! _mm_movemask_ps(cv)) break; /* DD's didn't change any DMO(q)? Then done, break out. */
	    }
	}

      /* Add D's to xEv */
      for (q = 0; q < Q; q++) xEv = _mm_add_ps(DMO(dpc,q), xEv);

      /* Finally the "special" states, which start from Mk->E (->C, ->J->B) */
      /* The following incantation is a horizontal sum of xEv's elements  */
      /* These must follow DD calculations, because D's contribute to E in Forward
       * (as opposed to Viterbi)
       */
      xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(0, 3, 2, 1)));
      xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(1, 0, 3, 2)));
      _mm_store_ss(&xE, xEv);

      xN =  xN * om->xf[p7O_N][p7O_LOOP];
      xC = (xC * om->xf[p7O_C][p7O_LOOP]) +  (xE * om->xf[p7O_E][p7O_MOVE]);
      xJ = (xJ * om->xf[p7O_J][p7O_LOOP]) +  (xE * om->xf[p7O_E][p7O_LOOP]);
      xB = (xJ * om->xf[p7O_J][p7O_MOVE]) +  (xN * om->xf[p7O_N][p7O_MOVE]);
      /* and now xB will carry over into next i, and xC carries over after i=L */

      /* Sparse rescaling. xE above threshold? trigger a rescaling event.            */
      if (xE > 1.0e4)	/* that's a little less than e^10, ~10% of our dynamic range */
	{
	  xN  = xN / xE;
	  xC  = xC / xE;
	  xJ  = xJ / xE;
	  xB  = xB / xE;
	  xEv = _mm_set1_ps(1.0 / xE);
	  for (q = 0; q < Q; q++)
	    {
	      MMO(dpc,q) = _mm_mul_ps(MMO(dpc,q), xEv);
	      DMO(dpc,q) = _mm_mul_ps(DMO(dpc,q), xEv);
	      IMO(dpc,q) = _mm_mul_ps(IMO(dpc,q), xEv);
	    }
	  ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = xE;
	  ox->totscale += log(xE);
	  xE = 1.0;		
	}
      else ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = 1.0;

      /* Storage of the specials.  We could've stored these already
       * but using xE, etc. variables makes it easy to convert this
       * code to O(M) memory versions just by deleting storage steps.
       */
      ox->xmx[i*p7X_NXCELLS+p7X_E] = xE;
      ox->xmx[i*p7X_NXCELLS+p7X_N] = xN;
      ox->xmx[i*p7X_NXCELLS+p7X_J] = xJ;
      ox->xmx[i*p7X_NXCELLS+p7X_B] = xB;
      ox->xmx[i*p7X_NXCELLS+p7X_C] = xC;

#if p7_DEBUGGING
      if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, i, 9, 5, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=i, width=9, precision=5*/
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T, and flip total score back to log space (nats) */
  /* On overflow, xC is inf or nan (nan arises because inf*0 = nan). */
  /* On an underflow (which shouldn't happen), we counterintuitively return infinity:
   * the effect of this is to force the caller to rescore us with full range.
   */
  if       (isnan(xC))        ESL_EXCEPTION(eslERANGE, "forward score is NaN");
  else if  (L>0 && xC == 0.0) ESL_EXCEPTION(eslERANGE, "forward score underflow (is 0.0)");     /* if L==0, xC *should* be 0.0; J5/118 */
  else if  (isinf(xC) == 1)   ESL_EXCEPTION(eslERANGE, "forward score overflow (is infinity)");

  if (opt_sc != NULL) *opt_sc = ox->totscale + log(xC * om->xf[p7O_C][p7O_MOVE]);
  return eslOK;
}
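
The two shuffle/add steps before _mm_store_ss() above are the stock SSE horizontal-sum idiom. A minimal standalone sketch of the same reduction (the helper name hsum_ps is ours, not HMMER's):

#include <xmmintrin.h>

/* Sum the four lanes of v: rotate-and-add twice, then extract lane 0.
 * After the first add each lane holds a sum of two lanes; after the
 * second, every lane holds the full sum. */
static inline float hsum_ps(__m128 v)
{
  v = _mm_add_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)));
  v = _mm_add_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2)));
  return _mm_cvtss_f32(v);
}
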
Example #2
void BrushToolEdit::drawSmoothen(const QPoint &pt, float amount)
{
    Terrain *tip = tool->tip(pt).data();

    // compute affected rectangle
    QRect dirtyRect(pt, tip->size());
    dirtyRect = dirtyRect.intersected(QRect(QPoint(0, 0), terrain->size()));

    if (!dirtyRect.isValid()) {
        return;
    }

    edit->beginEdit(dirtyRect, terrain);

    QSize tSize = terrain->size();
    QSize tipSize = tip->size();
    QSize blurBufferSize(dirtyRect.width() + 6, dirtyRect.height() + 6);
    TemporaryBuffer<float> blurBuffer1(blurBufferSize.width() * blurBufferSize.height(), 4);
    TemporaryBuffer<float> blurBuffer2(blurBufferSize.width() * blurBufferSize.height(), 4);
    TemporaryBuffer<float> tipBuffer(blurBufferSize.width() * blurBufferSize.height(), 4);

    for (int y = 0; y < blurBufferSize.height(); ++y) {
        int cy = y + dirtyRect.top() - 3;
        cy = std::max(std::min(cy, tSize.height() - 1), 0);
        for (int x = 0; x < blurBufferSize.width(); ++x) {
            int cx = x + dirtyRect.left() - 3;
            cx = std::max(std::min(cx, tSize.width() - 1), 0);
            blurBuffer1[x + y * blurBufferSize.width()] =
                    terrain->landform(cx, cy);
        }
    }
    for (int y = 0; y < blurBufferSize.height(); ++y) {
        int cy = y + dirtyRect.top() - 3;
        int ty = cy - pt.y();
        if (ty >= 0 && ty < tipSize.height()) {
            for (int x = 0; x < blurBufferSize.width(); ++x) {
                int cx = x + dirtyRect.left() - 3;
                int tx = cx - pt.x();
                tipBuffer[x + y * blurBufferSize.width()] =
                        tx >= 0 && tx < tipSize.width() ?
                            tip->landform(tx, ty) * amount :
                            0.f;
            }
        } else {
            std::fill(&tipBuffer[y * blurBufferSize.width()],
                    &tipBuffer[(y + 1) * blurBufferSize.width()], 0.f);
        }
    }

    // apply horizontal blur
    for (int y = 0; y < blurBufferSize.height(); ++y) {
        float *inBuf = blurBuffer1 + y * blurBufferSize.width();
        float *outBuf = blurBuffer2 + y * blurBufferSize.width();
        float *varBuf = tipBuffer + y * blurBufferSize.width();
        for (int x = 3; x < blurBufferSize.width() - 3; ++x) {
            float variance = varBuf[x];
            __m128 kernel = globalGaussianKernelTable.fetch(variance);

            // sample input
            __m128 p1 = _mm_loadu_ps(inBuf + x - 3);
            __m128 p2 = _mm_loadu_ps(inBuf + x);
            p1 = _mm_shuffle_ps(p1, p1, _MM_SHUFFLE(0, 1, 2, 3));
            auto p = _mm_add_ps(p1, p2);

            // apply kernel
            p = _mm_mul_ps(p, kernel);
            p = _mm_hadd_ps(p, p);
            p = _mm_hadd_ps(p, p);

            // write
            _mm_store_ss(outBuf + x, p);
        }
    }

    // apply vertical blur
    for (int y = 3; y < blurBufferSize.height() - 3; ++y) {
        float *inBuf = blurBuffer2 + y * blurBufferSize.width();
        float *outBuf = blurBuffer1 + y * blurBufferSize.width();
        float *varBuf = tipBuffer + y * blurBufferSize.width();
        for (int x = 0; x < blurBufferSize.width() - 3; x += 4) {
            // fetch kernel
            __m128 kernel1 = globalGaussianKernelTable.fetch(varBuf[x]);
            __m128 kernel2 = globalGaussianKernelTable.fetch(varBuf[x + 1]);
            __m128 kernel3 = globalGaussianKernelTable.fetch(varBuf[x + 2]);
            __m128 kernel4 = globalGaussianKernelTable.fetch(varBuf[x + 3]);
            _MM_TRANSPOSE4_PS(kernel1, kernel2, kernel3, kernel4);

            // load input
            __m128 p1 = _mm_loadu_ps(inBuf + x);
            p1 = _mm_add_ps(p1, p1);
            __m128 p2 = _mm_loadu_ps(inBuf + x - blurBufferSize.width());
            p2 = _mm_add_ps(p2, _mm_loadu_ps(inBuf + x + blurBufferSize.width()));
            __m128 p3 = _mm_loadu_ps(inBuf + x - blurBufferSize.width() * 2);
            p3 = _mm_add_ps(p3, _mm_loadu_ps(inBuf + x + blurBufferSize.width() * 2));
            __m128 p4 = _mm_loadu_ps(inBuf + x - blurBufferSize.width() * 3);
            p4 = _mm_add_ps(p4, _mm_loadu_ps(inBuf + x + blurBufferSize.width() * 3));

            // apply kernel
            p1 = _mm_mul_ps(p1, kernel1);
            p2 = _mm_mul_ps(p2, kernel2);
            p3 = _mm_mul_ps(p3, kernel3);
            p4 = _mm_mul_ps(p4, kernel4);
            p1 = _mm_add_ps(p1, p2);
            p3 = _mm_add_ps(p3, p4);
            auto p = _mm_add_ps(p1, p3);

            // store
            _mm_storeu_ps(outBuf + x, p);
        }
    }

    for (int y = 0; y < dirtyRect.height(); ++y) {
        float *inBuf = blurBuffer1 + (y + 3) * blurBufferSize.width() + 3;
        for (int x = 0; x < dirtyRect.width(); ++x) {
            int cx = x + dirtyRect.left();
            int cy = y + dirtyRect.top();
            terrain->landform(cx, cy) = inBuf[x];
        }
    }
    edit->endEdit(terrain);
}
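
The horizontal blur above folds symmetric taps together before applying a 4-lane kernel: the reversed left window plus the right window gives {2*center, pairs at distance 1..3}, which is then weighted and summed by the two hadds. A scalar sketch of one output sample under that reading (the kernel-lane layout returned by globalGaussianKernelTable.fetch() is an assumption, not taken from the source):

/* Scalar sketch (assumption: kernel lane 0 weights the doubled center sample,
 * lanes 1..3 weight the symmetric pairs at distance 1..3). */
static float blur_sample_scalar(const float *in, int x, const float k[4])
{
    float p0 = in[x] + in[x];             /* center counted twice       */
    float p1 = in[x - 1] + in[x + 1];     /* symmetric pair, distance 1 */
    float p2 = in[x - 2] + in[x + 2];     /* symmetric pair, distance 2 */
    float p3 = in[x - 3] + in[x + 3];     /* symmetric pair, distance 3 */
    return p0 * k[0] + p1 * k[1] + p2 * k[2] + p3 * k[3];
}
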
Example #3
void decomp_gamma0_minus( spinor_array src, halfspinor_array dst) 
{

  /* c <-> color, s <-> spin */

  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm6;
  __m128 xmm7;
  __m128 xmm8;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm9;
  __m128 xmm10;
  __m128 xmm11;



  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero 
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);

  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);


  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);

  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

 
  /* Swap the lower components  and multiply by -i*/
  xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0x1b);
  xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0x1b);
  xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0x1b);


  xmm9 = _mm_xor_ps(xmm6, signs24.vector);
  xmm10 = _mm_xor_ps(xmm7, signs24.vector);
  xmm11 = _mm_xor_ps(xmm8, signs24.vector);


  /* Add */
  xmm0 = _mm_add_ps(xmm0, xmm9);
  xmm1 = _mm_add_ps(xmm1, xmm10);
  xmm2 = _mm_add_ps(xmm2, xmm11);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);
  
  
}
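
Example #3 splices 128-bit halves with _mm_movelh_ps/_mm_movehl_ps. A small self-contained check of their lane behavior (not part of the original source):

#include <stdio.h>
#include <xmmintrin.h>

/* _mm_movelh_ps(a,b) = {a0, a1, b0, b1}; _mm_movehl_ps(a,b) = {b2, b3, a2, a3}. */
int main(void)
{
  __m128 a = _mm_setr_ps(0.f, 1.f, 2.f, 3.f);
  __m128 b = _mm_setr_ps(4.f, 5.f, 6.f, 7.f);
  float lo[4], hi[4];
  _mm_storeu_ps(lo, _mm_movelh_ps(a, b));   /* expect 0 1 4 5 */
  _mm_storeu_ps(hi, _mm_movehl_ps(a, b));   /* expect 6 7 2 3 */
  printf("movelh: %g %g %g %g\n", lo[0], lo[1], lo[2], lo[3]);
  printf("movehl: %g %g %g %g\n", hi[0], hi[1], hi[2], hi[3]);
  return 0;
}
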
Example #4
__m128 addsubps(__m128 x, __m128 y){
  __m128 a = _mm_add_ps(x,y);                       /* lane-wise sums  x+y            */
  __m128 b = _mm_sub_ps(x,y);                       /* lane-wise diffs x-y            */
  a=_mm_shuffle_ps(a,b,_MM_SHUFFLE(0,2,1,3));       /* a = {sum3, sum1, diff2, diff0} */
  return _mm_shuffle_ps(a,a,_MM_SHUFFLE(0,2,1,3));  /* -> {diff0, sum1, diff2, sum3}  */
}
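
On SSE3 hardware the same result is available directly as _mm_addsub_ps; a quick check sketch, assuming the fixed addsubps() above is in the same translation unit:

#include <stdio.h>
#include <pmmintrin.h>   /* SSE3, for the _mm_addsub_ps reference */

int main(void)
{
  __m128 x = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
  __m128 y = _mm_setr_ps(10.f, 20.f, 30.f, 40.f);
  float emu[4], ref[4];
  _mm_storeu_ps(emu, addsubps(x, y));       /* SSE1 emulation above */
  _mm_storeu_ps(ref, _mm_addsub_ps(x, y));  /* native SSE3 result   */
  /* both should print: -9 22 -27 44 */
  printf("emu: %g %g %g %g\n", emu[0], emu[1], emu[2], emu[3]);
  printf("ref: %g %g %g %g\n", ref[0], ref[1], ref[2], ref[3]);
  return 0;
}
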
Example #5
File: sort.hpp  Project: francescog/nt2
  struct call<tag::sort_(tag::simd_<tag::type32_, tag::sse_> ),
              tag::cpu_, Dummy> : callable
  {
    template<class Sig> struct result;
    template<class This,class A0>
    struct result<This(A0)>
      : meta::strip<A0>{};//

    NT2_FUNCTOR_CALL(1)
    {
      typedef typename meta::as_real<A0>::type flt;
      A0 a =  {a0};
      A0 b =  {NT2_CAST(A0, _mm_movehl_ps(NT2_CAST(flt, a0), NT2_CAST(flt, a0)))};
      comp(a, b);
      a = NT2_CAST(A0, _mm_movelh_ps(NT2_CAST(flt, a), NT2_CAST(flt, b)));
      b = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, a), NT2_CAST(flt, b), NT2_SH(1, 3, 1, 3)));
      comp(a, b);
      A0 c = {NT2_CAST(A0, _mm_movelh_ps(NT2_CAST(flt, b), NT2_CAST(flt, b)))};
      A0 d = {a};
      comp(c, d);
      a = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, c), NT2_CAST(flt, a), NT2_SH(3, 2, 0, 0)));
      b = NT2_CAST(A0, _mm_movehl_ps(NT2_CAST(flt, b), NT2_CAST(flt, d)));
      b = NT2_CAST(A0, _mm_shuffle_ps(NT2_CAST(flt, a), NT2_CAST(flt, b), NT2_SH(3, 1, 0, 2)));
      return b;
    }
  private :
    template < class T > static inline void comp(T & a,T & b)
    {
      T c =  nt2::min(a, b);
      b = nt2::max(a, b);
      a = c;
    }
  };
Example #6
File: SSE.hpp  Project: Eynx/R3D
 // Shuffle together two vector's components
 template <Byte X, Byte Y, Byte Z, Byte W> inline Vector VFunction Shuffle(const Vector& vectorA, const Vector& vectorB)
 {
     return _mm_shuffle_ps(vectorA, vectorB, _MM_SHUFFLE(W, Z, Y, X));
 }
Example #7
File: dspXMM.cpp  Project: songo/muse
void
x86_sse_find_peaks(float *buf, unsigned nframes, float *min, float *max)
{
    __m128 current_max, current_min, work;

    // Load max and min values into all four slots of the XMM registers
    current_min = _mm_set1_ps(*min);
    current_max = _mm_set1_ps(*max);

    // Work input until "buf" reaches 16 byte alignment
    while ( ((unsigned long)buf) % 16 != 0 && nframes > 0) {

        // Load the next float into the work buffer
        work = _mm_set1_ps(*buf);

        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);

        buf++;
        nframes--;
    }

    // use 64 byte prefetch for quadruple quads
    while (nframes >= 16) {
        __builtin_prefetch(buf+64,0,0);

        work = _mm_load_ps(buf);
        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);
        buf+=4;
        work = _mm_load_ps(buf);
        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);
        buf+=4;
        work = _mm_load_ps(buf);
        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);
        buf+=4;
        work = _mm_load_ps(buf);
        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);
        buf+=4;
        nframes-=16;
    }

    // work through aligned buffers
    while (nframes >= 4) {

        work = _mm_load_ps(buf);

        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);

        buf+=4;
        nframes-=4;
    }

    // work through the rest < 4 samples
    while ( nframes > 0) {

        // Load the next float into the work buffer
        work = _mm_set1_ps(*buf);

        current_min = _mm_min_ps(current_min, work);
        current_max = _mm_max_ps(current_max, work);

        buf++;
        nframes--;
    }

    // Find min & max value in current_max through shuffle tricks

    work = current_min;
    work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
    work = _mm_min_ps (work, current_min);
    current_min = work;
    work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
    work = _mm_min_ps (work, current_min);

    _mm_store_ss(min, work);

    work = current_max;
    work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(2, 3, 0, 1));
    work = _mm_max_ps (work, current_max);
    current_max = work;
    work = _mm_shuffle_ps(work, work, _MM_SHUFFLE(1, 0, 3, 2));
    work = _mm_max_ps (work, current_max);

    _mm_store_ss(max, work);
}
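
A minimal usage sketch for the peak finder (it assumes the function above is linked in as declared; the caller seeds *min/*max, e.g. with +/-FLT_MAX or the first sample):

#include <float.h>
#include <stdio.h>

void x86_sse_find_peaks(float *buf, unsigned nframes, float *min, float *max);

int main(void)
{
  float buf[10] = { 0.1f, -0.7f, 0.3f, 0.9f, -0.2f, 0.5f, -1.2f, 0.8f, 0.0f, 0.4f };
  float lo =  FLT_MAX;   /* running minimum, seeded high */
  float hi = -FLT_MAX;   /* running maximum, seeded low  */
  x86_sse_find_peaks(buf, 10, &lo, &hi);
  printf("min=%g max=%g\n", lo, hi);   /* expect min=-1.2 max=0.9 */
  return 0;
}
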
Example #8
/* merge "s+s" elements and return sorted result in "dest" array
   TODO(d'b): replace magic numbers with macro */
inline void bitonic_merge_kernel16n(float *dest, float *a, uint32_t sa, float *b /* must not be reversed*/, uint32_t sb)
{
	__m128 ma[4];
	__m128 mb[4];
	__m128 lo[4];
	__m128 hi[4];

#define LOAD16(arg) \
	mb[3] = _mm_load_ps(arg); \
	mb[2] = _mm_load_ps(arg + 4); \
	mb[1] = _mm_load_ps(arg + 8); \
	mb[0] = _mm_load_ps(arg + 12); arg+=16

	float *last_a = a + sa;
	float *last_b = b + sb;
	float *last_dest = dest + sa + sb;

	ma[0] = _mm_load_ps(a); a+=4;
	ma[1] = _mm_load_ps(a); a+=4;
	ma[2] = _mm_load_ps(a); a+=4;
	ma[3] = _mm_load_ps(a); a+=4;

	for(; dest < (last_dest - 16); dest += 16)
	{
		/* Load either a or b */
		if(a < last_a)
		{
			if(b < last_b)
			{
				if(*((uint32_t*)a) < *((uint32_t*)b))
				{
					LOAD16(a);
				} else
				{
					LOAD16(b);
				}
			} else
			{
				LOAD16(a);
			}
		} else
		{
			LOAD16(b);
		}

		/* Reverse *b */
		mb[0] = _mm_shuffle_ps(mb[0], mb[0], 0x1b);
		mb[1] = _mm_shuffle_ps(mb[1], mb[1], 0x1b);
		mb[2] = _mm_shuffle_ps(mb[2], mb[2], 0x1b);
		mb[3] = _mm_shuffle_ps(mb[3], mb[3], 0x1b);

		lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[0]), _mm_castps_si128(mb[0])));
		hi[0] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[0]), _mm_castps_si128(mb[0])));
		lo[1] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[1]), _mm_castps_si128(mb[1])));
		hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[1]), _mm_castps_si128(mb[1])));
		lo[2] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[2]), _mm_castps_si128(mb[2])));
		hi[2] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[2]), _mm_castps_si128(mb[2])));
		lo[3] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(ma[3]), _mm_castps_si128(mb[3])));
		hi[3] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(ma[3]), _mm_castps_si128(mb[3])));

		_mm_store_ps(&dest[0], lo[0]);
		_mm_store_ps(&dest[4], lo[1]);
		_mm_store_ps(&dest[8], lo[2]);
		_mm_store_ps(&dest[12], lo[3]);
		_mm_store_ps(&dest[16], hi[2]);
		_mm_store_ps(&dest[20], hi[3]);
		_mm_store_ps(&dest[24], hi[0]);
		_mm_store_ps(&dest[28], hi[1]);

		bitonic_merge_kernel8core(dest, dest + 8);
		bitonic_merge_kernel8core(dest + 16, dest + 24);

		ma[0] = _mm_load_ps(&dest[16]);
		ma[1] = _mm_load_ps(&dest[20]);
		ma[2] = _mm_load_ps(&dest[24]);
		ma[3] = _mm_load_ps(&dest[28]);
	}
}
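
The _mm_min_epu32/_mm_max_epu32 pairs above compare the float bit patterns as unsigned integers; for non-negative IEEE-754 values that ordering matches numeric order, which is presumably why the merge can stay in the integer domain. A scalar sketch of the same trick (assumption: inputs are non-negative and not NaN):

#include <stdint.h>
#include <string.h>

/* Compare two non-negative floats through their bit patterns. For
 * non-negative IEEE-754 values the unsigned-integer order of the bit
 * patterns matches the numeric order, so min/max can be done with
 * integer operations (as _mm_min_epu32/_mm_max_epu32 do above). */
static int float_less_as_u32(float a, float b)
{
    uint32_t ua, ub;
    memcpy(&ua, &a, sizeof ua);   /* bit-exact reinterpretation */
    memcpy(&ub, &b, sizeof ub);
    return ua < ub;
}
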
Example #9
// use MMX/SSE extensions
//
// (a + jb)(c + jd) = (ac - bd) + j(ad + bc)
//
// mm_x  = { x[0].real, x[0].imag, x[1].real, x[1].imag }
// mm_hi = { h[0].real, h[0].real, h[1].real, h[1].real }
// mm_hq = { h[0].imag, h[0].imag, h[1].imag, h[1].imag }
//
// mm_y0 = mm_x * mm_hi
//       = { x[0].real * h[0].real,
//           x[0].imag * h[0].real,
//           x[1].real * h[1].real,
//           x[1].imag * h[1].real };
//
// mm_y1 = mm_x * mm_hq
//       = { x[0].real * h[0].imag,
//           x[0].imag * h[0].imag,
//           x[1].real * h[1].imag,
//           x[1].imag * h[1].imag };
//
void dotprod_cccf_execute_mmx(dotprod_cccf _q,
                              float complex * _x,
                              float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // temporary buffers
    __m128 v;   // input vector
    __m128 hi;  // coefficients vector (real)
    __m128 hq;  // coefficients vector (imag)
    __m128 ci;  // output multiplication (v * hi)
    __m128 cq;  // output multiplication (v * hq)

    // aligned output array
    float w[4] __attribute__((aligned(16))) = {0,0,0,0};

#if HAVE_PMMINTRIN_H
    // SSE3
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps(); // load zeros into sum register
#else
    // no SSE3
    float wi[4] __attribute__((aligned(16)));
    float wq[4] __attribute__((aligned(16)));
#endif

    // t = 4*(floor(_n/4))
    unsigned int t = (n >> 2) << 2;

    //
    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        // {x[0].real, x[0].imag, x[1].real, x[1].imag}
        v = _mm_loadu_ps(&x[i]);

        // load coefficients into register (aligned)
        hi = _mm_load_ps(&_q->hi[i]);
        hq = _mm_load_ps(&_q->hq[i]);

        // compute parallel multiplications
        ci = _mm_mul_ps(v, hi);
        cq = _mm_mul_ps(v, hq);

        // shuffle values
        cq = _mm_shuffle_ps( cq, cq, _MM_SHUFFLE(2,3,0,1) );
        
#if HAVE_PMMINTRIN_H
        // SSE3: combine using addsub_ps()
        s = _mm_addsub_ps( ci, cq );

        // accumulate
        sum = _mm_add_ps(sum, s);
#else
        // no SSE3: combine using slow method
        // FIXME: implement slow method
        // unload values
        _mm_store_ps(wi, ci);
        _mm_store_ps(wq, cq);

        // accumulate
        w[0] += wi[0] - wq[0];
        w[1] += wi[1] + wq[1];
        w[2] += wi[2] - wq[2];
        w[3] += wi[3] + wq[3];
#endif
    }

#if HAVE_PMMINTRIN_H
    // unload packed array
    _mm_store_ps(w, sum);
#endif

    // add in-phase and quadrature components
    w[0] += w[2];   // I
    w[1] += w[3];   // Q

    //float complex total = *((float complex*)w);
    float complex total = w[0] + w[1] * _Complex_I;

    // cleanup
    for (i=t/2; i<_q->n; i++)
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );

    // set return value
    *_y = total;
}
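
For reference, a scalar version of the complex dot product the loop computes, written against the duplicated hi[]/hq[] coefficient layout that the cleanup loop above already relies on (a sketch, not liquid-dsp API):

#include <complex.h>

/* Scalar reference: y = sum_i x[i] * h[i], with h[i] reconstructed from the
 * duplicated coefficient arrays (hi[2i] = creal(h[i]), hq[2i] = cimag(h[i])),
 * matching the cleanup loop above. */
static float complex dotprod_cccf_ref(const float *hi, const float *hq,
                                      const float complex *x, unsigned int n)
{
    float complex total = 0;
    unsigned int i;
    for (i = 0; i < n; i++)
        total += x[i] * (hi[2*i] + hq[2*i]*_Complex_I);
    return total;
}
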
Example #10
/* elements are given in 2 arrays (4 and 4),
   result will be returned in the same arrays with a straight order */
inline void bitonic_sort_kernel4(float *a, float *b)
{
	__m128	ma;
	__m128	mb;
	__m128	map;
	__m128	mbp;
	__m128	lo;
	__m128	hi;

	/* load 8 elements to sse registers */
	ma = _mm_load_ps(a);
	mb = _mm_load_ps(b);

	/* In-Register sort */
	map = _mm_shuffle_ps(ma, mb, _MM_SHUFFLE(2, 0, 2, 0)); /* 0x88: */
	mbp = _mm_shuffle_ps(ma, mb, _MM_SHUFFLE(3, 1, 3, 1)); /* 0xdd: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8: */
	mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 3, 1)); /* 0x8d: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(lo, lo, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8: */
	mbp = _mm_shuffle_ps(hi, hi, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1, 0, 0, 1)); /* 0x41: */
	mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 2, 2, 3)); /* 0xeb: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 2, 1, 0)); /* 0xe4: */
	mbp = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(1, 0, 3, 2)); /* 0x4e: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(3, 1, 2, 0)); /* 0xd8: */
	mbp = _mm_shuffle_ps(lo, hi, _MM_SHUFFLE(2, 0, 3, 1)); /* 0x8d: */

	lo = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));
	hi = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map), _mm_castps_si128(mbp)));

	map = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(2, 0, 2, 0)); /* 0x88: */
	mbp = _mm_shuffle_ps(hi, lo, _MM_SHUFFLE(3, 1, 3, 1)); /* 0xdd: */

	map = _mm_shuffle_ps(map, map, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72: */
	mbp = _mm_shuffle_ps(mbp, mbp, _MM_SHUFFLE(1, 3, 0, 2)); /* 0x72: */

	/* unload sorted elements to memory */
	_mm_store_ps(a, map);
	_mm_store_ps(b, mbp);

	CHECK_RAWS(a, b, 4);
}
Example #11
/* merge 2 sorted arrays (8 elements each) to 1 sorted array
   return result (16 elements) in the same arrays
   TODO(d'b): replace magic numbers with macro */
inline void bitonic_merge_kernel8core(float *a, float *b /* must be reversed*/)
{
	__m128	map[2];
	__m128	mbp[2];
	__m128	lo[2];
	__m128	hi[2];

	map[0] = _mm_load_ps(a);
	mbp[0] = _mm_load_ps(b);

	map[1] = _mm_load_ps(a + 4);
	mbp[1] = _mm_load_ps(b + 4);

	lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(mbp[0])));
	hi[0] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(mbp[0])));

	lo[1] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[1]), _mm_castps_si128(mbp[1])));
	hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[1]), _mm_castps_si128(mbp[1])));

	map[0] = lo[0];
	map[1] = lo[1];
	mbp[0] = hi[0];
	mbp[1] = hi[1];

	/* L1 processing */
	lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	lo[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	hi[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));
	hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));

	map[0] = _mm_shuffle_ps(lo[0], lo[1], 0xe4);
	map[1] = _mm_shuffle_ps(lo[0], lo[1], 0x4e);
	mbp[0] = _mm_shuffle_ps(hi[0], hi[1], 0xe4);
	mbp[1] = _mm_shuffle_ps(hi[0], hi[1], 0x4e);

	/* L2 processing */
	lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	lo[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	hi[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));
	hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));

	map[0] = _mm_shuffle_ps(lo[0], lo[1], 0xd8);
	map[1] = _mm_shuffle_ps(lo[0], lo[1], 0x8d);
	mbp[0] = _mm_shuffle_ps(hi[0], hi[1], 0xd8);
	mbp[1] = _mm_shuffle_ps(hi[0], hi[1], 0x8d);

	/* L3 processing */
	lo[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	lo[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(map[0]), _mm_castps_si128(map[1])));
	hi[0] = _mm_castsi128_ps (_mm_min_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));
	hi[1] = _mm_castsi128_ps (_mm_max_epu32(_mm_castps_si128(mbp[0]), _mm_castps_si128(mbp[1])));

	map[0] = _mm_shuffle_ps(lo[1], lo[0], 0x88);
	map[1] = _mm_shuffle_ps(lo[1], lo[0], 0xdd);
	mbp[0] = _mm_shuffle_ps(hi[1], hi[0], 0x88);
	mbp[1] = _mm_shuffle_ps(hi[1], hi[0], 0xdd);

	map[0] = _mm_shuffle_ps(map[0], map[0], 0x72);
	map[1] = _mm_shuffle_ps(map[1], map[1], 0x72);
	mbp[0] = _mm_shuffle_ps(mbp[0], mbp[0], 0x72);
	mbp[1] = _mm_shuffle_ps(mbp[1], mbp[1], 0x72);

	_mm_store_ps(&a[0], map[0]);
	_mm_store_ps(&a[4], map[1]);
	_mm_store_ps(&b[0], mbp[0]);
	_mm_store_ps(&b[4], mbp[1]);

	CHECK_RAWS(a, b, 8);
}
Example #12
static inline int    sacIsSampleDegenerate(PROSAC_HEST* p){
	unsigned i0 = p->smpl[0], i1 = p->smpl[1], i2 = p->smpl[2], i3 = p->smpl[3];
	
	/**
	 * Pack the matches selected by the SAC algorithm.
	 * Must be packed  points[0:7]  = {srcx0, srcy0, srcx1, srcy1, srcx2, srcy2, srcx3, srcy3}
	 *                 points[8:15] = {dstx0, dsty0, dstx1, dsty1, dstx2, dsty2, dstx3, dsty3}
	 * Gather 4 points into the vector
	 */
	
	__m128 src10 = _mm_loadl_pi(src10, (__m64*)&p->src[i0]);
	src10        = _mm_loadh_pi(src10, (__m64*)&p->src[i1]);
	__m128 src32 = _mm_loadl_pi(src32, (__m64*)&p->src[i2]);
	src32        = _mm_loadh_pi(src32, (__m64*)&p->src[i3]);
	__m128 dst10 = _mm_loadl_pi(dst10, (__m64*)&p->dst[i0]);
	dst10        = _mm_loadh_pi(dst10, (__m64*)&p->dst[i1]);
	__m128 dst32 = _mm_loadl_pi(dst32, (__m64*)&p->dst[i2]);
	dst32        = _mm_loadh_pi(dst32, (__m64*)&p->dst[i3]);
	
	
	/**
	 * If the matches' source points have common x and y coordinates, abort.
	 */
	
	/**
	 * Check:
	 * packedPoints[0].x == packedPoints[2].x
	 * packedPoints[0].y == packedPoints[2].y
	 * packedPoints[1].x == packedPoints[3].x
	 * packedPoints[1].y == packedPoints[3].y
	 */
	
	__m128 chkEq0 = _mm_cmpeq_ps(src10, src32);
	
	/**
	 * Check:
	 * packedPoints[1].x == packedPoints[2].x
	 * packedPoints[1].y == packedPoints[2].y
	 * packedPoints[0].x == packedPoints[3].x
	 * packedPoints[0].y == packedPoints[3].y
	 */
	
	__m128 chkEq1 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src10, _MM_SHUFFLE(1, 0, 3, 2)), src32);
	
	/**
	 * Check:
	 * packedPoints[0].x == packedPoints[1].x
	 * packedPoints[0].y == packedPoints[1].y
	 * packedPoints[2].x == packedPoints[3].x
	 * packedPoints[2].y == packedPoints[3].y
	 */
	
	__m128 chkEq2 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src32, _MM_SHUFFLE(1, 0, 1, 0)),
	                             _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 2, 3, 2)));
	
	/* Verify */
	if(_mm_movemask_ps(_mm_or_ps(chkEq0, _mm_or_ps(chkEq1, chkEq2)))){
		return 1;
	}
	
	/* If the matches do not satisfy the strong geometric constraint, abort. */
	
	/**
	 * p6420x   = (p6.x, p4.x, p2.x, p0.x)
	 * p6420y   = (p6.y, p4.y, p2.y, p0.y)
	 * p7531x   = (p7.x, p5.x, p3.x, p1.x)
	 * p7531y   = (p7.y, p5.y, p3.y, p1.y)
	 * crosssd0 = p6420y - p7531y                     = (cross2d0, cross0d0, cross2s0, cross0s0)
	 * crosssd1 = p7531x - p6420x                     = (cross2d1, cross0d1, cross2s1, cross0s1)
	 * crosssd2 = p6420x * p7531y  -  p6420y * p7531x = (cross2d2, cross0d2, cross2s2, cross0s2)
	 * 
	 * shufcrosssd0 = (cross0d0, cross2d0, cross0s0, cross2s0)
	 * shufcrosssd1 = (cross0d1, cross2d1, cross0s1, cross2s1)
	 * shufcrosssd2 = (cross0d2, cross2d2, cross0s2, cross2s2)
	 * 
	 * dotsd0   = shufcrosssd0 * p6420x +
	 *            shufcrosssd1 * p6420y + 
	 *            shufcrosssd2
	 *          = (dotd0, dotd2, dots0, dots2)
	 * dotsd1   = shufcrosssd0 * p7531x +
	 *            shufcrosssd1 * p7531y + 
	 *            shufcrosssd2
	 *          = (dotd1, dotd3, dots1, dots3)
	 * 
	 * dots     = shufps(dotsd0, dotsd1, _MM_SHUFFLE(1, 0, 1, 0))
	 * dotd     = shufps(dotsd0, dotsd1, _MM_SHUFFLE(3, 2, 3, 2))
	 *            movmaskps(dots ^ dotd)
	 */
	
	__m128 p3210x       = _mm_shuffle_ps(src10,  src32,  _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p3210y       = _mm_shuffle_ps(src10,  src32,  _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p7654x       = _mm_shuffle_ps(dst10,  dst32,  _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p7654y       = _mm_shuffle_ps(dst10,  dst32,  _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p6420x       = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p6420y       = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p7531x       = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p7531y       = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(3, 1, 3, 1));
	
	__m128 crosssd0     = _mm_sub_ps(p6420y, p7531y);
	__m128 crosssd1     = _mm_sub_ps(p7531x, p6420x);
	__m128 crosssd2     = _mm_sub_ps(_mm_mul_ps(p6420x, p7531y), _mm_mul_ps(p6420y, p7531x));
	
	__m128 shufcrosssd0 = _mm_shuffle_ps(crosssd0, crosssd0, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 shufcrosssd1 = _mm_shuffle_ps(crosssd1, crosssd1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 shufcrosssd2 = _mm_shuffle_ps(crosssd2, crosssd2, _MM_SHUFFLE(2, 3, 0, 1));
	
	__m128 dotsd0       = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p6420x),
	                                            _mm_mul_ps(shufcrosssd1, p6420y)),
	                                 shufcrosssd2);
	__m128 dotsd1       = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p7531x),
	                                            _mm_mul_ps(shufcrosssd1, p7531y)),
	                                 shufcrosssd2);
	
	__m128 dots         = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(0, 1, 0, 1));
	__m128 dotd         = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(2, 3, 2, 3));
	
	//if(_mm_movemask_ps(_mm_cmpge_ps(_mm_setzero_ps(), _mm_mul_ps(dots, dotd)))){
	if(_mm_movemask_epi8(_mm_cmplt_epi32(_mm_xor_si128(_mm_cvtps_epi32(dots), _mm_cvtps_epi32(dotd)), _mm_setzero_si128()))){
		return 1;
	}
	
	
	/* Otherwise, proceed with evaluation */
	_mm_store_ps((float*)&p->pkdPts[0], src10);
	_mm_store_ps((float*)&p->pkdPts[2], src32);
	_mm_store_ps((float*)&p->pkdPts[4], dst10);
	_mm_store_ps((float*)&p->pkdPts[6], dst32);
	
	return 0;
}
Example #13
static void rftfsub_128_SSE2(float *a) {
  const float *c = rdft_w + 32;
  int j1, j2, k1, k2;
  float wkr, wki, xr, xi, yr, yi;

  static const ALIGN16_BEG float ALIGN16_END k_half[4] =
    {0.5f, 0.5f, 0.5f, 0.5f};
  const __m128 mm_half = _mm_load_ps(k_half);

  // Vectorized code (four at once).
  //    Note: commented number are indexes for the first iteration of the loop.
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // Load 'wk'.
    const __m128 c_j1 = _mm_loadu_ps(&c[     j1]);         //  1,  2,  3,  4,
    const __m128 c_k1 = _mm_loadu_ps(&c[29 - j1]);         // 28, 29, 30, 31,
    const __m128 wkrt = _mm_sub_ps(mm_half, c_k1);         // 28, 29, 30, 31,
    const __m128 wkr_ =
      _mm_shuffle_ps(wkrt, wkrt, _MM_SHUFFLE(0, 1, 2, 3)); // 31, 30, 29, 28,
    const __m128 wki_ = c_j1;                              //  1,  2,  3,  4,
    // Load and shuffle 'a'.
    const __m128 a_j2_0 = _mm_loadu_ps(&a[0   + j2]);  //   2,   3,   4,   5,
    const __m128 a_j2_4 = _mm_loadu_ps(&a[4   + j2]);  //   6,   7,   8,   9,
    const __m128 a_k2_0 = _mm_loadu_ps(&a[122 - j2]);  // 120, 121, 122, 123,
    const __m128 a_k2_4 = _mm_loadu_ps(&a[126 - j2]);  // 124, 125, 126, 127,
    const __m128 a_j2_p0 = _mm_shuffle_ps(a_j2_0, a_j2_4,
                            _MM_SHUFFLE(2, 0, 2 ,0));  //   2,   4,   6,   8,
    const __m128 a_j2_p1 = _mm_shuffle_ps(a_j2_0, a_j2_4,
                            _MM_SHUFFLE(3, 1, 3 ,1));  //   3,   5,   7,   9,
    const __m128 a_k2_p0 = _mm_shuffle_ps(a_k2_4, a_k2_0,
                            _MM_SHUFFLE(0, 2, 0 ,2));  // 126, 124, 122, 120,
    const __m128 a_k2_p1 = _mm_shuffle_ps(a_k2_4, a_k2_0,
                            _MM_SHUFFLE(1, 3, 1 ,3));  // 127, 125, 123, 121,
    // Calculate 'x'.
    const __m128 xr_ = _mm_sub_ps(a_j2_p0, a_k2_p0);
                                               // 2-126, 4-124, 6-122, 8-120,
    const __m128 xi_ = _mm_add_ps(a_j2_p1, a_k2_p1);
                                               // 3-127, 5-125, 7-123, 9-121,
    // Calculate product into 'y'.
    //    yr = wkr * xr - wki * xi;
    //    yi = wkr * xi + wki * xr;
    const __m128 a_ = _mm_mul_ps(wkr_, xr_);
    const __m128 b_ = _mm_mul_ps(wki_, xi_);
    const __m128 c_ = _mm_mul_ps(wkr_, xi_);
    const __m128 d_ = _mm_mul_ps(wki_, xr_);
    const __m128 yr_ = _mm_sub_ps(a_, b_);     // 2-126, 4-124, 6-122, 8-120,
    const __m128 yi_ = _mm_add_ps(c_, d_);     // 3-127, 5-125, 7-123, 9-121,
    // Update 'a'.
    //    a[j2 + 0] -= yr;
    //    a[j2 + 1] -= yi;
    //    a[k2 + 0] += yr;
    //    a[k2 + 1] -= yi;
    const __m128 a_j2_p0n = _mm_sub_ps(a_j2_p0, yr_);  //   2,   4,   6,   8,
    const __m128 a_j2_p1n = _mm_sub_ps(a_j2_p1, yi_);  //   3,   5,   7,   9,
    const __m128 a_k2_p0n = _mm_add_ps(a_k2_p0, yr_);  // 126, 124, 122, 120,
    const __m128 a_k2_p1n = _mm_sub_ps(a_k2_p1, yi_);  // 127, 125, 123, 121,
    // Shuffle in right order and store.
    const __m128 a_j2_0n = _mm_unpacklo_ps(a_j2_p0n, a_j2_p1n);
                                                       //   2,   3,   4,   5,
    const __m128 a_j2_4n = _mm_unpackhi_ps(a_j2_p0n, a_j2_p1n);
                                                       //   6,   7,   8,   9,
    const __m128 a_k2_0nt = _mm_unpackhi_ps(a_k2_p0n, a_k2_p1n);
                                                       // 122, 123, 120, 121,
    const __m128 a_k2_4nt = _mm_unpacklo_ps(a_k2_p0n, a_k2_p1n);
                                                       // 126, 127, 124, 125,
    const __m128 a_k2_0n = _mm_shuffle_ps(a_k2_0nt, a_k2_0nt,
                            _MM_SHUFFLE(1, 0, 3 ,2));  // 120, 121, 122, 123,
    const __m128 a_k2_4n = _mm_shuffle_ps(a_k2_4nt, a_k2_4nt,
                            _MM_SHUFFLE(1, 0, 3 ,2));  // 124, 125, 126, 127,
    _mm_storeu_ps(&a[0   + j2], a_j2_0n);
    _mm_storeu_ps(&a[4   + j2], a_j2_4n);
    _mm_storeu_ps(&a[122 - j2], a_k2_0n);
    _mm_storeu_ps(&a[126 - j2], a_k2_4n);
  }
  // Scalar code for the remaining items.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    k2 = 128 - j2;
    k1 =  32 - j1;
    wkr = 0.5f - c[k1];
    wki = c[j1];
    xr = a[j2 + 0] - a[k2 + 0];
    xi = a[j2 + 1] + a[k2 + 1];
    yr = wkr * xr - wki * xi;
    yi = wkr * xi + wki * xr;
    a[j2 + 0] -= yr;
    a[j2 + 1] -= yi;
    a[k2 + 0] += yr;
    a[k2 + 1] -= yi;
  }
}
Example #14
static int 
backward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, const P7_OMX *fwd, P7_OMX *bck, float *opt_sc)
{
  register __m128 mpv, ipv, dpv;      /* previous row values                                       */
  register __m128 mcv, dcv;           /* current row values                                        */
  register __m128 tmmv, timv, tdmv;   /* tmp vars for accessing rotated transition scores          */
  register __m128 xBv;		      /* collects B->Mk components of B(i)                         */
  register __m128 xEv;	              /* splatted E(i)                                             */
  __m128   zerov;		      /* splatted 0.0's in a vector                                */
  float    xN, xE, xB, xC, xJ;	      /* special states' scores                                    */
  int      i;			      /* counter over sequence positions 0,1..L                    */
  int      q;			      /* counter over quads 0..Q-1                                 */
  int      Q       = p7O_NQF(om->M);  /* segment length: # of vectors                              */
  int      j;			      /* DD segment iteration counter (4 = full serialization)     */
  __m128  *dpc;                       /* current DP row                                            */
  __m128  *dpp;			      /* next ("previous") DP row                                  */
  __m128  *rp;			      /* will point into om->rfv[x] for residue x[i+1]             */
  __m128  *tp;		              /* will point into (and step thru) om->tfv transition scores */

  /* initialize the L row. */
  bck->M = om->M;
  bck->L = L;
  bck->has_own_scales = FALSE;	/* backwards scale factors are *usually* given by <fwd> */
  dpc    = bck->dpf[L * do_full];
  xJ     = 0.0;
  xB     = 0.0;
  xN     = 0.0;
  xC     = om->xf[p7O_C][p7O_MOVE];      /* C<-T */
  xE     = xC * om->xf[p7O_E][p7O_MOVE]; /* E<-C, no tail */
  xEv    = _mm_set1_ps(xE); 
  zerov  = _mm_setzero_ps();  
  dcv    = zerov;		/* solely to silence a compiler warning */
  for (q = 0; q < Q; q++) MMO(dpc,q) = DMO(dpc,q) = xEv;
  for (q = 0; q < Q; q++) IMO(dpc,q) = zerov;

  /* init row L's DD paths, 1) first segment includes xE, from DMO(q) */
  tp  = om->tfv + 8*Q - 1;	                        /* <*tp> now the [4 8 12 x] TDD quad         */
  dpv = _mm_move_ss(DMO(dpc,Q-1), zerov);               /* start leftshift: [1 5 9 13] -> [x 5 9 13] */
  dpv = _mm_shuffle_ps(dpv, dpv, _MM_SHUFFLE(0,3,2,1)); /* finish leftshift:[x 5 9 13] -> [5 9 13 x] */
  for (q = Q-1; q >= 0; q--)
    {
      dcv        = _mm_mul_ps(dpv, *tp);      tp--;
      DMO(dpc,q) = _mm_add_ps(DMO(dpc,q), dcv);
      dpv        = DMO(dpc,q);
    }
  /* 2) three more passes, only extending DD component (dcv only; no xE contrib from DMO(q)) */
  for (j = 1; j < 4; j++)
    {
      tp  = om->tfv + 8*Q - 1;	                            /* <*tp> now the [4 8 12 x] TDD quad         */
      dcv = _mm_move_ss(dcv, zerov);                        /* start leftshift: [1 5 9 13] -> [x 5 9 13] */
      dcv = _mm_shuffle_ps(dcv, dcv, _MM_SHUFFLE(0,3,2,1)); /* finish leftshift:[x 5 9 13] -> [5 9 13 x] */
      for (q = Q-1; q >= 0; q--)
	{
	  dcv        = _mm_mul_ps(dcv, *tp); tp--;
	  DMO(dpc,q) = _mm_add_ps(DMO(dpc,q), dcv);
	}
    }
  /* now MD init */
  tp  = om->tfv + 7*Q - 3;	                        /* <*tp> now the [4 8 12 x] Mk->Dk+1 quad    */
  dcv = _mm_move_ss(DMO(dpc,0), zerov);                 /* start leftshift: [1 5 9 13] -> [x 5 9 13] */
  dcv = _mm_shuffle_ps(dcv, dcv, _MM_SHUFFLE(0,3,2,1)); /* finish leftshift:[x 5 9 13] -> [5 9 13 x] */
  for (q = Q-1; q >= 0; q--)
    {
      MMO(dpc,q) = _mm_add_ps(MMO(dpc,q), _mm_mul_ps(dcv, *tp)); tp -= 7;
      dcv        = DMO(dpc,q);
    }

  /* Sparse rescaling: same scale factors as fwd matrix */
  if (fwd->xmx[L*p7X_NXCELLS+p7X_SCALE] > 1.0)
    {
      xE  = xE / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
      xN  = xN / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
      xC  = xC / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
      xJ  = xJ / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
      xB  = xB / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
      xEv = _mm_set1_ps(1.0 / fwd->xmx[L*p7X_NXCELLS+p7X_SCALE]);
      for (q = 0; q < Q; q++) {
	MMO(dpc,q) = _mm_mul_ps(MMO(dpc,q), xEv);
	DMO(dpc,q) = _mm_mul_ps(DMO(dpc,q), xEv);
	IMO(dpc,q) = _mm_mul_ps(IMO(dpc,q), xEv);
      }
    }
  bck->xmx[L*p7X_NXCELLS+p7X_SCALE] = fwd->xmx[L*p7X_NXCELLS+p7X_SCALE];
  bck->totscale                     = log(bck->xmx[L*p7X_NXCELLS+p7X_SCALE]);

  /* Stores */
  bck->xmx[L*p7X_NXCELLS+p7X_E] = xE;
  bck->xmx[L*p7X_NXCELLS+p7X_N] = xN;
  bck->xmx[L*p7X_NXCELLS+p7X_J] = xJ;
  bck->xmx[L*p7X_NXCELLS+p7X_B] = xB;
  bck->xmx[L*p7X_NXCELLS+p7X_C] = xC;

#if p7_DEBUGGING
  if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, L, 9, 4, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=L, width=9, precision=4*/
#endif

  /* main recursion */
  for (i = L-1; i >= 1; i--)	/* backwards stride */
    {
      /* phase 1. B(i) collected. Old row destroyed, new row contains
       *    complete I(i,k), partial {MD}(i,k) w/ no {MD}->{DE} paths yet.
       */
      dpc = bck->dpf[i     * do_full];
      dpp = bck->dpf[(i+1) * do_full];
      rp  = om->rfv[dsq[i+1]] + Q-1; /* <*rp> is now the [4 8 12 x] match emission quad */
      tp  = om->tfv + 7*Q - 1;	     /* <*tp> is now the [4 8 12 x] TII transition quad  */

      /* leftshift the first transition quads */
      tmmv = _mm_move_ss(om->tfv[1], zerov); tmmv = _mm_shuffle_ps(tmmv, tmmv, _MM_SHUFFLE(0,3,2,1));
      timv = _mm_move_ss(om->tfv[2], zerov); timv = _mm_shuffle_ps(timv, timv, _MM_SHUFFLE(0,3,2,1));
      tdmv = _mm_move_ss(om->tfv[3], zerov); tdmv = _mm_shuffle_ps(tdmv, tdmv, _MM_SHUFFLE(0,3,2,1));

      mpv = _mm_mul_ps(MMO(dpp,0), om->rfv[dsq[i+1]][0]); /* precalc M(i+1,k+1) * e(M_k+1, x_{i+1}) */
      mpv = _mm_move_ss(mpv, zerov);
      mpv = _mm_shuffle_ps(mpv, mpv, _MM_SHUFFLE(0,3,2,1));

      xBv = zerov;
      for (q = Q-1; q >= 0; q--)     /* backwards stride */
	{
	  ipv = IMO(dpp,q); /* assumes emission odds ratio of 1.0; i+1's IMO(q) now free */
	  IMO(dpc,q) = _mm_add_ps(_mm_mul_ps(ipv, *tp), _mm_mul_ps(mpv, timv));   tp--;
	  DMO(dpc,q) =                                  _mm_mul_ps(mpv, tdmv); 
	  mcv        = _mm_add_ps(_mm_mul_ps(ipv, *tp), _mm_mul_ps(mpv, tmmv));   tp-= 2;
	  
	  mpv        = _mm_mul_ps(MMO(dpp,q), *rp);  rp--;  /* obtain mpv for next q. i+1's MMO(q) is freed  */
	  MMO(dpc,q) = mcv;

	  tdmv = *tp;   tp--;
	  timv = *tp;   tp--;
	  tmmv = *tp;   tp--;

	  xBv = _mm_add_ps(xBv, _mm_mul_ps(mpv, *tp)); tp--;
	}

      /* phase 2: now that we have accumulated the B->Mk transitions in xBv, we can do the specials */
      /* this incantation is a horiz sum of xBv elements: (_mm_hadd_ps() would require SSE3) */
      xBv = _mm_add_ps(xBv, _mm_shuffle_ps(xBv, xBv, _MM_SHUFFLE(0, 3, 2, 1)));
      xBv = _mm_add_ps(xBv, _mm_shuffle_ps(xBv, xBv, _MM_SHUFFLE(1, 0, 3, 2)));
      _mm_store_ss(&xB, xBv);

      xC =  xC * om->xf[p7O_C][p7O_LOOP];
      xJ = (xB * om->xf[p7O_J][p7O_MOVE]) + (xJ * om->xf[p7O_J][p7O_LOOP]); /* must come after xB */
      xN = (xB * om->xf[p7O_N][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_LOOP]); /* must come after xB */
      xE = (xC * om->xf[p7O_E][p7O_MOVE]) + (xJ * om->xf[p7O_E][p7O_LOOP]); /* must come after xJ, xC */
      xEv = _mm_set1_ps(xE);	/* splat */


      /* phase 3: {MD}->E paths and one step of the D->D paths */
      tp  = om->tfv + 8*Q - 1;	/* <*tp> now the [4 8 12 x] TDD quad */
      dpv = _mm_add_ps(DMO(dpc,0), xEv);
      dpv = _mm_move_ss(dpv, zerov);
      dpv = _mm_shuffle_ps(dpv, dpv, _MM_SHUFFLE(0,3,2,1));
      for (q = Q-1; q >= 0; q--)
	{
	  dcv        = _mm_mul_ps(dpv, *tp); tp--;
	  DMO(dpc,q) = _mm_add_ps(DMO(dpc,q), _mm_add_ps(dcv, xEv));
	  dpv        = DMO(dpc,q);
	  MMO(dpc,q) = _mm_add_ps(MMO(dpc,q), xEv);
	}
      
      /* phase 4: finish extending the DD paths */
      /* fully serialized for now */
      for (j = 1; j < 4; j++)	/* three passes: we've already done 1 segment, we need 4 total */
	{
	  dcv = _mm_move_ss(dcv, zerov);
	  dcv = _mm_shuffle_ps(dcv, dcv, _MM_SHUFFLE(0,3,2,1));
	  tp  = om->tfv + 8*Q - 1;	/* <*tp> now the [4 8 12 x] TDD quad */
	  for (q = Q-1; q >= 0; q--)
	    {
	      dcv        = _mm_mul_ps(dcv, *tp); tp--;
	      DMO(dpc,q) = _mm_add_ps(DMO(dpc,q), dcv);
	    }
	}

      /* phase 5: add M->D paths */
      dcv = _mm_move_ss(DMO(dpc,0), zerov);
      dcv = _mm_shuffle_ps(dcv, dcv, _MM_SHUFFLE(0,3,2,1));
      tp  = om->tfv + 7*Q - 3;	/* <*tp> is now the [4 8 12 x] Mk->Dk+1 quad */
      for (q = Q-1; q >= 0; q--)
	{
	  MMO(dpc,q) = _mm_add_ps(MMO(dpc,q), _mm_mul_ps(dcv, *tp)); tp -= 7;
	  dcv        = DMO(dpc,q);
	}

      /* Sparse rescaling  */

      /* In rare cases [J3/119] scale factors from <fwd> are
       * insufficient and backwards will overflow. In this case, we
       * switch on the fly to using our own scale factors, different
       * from those in <fwd>. This will complicate subsequent
       * posterior decoding routines.
       */
      if (xB > 1.0e16) bck->has_own_scales = TRUE;

      if      (bck->has_own_scales)  bck->xmx[i*p7X_NXCELLS+p7X_SCALE] = (xB > 1.0e4) ? xB : 1.0;
      else                           bck->xmx[i*p7X_NXCELLS+p7X_SCALE] = fwd->xmx[i*p7X_NXCELLS+p7X_SCALE];

      if (bck->xmx[i*p7X_NXCELLS+p7X_SCALE] > 1.0)
	{
	  xE /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE];
	  xN /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE];
	  xJ /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE];
	  xB /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE];
	  xC /= bck->xmx[i*p7X_NXCELLS+p7X_SCALE];
	  xBv = _mm_set1_ps(1.0 / bck->xmx[i*p7X_NXCELLS+p7X_SCALE]);
	  for (q = 0; q < Q; q++) {
	    MMO(dpc,q) = _mm_mul_ps(MMO(dpc,q), xBv);
	    DMO(dpc,q) = _mm_mul_ps(DMO(dpc,q), xBv);
	    IMO(dpc,q) = _mm_mul_ps(IMO(dpc,q), xBv);
	  }
	  bck->totscale += log(bck->xmx[i*p7X_NXCELLS+p7X_SCALE]);
	}

      /* Stores are separate only for pedagogical reasons: easy to
       * turn this into a more memory efficient version just by
       * deleting the stores.
       */
      bck->xmx[i*p7X_NXCELLS+p7X_E] = xE;
      bck->xmx[i*p7X_NXCELLS+p7X_N] = xN;
      bck->xmx[i*p7X_NXCELLS+p7X_J] = xJ;
      bck->xmx[i*p7X_NXCELLS+p7X_B] = xB;
      bck->xmx[i*p7X_NXCELLS+p7X_C] = xC;

#if p7_DEBUGGING
      if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, i, 9, 4, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=i, width=9, precision=4*/
#endif
    } /* thus ends the loop over sequence positions i */

  /* Termination at i=0, where we can only reach N,B states. */
  dpp = bck->dpf[1 * do_full];
  tp  = om->tfv;          /* <*tp> is now the [1 5 9 13] TBMk transition quad  */
  rp  = om->rfv[dsq[1]];  /* <*rp> is now the [1 5 9 13] match emission quad   */
  xBv = zerov;
  for (q = 0; q < Q; q++)
    {
      mpv = _mm_mul_ps(MMO(dpp,q), *rp);  rp++;
      mpv = _mm_mul_ps(mpv,        *tp);  tp += 7;
      xBv = _mm_add_ps(xBv,        mpv);
    }
  /* horizontal sum of xBv */
  xBv = _mm_add_ps(xBv, _mm_shuffle_ps(xBv, xBv, _MM_SHUFFLE(0, 3, 2, 1)));
  xBv = _mm_add_ps(xBv, _mm_shuffle_ps(xBv, xBv, _MM_SHUFFLE(1, 0, 3, 2)));
  _mm_store_ss(&xB, xBv);
 
  xN = (xB * om->xf[p7O_N][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_LOOP]);  

  bck->xmx[p7X_B]     = xB;
  bck->xmx[p7X_C]     = 0.0;
  bck->xmx[p7X_J]     = 0.0;
  bck->xmx[p7X_N]     = xN;
  bck->xmx[p7X_E]     = 0.0;
  bck->xmx[p7X_SCALE] = 1.0;

#if p7_DEBUGGING
  dpc = bck->dpf[0];
  for (q = 0; q < Q; q++) /* Not strictly necessary, but if someone's looking at DP matrices, this is nice to do: */
    MMO(dpc,q) = DMO(dpc,q) = IMO(dpc,q) = zerov;
  if (bck->debugging) p7_omx_DumpFBRow(bck, TRUE, 0, 9, 4, bck->xmx[p7X_E], bck->xmx[p7X_N],  bck->xmx[p7X_J], bck->xmx[p7X_B],  bck->xmx[p7X_C]);	/* logify=TRUE, <rowi>=0, width=9, precision=4*/
#endif

  if       (isnan(xN))        ESL_EXCEPTION(eslERANGE, "backward score is NaN");
  else if  (L>0 && xN == 0.0) ESL_EXCEPTION(eslERANGE, "backward score underflow (is 0.0)");    /* if L==0, xN *should* be 0.0 [J5/118]*/
  else if  (isinf(xN) == 1)   ESL_EXCEPTION(eslERANGE, "backward score overflow (is infinity)");

  if (opt_sc != NULL) *opt_sc = bck->totscale + log(xN);
  return eslOK;
}
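
The _mm_move_ss()/_mm_shuffle_ps() pairs used throughout backward_engine() implement a one-lane left shift across the striped vectors. A standalone sketch of that idiom (the helper name is ours, not HMMER's):

#include <xmmintrin.h>

/* Shift the four lanes of v down by one: {v0,v1,v2,v3} -> {v1,v2,v3,0}.
 * _mm_move_ss() drops a zero into lane 0, then the shuffle rotates it to
 * the top lane, exactly as in the backward_engine() initialization above. */
static inline __m128 leftshift_ps(__m128 v, __m128 zerov)
{
  v = _mm_move_ss(v, zerov);                            /* {0, v1, v2, v3} */
  return _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1)); /* {v1, v2, v3, 0} */
}
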
Example #15
mlib_status
mlib_ImageColorConvert2_F32(
    const mlib_f32 *src,
    mlib_s32 slb,
    mlib_f32 *dst,
    mlib_s32 dlb,
    mlib_s32 xsize,
    mlib_s32 ysize,
    const mlib_d64 *fmat,
    const mlib_d64 *offset)
{
	/* pointers for pixel and line of source */
	mlib_f32 *sa, *sl;

	/* pointers for pixel and line of destination */
	mlib_f32 *da, *dl;

	/* indices */
	mlib_s32 i, j;

	/* intermediate */
	__m128 p0, p1, p2, t0, t1, t2, s0, s1, q;

	/* packed kernel */
	__m128 k0, k1, k2;

	/* packed offset */
	__m128 off;

	/* load transposed kernel */
	k0 = _mm_set_ps(0.0f,
			(mlib_f32)fmat[6],
			(mlib_f32)fmat[3],
			(mlib_f32)fmat[0]);
	k1 = _mm_set_ps(0.0f,
			(mlib_f32)fmat[7],
			(mlib_f32)fmat[4],
			(mlib_f32)fmat[1]);
	k2 = _mm_set_ps(0.0f,
			(mlib_f32)fmat[8],
			(mlib_f32)fmat[5],
			(mlib_f32)fmat[2]);

	/* load offset */
	off = _mm_set_ps(0.0f,
			(mlib_f32)offset[2],
			(mlib_f32)offset[1],
			(mlib_f32)offset[0]);

	sa = sl = (mlib_f32 *)src;
	da = dl = dst;

	for (j = 0; j < ysize; j++) {

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
		for (i = 0; i < (xsize - 1); i ++) {
			p0 = _mm_load1_ps(sa);
			sa ++;
			p1 = _mm_load1_ps(sa);
			sa ++;
			p2 = _mm_load1_ps(sa);
			sa ++;

			t0 = _mm_mul_ps(p0, k0);
			t1 = _mm_mul_ps(p1, k1);
			t2 = _mm_mul_ps(p2, k2);

			s0 = _mm_add_ps(t0, t1);
			s1 = _mm_add_ps(t2, off);
			q = _mm_add_ps(s0, s1);

			_mm_storeu_ps(da, q);
			da += 3;
		}

		/*
		 * process the last pixel of each row separately
		 * to avoid out of bound write
		 */
		p0 = _mm_load1_ps(sa);
		sa ++;
		p1 = _mm_load1_ps(sa);
		sa ++;
		p2 = _mm_load1_ps(sa);
		sa ++;

		t0 = _mm_mul_ps(p0, k0);
		t1 = _mm_mul_ps(p1, k1);
		t2 = _mm_mul_ps(p2, k2);

		s0 = _mm_add_ps(t0, t1);
		s1 = _mm_add_ps(t2, off);
		q = _mm_add_ps(s0, s1);

		_mm_storel_pi((__m64 *)da, q);
		da += 2;
		q = _mm_shuffle_ps(q, q, 0xaa);
		_mm_store_ss(da, q);

		/* set src pointer to next row */
		sa = sl = sl + slb;
		/* set dst pointer to next row */
		da = dl = dl + dlb;
	}

	return (MLIB_SUCCESS);
}
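
Per pixel, the loop above is a 3x3 matrix transform plus offset, with the matrix columns packed into k0..k2. A scalar sketch of the same pixel math (illustration only, not mediaLib API):

/* Scalar reference: dst = fmat * src + offset for one 3-channel pixel,
 * with fmat stored row-major as in the mlib call above. */
static void color_convert_pixel(const float src[3], float dst[3],
                                const double fmat[9], const double off[3])
{
    int r;
    for (r = 0; r < 3; r++)
        dst[r] = (float)(fmat[3*r + 0] * src[0] +
                         fmat[3*r + 1] * src[1] +
                         fmat[3*r + 2] * src[2] + off[r]);
}
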
Example #16
// use MMX/SSE extensions
void dotprod_cccf_execute_mmx4(dotprod_cccf _q,
                               float complex * _x,
                               float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // first cut: ...
    __m128 v0,  v1,  v2,  v3;   // input vectors
    __m128 hi0, hi1, hi2, hi3;  // coefficients vectors (real)
    __m128 hq0, hq1, hq2, hq3;  // coefficients vectors (imag)
    __m128 ci0, ci1, ci2, ci3;  // output multiplications (v * hi)
    __m128 cq0, cq1, cq2, cq3;  // output multiplications (v * hq)

    // load zeros into sum registers
    __m128 sumi = _mm_setzero_ps();
    __m128 sumq = _mm_setzero_ps();

    // r = 4*floor(n/16)
    unsigned int r = (n >> 4) << 2;

    //
    unsigned int i;
    for (i=0; i<r; i+=4) {
        // load inputs into register (unaligned)
        v0 = _mm_loadu_ps(&x[4*i+0]);
        v1 = _mm_loadu_ps(&x[4*i+4]);
        v2 = _mm_loadu_ps(&x[4*i+8]);
        v3 = _mm_loadu_ps(&x[4*i+12]);

        // load real coefficients into registers (aligned)
        hi0 = _mm_load_ps(&_q->hi[4*i+0]);
        hi1 = _mm_load_ps(&_q->hi[4*i+4]);
        hi2 = _mm_load_ps(&_q->hi[4*i+8]);
        hi3 = _mm_load_ps(&_q->hi[4*i+12]);

        // load real coefficients into registers (aligned)
        hq0 = _mm_load_ps(&_q->hq[4*i+0]);
        hq1 = _mm_load_ps(&_q->hq[4*i+4]);
        hq2 = _mm_load_ps(&_q->hq[4*i+8]);
        hq3 = _mm_load_ps(&_q->hq[4*i+12]);
        
        // compute parallel multiplications (real)
        ci0 = _mm_mul_ps(v0, hi0);
        ci1 = _mm_mul_ps(v1, hi1);
        ci2 = _mm_mul_ps(v2, hi2);
        ci3 = _mm_mul_ps(v3, hi3);

        // compute parallel multiplications (imag)
        cq0 = _mm_mul_ps(v0, hq0);
        cq1 = _mm_mul_ps(v1, hq1);
        cq2 = _mm_mul_ps(v2, hq2);
        cq3 = _mm_mul_ps(v3, hq3);

        // accumulate
        sumi = _mm_add_ps(sumi, ci0);   sumq = _mm_add_ps(sumq, cq0);
        sumi = _mm_add_ps(sumi, ci1);   sumq = _mm_add_ps(sumq, cq1);
        sumi = _mm_add_ps(sumi, ci2);   sumq = _mm_add_ps(sumq, cq2);
        sumi = _mm_add_ps(sumi, ci3);   sumq = _mm_add_ps(sumq, cq3);
    }

    // shuffle values
    sumq = _mm_shuffle_ps( sumq, sumq, _MM_SHUFFLE(2,3,0,1) );

    // unload
    float wi[4] __attribute__((aligned(16)));
    float wq[4] __attribute__((aligned(16)));
    _mm_store_ps(wi, sumi);
    _mm_store_ps(wq, sumq);

    // fold down (add/sub)
    float complex total = 
        ((wi[0] - wq[0]) + (wi[2] - wq[2])) +
        ((wi[1] + wq[1]) + (wi[3] + wq[3])) * _Complex_I;

    // cleanup (note: n _must_ be even)
    // TODO : clean this method up
    for (i=2*r; i<_q->n; i++) {
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );
    }

    // set return value
    *_y = total;
}
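A minimal scalar reference for the kernel above (a sketch; the helper name is hypothetical, and it assumes the hi[]/hq[] layout in which each coefficient's real and imaginary parts are stored duplicated, as the cleanup loop already implies):

static float complex dotprod_cccf_execute_ref(dotprod_cccf _q,
                                              float complex * _x)
{
    // accumulate the complex multiply-accumulate one sample at a time
    float complex total = 0.0f;
    unsigned int i;
    for (i=0; i<_q->n; i++)
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );
    return total;
}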
Example #17
0
File: SSE.hpp Project: Eynx/R3D
 // Permute a vector's values across its components
 template <Byte X, Byte Y, Byte Z, Byte W> inline Vector VFunction Permute(const Vector& vector)
 {
     return _mm_shuffle_ps(vector, vector, _MM_SHUFFLE(W, Z, Y, X));
 }
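A hedged usage sketch (the variable v is hypothetical; Vector is assumed to wrap __m128, and the template arguments select source components in X,Y,Z,W order):

 Vector splatX   = Permute<0, 0, 0, 0>(v); // broadcast component 0 to all lanes
 Vector reversed = Permute<3, 2, 1, 0>(v); // reverse the component order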
Example #18
0
 /*!
  * \brief Perform a horizontal sum of the given vector.
  * \param in The input vector type
  * \return the horizontal sum of the vector
  */
 ETL_STATIC_INLINE(float) hadd(avx_simd_float in) {
     const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(in.value, 1), _mm256_castps256_ps128(in.value));
     const __m128 x64  = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
     const __m128 x32  = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
     return _mm_cvtss_f32(x32);
 }
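For comparison, a hedged SSE-only sketch of the same reduction applied to a bare __m128 (not part of ETL; shown only to illustrate the movehl/shuffle pattern):

 ETL_STATIC_INLINE(float) hadd_sse_sketch(__m128 in) {
     // add the high pair onto the low pair, then fold the remaining two lanes
     const __m128 x64 = _mm_add_ps(in, _mm_movehl_ps(in, in));
     const __m128 x32 = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
     return _mm_cvtss_f32(x32);
 }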
Example #19
0
/*
============
idSIMD_SSE2::CmpLT

  dst[i] |= ( src0[i] < constant ) << bitNum;
============
*/
void VPCALL idSIMD_SSE2::CmpLT( byte *dst, const byte bitNum, const float *src0, const float constant, const int count ) {
	int i, cnt, pre, post;
	float *aligned;
	__m128 xmm0, xmm1;
	__m128i xmm0i;
	int cnt_l;
	char *src0_p;
	char *constant_p;
	char *dst_p;
	int mask_l;
	int dst_l;
	
	/* if the float array is not aligned on a 4 byte boundary */
	if ( ((int) src0) & 3 ) {
		/* unaligned memory access */
		pre = 0;
		cnt = count >> 2;
		post = count - (cnt<<2);

	/*
		__asm	mov			edx, cnt													
		__asm	test		edx, edx													
		__asm	je			doneCmp														
	*/
		cnt_l = cnt;
		if(cnt_l != 0) {
	/*
		__asm	push		ebx															
		__asm	neg			edx															
		__asm	mov			esi, src0													
		__asm	prefetchnta	[esi+64]													
		__asm	movss		xmm1, constant												
		__asm	shufps		xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 )						
		__asm	mov			edi, dst													
		__asm	mov			cl, bitNum	
	*/
			cnt_l = -cnt_l;
			src0_p = (char *) src0;
			_mm_prefetch(src0_p+64, _MM_HINT_NTA);
			constant_p = (char *) &constant;
			xmm1 = _mm_load_ss((float *)constant_p);
			xmm1 = _mm_shuffle_ps(xmm1, xmm1, R_SHUFFLEPS( 0, 0, 0, 0 ));
			dst_p = (char *)dst;
	/*
			__asm loopNA:																	
	*/
			do {
	/*
		__asm	movups		xmm0, [esi]													
		__asm	prefetchnta	[esi+128]													
		__asm	cmpltps		xmm0, xmm1													
	__asm	movmskps	eax, xmm0
		__asm	mov			ah, al														
		__asm	shr			ah, 1														
		__asm	mov			bx, ax														
		__asm	shl			ebx, 14														
		__asm	mov			bx, ax														
		__asm	and			ebx, 0x01010101												
		__asm	shl			ebx, cl														
		__asm	or			ebx, dword ptr [edi]										
		__asm	mov			dword ptr [edi], ebx										
		__asm	add			esi, 16														
		__asm	add			edi, 4														
		__asm	inc			edx															
		__asm	jl			loopNA														
		__asm	pop			ebx		
	*/
				xmm0 = _mm_loadu_ps((float *) src0_p);
				_mm_prefetch(src0_p+128, _MM_HINT_NTA);
				xmm0 = _mm_cmplt_ps(xmm0, xmm1);
				// Simplify using SSE2
				xmm0i = (__m128i) xmm0;
				xmm0i = _mm_packs_epi32(xmm0i, xmm0i);
				xmm0i = _mm_packs_epi16(xmm0i, xmm0i);
				mask_l = _mm_cvtsi128_si32(xmm0i);
				// End 
				mask_l = mask_l &  0x01010101;
				mask_l = mask_l << bitNum;
				dst_l  = *((int *) dst_p);
				mask_l = mask_l | dst_l;
				*((int *) dst_p) = mask_l;
				src0_p = src0_p + 16;
				dst_p = dst_p + 4;
				cnt_l = cnt_l + 1;
			} while (cnt_l < 0);
		}													
	}																					
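The intrinsic block above replaces the commented-out assembly; the whole routine is equivalent to this scalar loop (a sketch that simply restates the doc comment):

	for ( i = 0; i < count; i++ ) {
		dst[i] |= ( src0[i] < constant ) << bitNum;
	}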
Example #20
0
	float Matrix4_M128::Inverse(Matrix4_M128 &mOut) const
	{
		__m128 Fac0;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(3, 3, 3, 3));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(2, 2, 2, 2));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(2, 2, 2, 2));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(3, 3, 3, 3));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac0 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 Fac1;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(3, 3, 3, 3));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(1, 1, 1, 1));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(1, 1, 1, 1));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(3, 3, 3, 3));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac1 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 Fac2;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(2, 2, 2, 2));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(1, 1, 1, 1));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(1, 1, 1, 1));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(2, 2, 2, 2));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac2 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 Fac3;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(3, 3, 3, 3));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(0, 0, 0, 0));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(0, 0, 0, 0));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(3, 3, 3, 3));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac3 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 Fac4;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(2, 2, 2, 2));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(0, 0, 0, 0));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(0, 0, 0, 0));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(2, 2, 2, 2));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac4 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 Fac5;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(1, 1, 1, 1));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(0, 0, 0, 0));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(0, 0, 0, 0));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(1, 1, 1, 1));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac5 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
		__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

		__m128 Temp0 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

		__m128 Temp1 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

		__m128 Temp2 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

		__m128 Temp3 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

		__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
		__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
		__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
		__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
		__m128 Add00 = _mm_add_ps(Sub00, Mul02);
		__m128 Inv0 = _mm_mul_ps(SignB, Add00);

		__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
		__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
		__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
		__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
		__m128 Add01 = _mm_add_ps(Sub01, Mul05);
		__m128 Inv1 = _mm_mul_ps(SignA, Add01);

		__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
		__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
		__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
		__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
		__m128 Add02 = _mm_add_ps(Sub02, Mul08);
		__m128 Inv2 = _mm_mul_ps(SignB, Add02);

		__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
		__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
		__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
		__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
		__m128 Add03 = _mm_add_ps(Sub03, Mul11);
		__m128 Inv3 = _mm_mul_ps(SignA, Add03);

		__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

		// Det0 = dot(C1, Row2)
		__m128 mul0 = _mm_mul_ps(C1, Row2);
		__m128 swp0 = _mm_shuffle_ps(mul0, mul0, _MM_SHUFFLE(2, 3, 0, 1));
		__m128 add0 = _mm_add_ps(mul0, swp0);
		__m128 swp1 = _mm_shuffle_ps(add0, add0, _MM_SHUFFLE(0, 1, 2, 3));
		__m128 Det0 = _mm_add_ps(add0, swp1);

		__m128 Rcp0 = _mm_div_ps(VecOne, Det0);

		mOut.C1 = _mm_mul_ps(Inv0, Rcp0);
		mOut.C2 = _mm_mul_ps(Inv1, Rcp0);
		mOut.C3 = _mm_mul_ps(Inv2, Rcp0);
		mOut.C4 = _mm_mul_ps(Inv3, Rcp0);

		float retVal;
		_mm_store_ss(&retVal, Det0);
		return retVal;
	}
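A hedged usage sketch (worldMatrix is a hypothetical Matrix4_M128; the return value is the determinant computed in Det0 above):

	Matrix4_M128 inv;
	float det = worldMatrix.Inverse(inv);
	if (det != 0.0f) {
		/* inv.C1..C4 now hold the inverse columns, already scaled by 1/det */
	}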
Example #21
double bst_compute_123_m128_unaligned8_maskstore( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, j, l_end_pre;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m128d v_tmp;
    __m128d v00, v01, v02, v03;
    __m128d v10, v11, v12, v13;
    __m128d v20, v21, v22, v23;
    __m128d v30, v31, v32, v33;
    __m128i v_cur_roots;
    __m128 v_rootmask0, v_rootmask1;
    // initialization
    // mem->n = nn;
    n = nn; // n is used in subtractions that can go negative, so keep it signed

    int idx1, idx2, idx3;
    
    idx1 = IDX(n,n);
    e[idx1] = q[n];
    idx1++;
    for (i = n-1; i >= 0; --i) {
        idx1 -= 2*(n-i)+1;
        idx2 = idx1 + 1;
        e[idx1] = q[i];
        w[idx1] = q[i];
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        }
        idx3 = idx1; 
        for (r = i; r < n; ++r) {
            // idx2 = IDX(r+1, r+1);
            idx1 = idx3;
            l_end = idx2 + (n-r);
            // l_end points to the first entry after the current row
            e_tmp = e[idx1++];
            // calculate until a multiple of 8 doubles is left
            // 8 = 4 * 2 128-bit vectors
            l_end_pre = idx2 + ((n-r)&7);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1] = r;
                }
                idx1++;
            }
            
            v_tmp = _mm_set_pd( e_tmp, e_tmp );
            // process 4 vectors of 2 doubles at a time
            v_cur_roots = _mm_set_epi32(r, r, r, r);
            for( ; idx2 < l_end; idx2 += 8 ) {
                v01 = _mm_loadu_pd( &w[idx1  ] );
                v11 = _mm_loadu_pd( &w[idx1+2] );
                v21 = _mm_loadu_pd( &w[idx1+4] );
                v31 = _mm_loadu_pd( &w[idx1+6] );

                v00 = _mm_loadu_pd( &e[idx2  ] );
                v01 = _mm_add_pd( v01, v_tmp ); 
                v10 = _mm_loadu_pd( &e[idx2+2] );
                v11 = _mm_add_pd( v11, v_tmp );
                v20 = _mm_loadu_pd( &e[idx2+4] );
                v21 = _mm_add_pd( v21, v_tmp );
                v30 = _mm_loadu_pd( &e[idx2+6] );
                v31 = _mm_add_pd( v31, v_tmp );

                v01 = _mm_add_pd( v01, v00 );
                v03 = _mm_loadu_pd( &e[idx1  ] );
                v11 = _mm_add_pd( v11, v10 );
                v13 = _mm_loadu_pd( &e[idx1+2] );
                v21 = _mm_add_pd( v21, v20 );
                v23 = _mm_loadu_pd( &e[idx1+4] );
                v31 = _mm_add_pd( v31, v30 );
                v33 = _mm_loadu_pd( &e[idx1+6] );

                v02 = _mm_cmplt_pd( v01, v03 );
                v12 = _mm_cmplt_pd( v11, v13 );
                v22 = _mm_cmplt_pd( v21, v23 );
                v32 = _mm_cmplt_pd( v31, v33 );

                _mm_maskstore_pd( &e[idx1  ],
                        _mm_castpd_si128( v02 ), v01 );
                _mm_maskstore_pd( &e[idx1+2],
                        _mm_castpd_si128( v12 ), v11 );
                _mm_maskstore_pd( &e[idx1+4],
                        _mm_castpd_si128( v22 ), v21 );
                _mm_maskstore_pd( &e[idx1+6], 
                        _mm_castpd_si128( v32 ), v31 );

                /* build one 32-bit lane mask per double-lane mask:
                 * v02/v12 cover e[idx1..idx1+3], v22/v32 cover e[idx1+4..idx1+7] */
                v_rootmask0 = _mm_shuffle_ps(
                        _mm_castpd_ps( v02 ),
                        _mm_castpd_ps( v12 ),
                        _MM_SHUFFLE(2,0,2,0) );
                v_rootmask1 = _mm_shuffle_ps(
                        _mm_castpd_ps( v22 ),
                        _mm_castpd_ps( v32 ),
                        _MM_SHUFFLE(2,0,2,0) );

                _mm_maskstore_ps( (float *) &root[idx1],
                        _mm_castps_si128( v_rootmask0 ),
                        _mm_castsi128_ps( v_cur_roots ) );
                _mm_maskstore_ps( (float *) &root[idx1+4],
                        _mm_castps_si128( v_rootmask1 ),
                        _mm_castsi128_ps( v_cur_roots ) );
                
                idx1 += 8;
            }
            idx3++;
        }
    }


    return e[IDX(0,n)];
}
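Note that _mm_maskstore_pd and _mm_maskstore_ps are AVX intrinsics; on an SSE2-only target the same conditional update can be sketched with a load/blend/store, shown here for the loop's first pair of doubles (unlike maskstore, this also rewrites the unselected lanes):

    __m128d cur   = _mm_loadu_pd( &e[idx1] );           /* current e values        */
    __m128d blend = _mm_or_pd( _mm_and_pd( v02, v01 ),  /* take v01 where mask set */
                               _mm_andnot_pd( v02, cur ) );
    _mm_storeu_pd( &e[idx1], blend );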
Example #22
0
void mexFunction(int nlhs, mxArray *plhs[],
                 int nrhs, const mxArray *prhs[])
{
        const float * kf   = coeff;
        float * src = _src;
        float * dst = _dst;
        int i = 0, k, nz = length;
        
        // float delta = 0.000001f;
        __m128 d4 = _mm_setzero_ps();
        
        float * S;
        
        __m128 s0, s1, s2, s3, 
               t0, t1, t2, t3;
        __m128 f;
        
        for(i = 0; i <= width - 16; i += 16 )
        {
            s0 = d4, s1 = d4, s2 = d4, s3 = d4;

            for( k = 0; k < nz; k++ )
            {
                f = _mm_load_ss(kf + k);
                f = _mm_shuffle_ps(f, f, 0);  // broadcast kf[k] to all four lanes
                S = src + i + k;

                t0 = _mm_loadu_ps(S);
                t1 = _mm_loadu_ps(S + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_loadu_ps(S + 8);
                t1 = _mm_loadu_ps(S + 12);
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            _mm_storeu_ps(dst + i, s0);
            _mm_storeu_ps(dst + i + 4, s1);
            _mm_storeu_ps(dst + i + 8, s2);
            _mm_storeu_ps(dst + i + 12, s3);
        }
// 
        for( ; i <= width - 4; i += 4 )
        {
            s0 = d4;

            for( k = 0; k < nz; k++ )
            {
                f = _mm_load_ss(kf + k);
                f = _mm_shuffle_ps(f, f, 0);
                t0 = _mm_loadu_ps(src + k + i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }
            _mm_storeu_ps(dst + i, s0);
        }
        
        for (; i < width; i++)
        {
            for( k = 0; k < nz; k++ )
            {
                *(dst + i) += *(src + i + k) * *(kf + k); 
            }
        }

        return;
}
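The _mm_load_ss + _mm_shuffle_ps pair above is the classic scalar-broadcast idiom; an equivalent single-intrinsic sketch would be:

            f = _mm_set1_ps(kf[k]);   /* broadcast kf[k] to all four lanes */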
Example #23
0
File: colorout.c Project: bgK/darktable
void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;

  if(!isnan(d->cmatrix[0]))
  {
    //fprintf(stderr,"Using cmatrix codepath\n");
    // convert to rgb using matrix
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {

      float *in  = (float*)ivoid + ch*roi_in->width *j;
      float *out = (float*)ovoid + ch*roi_out->width*j;
      const __m128 m0 = _mm_set_ps(0.0f,d->cmatrix[6],d->cmatrix[3],d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f,d->cmatrix[7],d->cmatrix[4],d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f,d->cmatrix[8],d->cmatrix[5],d->cmatrix[2]);
  
      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
      {
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        const __m128 t = _mm_add_ps(_mm_mul_ps(m0,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(0,0,0,0))),_mm_add_ps(_mm_mul_ps(m1,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(1,1,1,1))),_mm_mul_ps(m2,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(2,2,2,2)))));

        _mm_stream_ps(out,t);
      }
    }
    _mm_sfence();
    // apply profile
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {

      float *in  = (float*)ivoid + ch*roi_in->width *j;
      float *out = (float*)ovoid + ch*roi_out->width*j;
  
      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
      {
        for(int i=0; i<3; i++) 
          if (d->lut[i][0] >= 0.0f)
          {
            out[i] = (out[i] < 1.0f) ? lerp_lut(d->lut[i], out[i]) : dt_iop_eval_exp(d->unbounded_coeffs[i], out[i]);
          }
      }
    }
  }
  else
  {
    float *in  = (float*)ivoid;
    float *out = (float*)ovoid;
    const int rowsize=roi_out->width * 3;
    //fprintf(stderr,"Using xform codepath\n");

#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(out, roi_out, in)
#endif
    for (int k=0; k<roi_out->height; k++)
    {
      float Lab[rowsize];
      float rgb[rowsize];

      const int m=(k*(roi_out->width*ch));
      for (int l=0; l<roi_out->width; l++)
      {
        int li=3*l,ii=ch*l;
        Lab[li+0] = in[m+ii+0];
        Lab[li+1] = in[m+ii+1];
        Lab[li+2] = in[m+ii+2];
      }

      cmsDoTransform (d->xform, Lab, rgb, roi_out->width);

      for (int l=0; l<roi_out->width; l++)
      {
        int oi=ch*l, ri=3*l;
        out[m+oi+0] = rgb[ri+0];
        out[m+oi+1] = rgb[ri+1];
        out[m+oi+2] = rgb[ri+2];
      }
    }
  }

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
Example #24
0
    static void LinearScaleYUVToRGB32Row_SSE2(const uint8* y_buf,
            const uint8* u_buf,
            const uint8* v_buf,
            uint8* rgb_buf,
            int width,
            int source_dx) {
        __m128i xmm0, xmmY1, xmmY2;
        __m128  xmmY;
        uint8 u0, u1, v0, v1, y0, y1;
        uint32 uv_frac, y_frac, u, v, y;
        int x = 0;

        if (source_dx >= 0x20000) {
            x = 32768;
        }

        while(width >= 2) {
            u0 = u_buf[x >> 17];
            u1 = u_buf[(x >> 17) + 1];
            v0 = v_buf[x >> 17];
            v1 = v_buf[(x >> 17) + 1];
            y0 = y_buf[x >> 16];
            y1 = y_buf[(x >> 16) + 1];
            uv_frac = (x & 0x1fffe);
            y_frac = (x & 0xffff);
            u = (uv_frac * u1 + (uv_frac ^ 0x1fffe) * u0) >> 17;
            v = (uv_frac * v1 + (uv_frac ^ 0x1fffe) * v0) >> 17;
            y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
            x += source_dx;

            xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
                                  _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
            xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
            xmmY1 = _mm_adds_epi16(xmmY1, xmm0);

            y0 = y_buf[x >> 16];
            y1 = y_buf[(x >> 16) + 1];
            y_frac = (x & 0xffff);
            y = (y_frac * y1 + (y_frac ^ 0xffff) * y0) >> 16;
            x += source_dx;

            xmmY2 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));
            xmmY2 = _mm_adds_epi16(xmmY2, xmm0);

            xmmY = _mm_shuffle_ps(_mm_castsi128_ps(xmmY1), _mm_castsi128_ps(xmmY2),
                                  0x44);
            xmmY1 = _mm_srai_epi16(_mm_castps_si128(xmmY), 6);
            xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);

            _mm_storel_epi64(reinterpret_cast<__m128i*>(rgb_buf), xmmY1);
            rgb_buf += 8;
            width -= 2;
        }

        if (width) {
            u = u_buf[x >> 17];
            v = v_buf[x >> 17];
            y = y_buf[x >> 16];

            xmm0 = _mm_adds_epi16(_mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbU + 8 * u)),
                                  _mm_loadl_epi64(reinterpret_cast<__m128i*>(kCoefficientsRgbV + 8 * v)));
            xmmY1 = _mm_loadl_epi64(reinterpret_cast<__m128i*>(reinterpret_cast<uint8*>(kCoefficientsRgbY) + 8 * y));

            xmmY1 = _mm_adds_epi16(xmmY1, xmm0);
            xmmY1 = _mm_srai_epi16(xmmY1, 6);
            xmmY1 = _mm_packus_epi16(xmmY1, xmmY1);
            *reinterpret_cast<uint32*>(rgb_buf) = _mm_cvtsi128_si32(xmmY1);
        }
    }
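The source position x above is tracked in 16.16 fixed point, so each fetched sample is a linear blend of two neighbours; a hedged scalar sketch of that blend (the helper is hypothetical):

        static inline uint32 fixed_lerp(uint8 a, uint8 b, uint32 frac) {
            /* frac is the low 16 bits of x; (frac ^ 0xffff) == 0xffff - frac */
            return (frac * b + (frac ^ 0xffff) * a) >> 16;
        }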
Example #25
0
void BrushToolEdit::drawBlur(const QPoint &pt, float amount)
{
    Terrain *tip = tool->tip(pt).data();

    // compute affected rectangle
    QRect dirtyRect(pt, tip->size());
    dirtyRect = dirtyRect.intersected(QRect(QPoint(0, 0), terrain->size()));

    if (!dirtyRect.isValid()) {
        return;
    }

    edit->beginEdit(dirtyRect, terrain);

    QSize tSize = terrain->size();
    QSize tipSize = tip->size();
    QSize blurBufferSize(dirtyRect.width() + 6, dirtyRect.height() + 6);
    TemporaryBuffer<__m128> blurBuffer1(blurBufferSize.width() * blurBufferSize.height(), 16);
    TemporaryBuffer<__m128> blurBuffer2(blurBufferSize.width() * blurBufferSize.height(), 16);
    TemporaryBuffer<float> tipBuffer(blurBufferSize.width() * blurBufferSize.height(), 4);

    for (int y = 0; y < blurBufferSize.height(); ++y) {
        int cy = y + dirtyRect.top() - 3;
        cy = std::max(std::min(cy, tSize.height() - 1), 0);
        for (int x = 0; x < blurBufferSize.width(); ++x) {
            int cx = x + dirtyRect.left() - 3;
            cx = std::max(std::min(cx, tSize.width() - 1), 0);

            quint32 color = terrain->color(cx, cy);
            auto colorMM = _mm_setr_epi32(color, 0, 0, 0);
            colorMM = _mm_unpacklo_epi8(colorMM, _mm_setzero_si128());
            colorMM = _mm_unpacklo_epi16(colorMM, _mm_setzero_si128());
            auto colorF = _mm_cvtepi32_ps(colorMM);
            _mm_store_ps(reinterpret_cast<float *>(blurBuffer1 + x + y * blurBufferSize.width()), colorF);
        }
    }
    for (int y = 0; y < blurBufferSize.height(); ++y) {
        int cy = y + dirtyRect.top() - 3;
        int ty = cy - pt.y();
        if (ty >= 0 && ty < tipSize.height()) {
            for (int x = 0; x < blurBufferSize.width(); ++x) {
                int cx = x + dirtyRect.left() - 3;
                int tx = cx - pt.x();
                tipBuffer[x + y * blurBufferSize.width()] =
                        tx >= 0 && tx < tipSize.width() ?
                            tip->landform(tx, ty) * amount :
                            0.f;
            }
        } else {
            std::fill(&tipBuffer[y * blurBufferSize.width()],
                    &tipBuffer[(y + 1) * blurBufferSize.width()], 0.f);
        }
    }

    // apply horizontal blur
    for (int y = 0; y < blurBufferSize.height(); ++y) {
        __m128 *inBuf = blurBuffer1 + y * blurBufferSize.width();
        __m128 *outBuf = blurBuffer2 + y * blurBufferSize.width();
        float *varBuf = tipBuffer + y * blurBufferSize.width();
        for (int x = 3; x < blurBufferSize.width() - 3; ++x) {
            float variance = varBuf[x];
            __m128 kernel = globalGaussianKernelTable.fetch(variance);

            // sample input
            __m128 p1 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x));
            p1 = _mm_add_ps(p1, p1);
            __m128 p2 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x + 1));
            p2 = _mm_add_ps(p2, _mm_load_ps(reinterpret_cast<float *>(inBuf + x - 1)));
            __m128 p3 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x + 2));
            p3 = _mm_add_ps(p3, _mm_load_ps(reinterpret_cast<float *>(inBuf + x - 2)));
            __m128 p4 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x + 3));
            p4 = _mm_add_ps(p4, _mm_load_ps(reinterpret_cast<float *>(inBuf + x - 3)));

            // apply kernel
            p1 = _mm_mul_ps(p1, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(0, 0, 0, 0)));
            p2 = _mm_mul_ps(p2, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(1, 1, 1, 1)));
            p3 = _mm_mul_ps(p3, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(2, 2, 2, 2)));
            p4 = _mm_mul_ps(p4, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(3, 3, 3, 3)));

            p1 = _mm_add_ps(p1, p2);
            p3 = _mm_add_ps(p3, p4);
            auto p = _mm_add_ps(p1, p3);

            // store
            _mm_store_ps(reinterpret_cast<float *>(outBuf + x), p);
        }
    }

    // apply vertical blur
    for (int y = 3; y < blurBufferSize.height() - 3; ++y) {
        __m128 *inBuf = blurBuffer2 + y * blurBufferSize.width();
        __m128 *outBuf = blurBuffer1 + y * blurBufferSize.width();
        float *varBuf = tipBuffer + y * blurBufferSize.width();
        for (int x = 3; x < blurBufferSize.width() - 3; x += 1) {
            // fetch kernel
            __m128 kernel = globalGaussianKernelTable.fetch(varBuf[x]);

            // load input
            __m128 p1 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x));
            p1 = _mm_add_ps(p1, p1);
            __m128 p2 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x - blurBufferSize.width()));
            p2 = _mm_add_ps(p2, _mm_load_ps(reinterpret_cast<float *>(inBuf + x + blurBufferSize.width())));
            __m128 p3 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x - blurBufferSize.width() * 2));
            p3 = _mm_add_ps(p3, _mm_load_ps(reinterpret_cast<float *>(inBuf + x + blurBufferSize.width() * 2)));
            __m128 p4 = _mm_load_ps(reinterpret_cast<float *>(inBuf + x - blurBufferSize.width() * 3));
            p4 = _mm_add_ps(p4, _mm_load_ps(reinterpret_cast<float *>(inBuf + x + blurBufferSize.width() * 3)));

            // apply kernel
            p1 = _mm_mul_ps(p1, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(0, 0, 0, 0)));
            p2 = _mm_mul_ps(p2, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(1, 1, 1, 1)));
            p3 = _mm_mul_ps(p3, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(2, 2, 2, 2)));
            p4 = _mm_mul_ps(p4, _mm_shuffle_ps(kernel, kernel, _MM_SHUFFLE(3, 3, 3, 3)));
            p1 = _mm_add_ps(p1, p2);
            p3 = _mm_add_ps(p3, p4);
            auto p = _mm_add_ps(p1, p3);

            // store
            _mm_store_ps(reinterpret_cast<float *>(outBuf + x), p);
        }
    }

    for (int y = 0; y < dirtyRect.height(); ++y) {
        __m128 *inBuf = blurBuffer1 + (y + 3) * blurBufferSize.width() + 3;
        for (int x = 0; x < dirtyRect.width(); ++x) {
            int cx = x + dirtyRect.left();
            int cy = y + dirtyRect.top();
            auto colorF = _mm_load_ps(reinterpret_cast<float *>(inBuf + x));
            colorF = _mm_add_ps(colorF, _mm_set1_ps(0.5f));
            colorF = _mm_add_ps(colorF, globalDitherSampler.getM128());
            auto colorMM = _mm_cvttps_epi32(colorF);
            colorMM = _mm_packs_epi32(colorMM, colorMM);
            colorMM = _mm_packus_epi16(colorMM, colorMM);
            _mm_store_ss(reinterpret_cast<float*>(&terrain->color(cx, cy)), _mm_castsi128_ps(colorMM));
        }
    }
    edit->endEdit(terrain);
}
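Both blur passes above evaluate the same symmetric 7-tap kernel, with the four tap weights fetched per pixel from globalGaussianKernelTable; in scalar terms one horizontally blurred sample is (a sketch):

    // out[x] = k[0] * 2 * in[x]
    //        + k[1] * (in[x - 1] + in[x + 1])
    //        + k[2] * (in[x - 2] + in[x + 2])
    //        + k[3] * (in[x - 3] + in[x + 3]);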
Example #26
0
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
    unsigned i;
    __m128 sum;
    __m128 sum_l             = _mm_setzero_ps();
    __m128 sum_r             = _mm_setzero_ps();

    const float *buffer_l    = resamp->buffer_l + resamp->ptr;
    const float *buffer_r    = resamp->buffer_r + resamp->ptr;

    unsigned taps            = resamp->taps;
    unsigned phase           = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
    const float *phase_table = resamp->phase_table + phase * taps * 2;
    const float *delta_table = phase_table + taps;
    __m128 delta             = _mm_set1_ps((float)
                                           (resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
    const float *phase_table = resamp->phase_table + phase * taps;
#endif

    for (i = 0; i < taps; i += 4)
    {
        __m128 buf_l = _mm_loadu_ps(buffer_l + i);
        __m128 buf_r = _mm_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
        __m128 deltas = _mm_load_ps(delta_table + i);
        __m128 _sinc  = _mm_add_ps(_mm_load_ps(phase_table + i),
                                   _mm_mul_ps(deltas, delta));
#else
        __m128 _sinc = _mm_load_ps(phase_table + i);
#endif
        sum_l        = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
        sum_r        = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
    }

    /* Them annoying shuffles.
     * sum_l = { l3, l2, l1, l0 }
     * sum_r = { r3, r2, r1, r0 }
     */

    sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
                                    _MM_SHUFFLE(1, 0, 1, 0)),
                     _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));

    /* sum   = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
     * sum   = { R1, R0, L1, L0 }
     */

    sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);

    /* sum   = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
     * sum   = { X,  R,  X,  L }
     */

    /* Store L */
    _mm_store_ss(out_buffer + 0, sum);

    /* movehl { X, R, X, L } == { X, R, X, R } */
    _mm_store_ss(out_buffer + 1, _mm_movehl_ps(sum, sum));
}
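A hedged scalar reference for the tap loop and the shuffle reduction above (same variables; the SINC_COEFF_LERP refinement is folded into the comment):

    float l = 0.0f, r = 0.0f;
    for (i = 0; i < taps; i++)
    {
       float s = phase_table[i];  /* + delta_table[i] * lerp factor when SINC_COEFF_LERP */
       l      += buffer_l[i] * s;
       r      += buffer_r[i] * s;
    }
    out_buffer[0] = l;   /* left channel  */
    out_buffer[1] = r;   /* right channel */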
Example #27
void decomp_gamma2_minus( spinor_array src, halfspinor_array dst) 
{

  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm6;
  __m128 xmm7;
  __m128 xmm8;

  /* Swap upper and lower components */
  /* Compiler should spill, or use 64 bit extras */
  __m128 xmm9;
  __m128 xmm10;
  __m128 xmm11;

  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero 
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);

  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);


  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);

  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0

  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);

  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);

  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);
#endif
 
  /* Swap the lower components */
  xmm6 = _mm_shuffle_ps(xmm3, xmm3, 0xb1);
  xmm7 = _mm_shuffle_ps(xmm4, xmm4, 0xb1);
  xmm8 = _mm_shuffle_ps(xmm5, xmm5, 0xb1);

  xmm9 = _mm_xor_ps(xmm6, signs23.vector);
  xmm10 = _mm_xor_ps(xmm7, signs23.vector);
  xmm11 = _mm_xor_ps(xmm8, signs23.vector);

  /* Add */
  xmm0 = _mm_add_ps(xmm0, xmm9);
  xmm1 = _mm_add_ps(xmm1, xmm10);
  xmm2 = _mm_add_ps(xmm2, xmm11);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);

  

}
Example #28
__m128
test (__m128 s1, __m128 s2)
{
  return _mm_shuffle_ps (s1, s2, MASK); 
}
Example #29
void decomp_gamma3_minus( spinor_array src, halfspinor_array dst) 
{

  /* Space for upper components */
  __m128 xmm0;
  __m128 xmm1;
  __m128 xmm2;

  /* Space for lower components */
  __m128 xmm3;
  __m128 xmm4;
  __m128 xmm5;

  __m128 xmm6;
  __m128 xmm7;

  xmm0 = _mm_load_ps(&src[0][0][0]);
  xmm2 = _mm_load_ps(&src[0][2][0]);
  xmm6 = _mm_load_ps(&src[1][1][0]);
  
  xmm3 = _mm_load_ps(&src[2][0][0]);
  xmm5 = _mm_load_ps(&src[2][2][0]);
  xmm7 = _mm_load_ps(&src[3][1][0]);

  xmm1 = _mm_xor_ps(xmm1,xmm1); // This should zero 
  xmm4 = _mm_xor_ps(xmm4,xmm4);

  xmm1 = _mm_movelh_ps(xmm1,xmm6);
  xmm4 = _mm_movelh_ps(xmm4,xmm7);

  xmm1 = _mm_movehl_ps(xmm1, xmm0);
  xmm4 = _mm_movehl_ps(xmm4, xmm3);


  xmm0 = _mm_shuffle_ps(xmm0, xmm2, 0xe4);
  xmm3 = _mm_shuffle_ps(xmm3, xmm5, 0xe4);

  xmm2 = _mm_shuffle_ps(xmm2, xmm6, 0xe4);
  xmm5 = _mm_shuffle_ps(xmm5, xmm7, 0xe4);

#if 0
  /* Load up the spinors */
  xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&src[0][0][0]);
  xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&src[0][1][0]);
  xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&src[0][2][0]);

  xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&src[1][0][0]);
  xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&src[1][1][0]);
  xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&src[1][2][0]);

  xmm3 = _mm_loadl_pi(xmm3, (__m64 *)&src[2][0][0]);
  xmm4 = _mm_loadl_pi(xmm4, (__m64 *)&src[2][1][0]);
  xmm5 = _mm_loadl_pi(xmm5, (__m64 *)&src[2][2][0]);

  xmm3 = _mm_loadh_pi(xmm3, (__m64 *)&src[3][0][0]);
  xmm4 = _mm_loadh_pi(xmm4, (__m64 *)&src[3][1][0]);
  xmm5 = _mm_loadh_pi(xmm5, (__m64 *)&src[3][2][0]);
#endif
 
  /* sub */
  xmm0 = _mm_sub_ps(xmm0, xmm3);
  xmm1 = _mm_sub_ps(xmm1, xmm4);
  xmm2 = _mm_sub_ps(xmm2, xmm5);

  /* Store */
  _mm_store_ps(&dst[0][0][0],xmm0);
  _mm_store_ps(&dst[1][0][0],xmm1);
  _mm_store_ps(&dst[2][0][0],xmm2);

}
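In scalar terms the routine above subtracts each of the two lower spin components from the matching upper one, for every colour and real/imaginary part (a sketch of the data movement, ignoring the register shuffling):

  /*
   * for (spin = 0; spin < 2; spin++)
   *   for (col = 0; col < 3; col++)
   *     for (reim = 0; reim < 2; reim++)
   *       dst[spin][col][reim] = src[spin][col][reim] - src[spin + 2][col][reim];
   */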
Example #30
0
/** 32 point butterfly (in place, 4 register) */
static void
mdct_butterfly_32_sse(FLOAT *x) {
    static _MM_ALIGN16 const float PFV0[4] = { -AFT_PI3_8, -AFT_PI1_8,
                                               -AFT_PI2_8, -AFT_PI2_8 };
    static _MM_ALIGN16 const float PFV1[4] = { -AFT_PI1_8,  AFT_PI3_8,
                                               -AFT_PI2_8,  AFT_PI2_8 };
    static _MM_ALIGN16 const float PFV2[4] = { -AFT_PI1_8, -AFT_PI3_8,
                                                     -1.f,        1.f };
    static _MM_ALIGN16 const float PFV3[4] = { -AFT_PI3_8,  AFT_PI1_8,
                                                      0.f,        0.f };
    static _MM_ALIGN16 const float PFV4[4] = {  AFT_PI3_8,  AFT_PI3_8,
                                                AFT_PI2_8,  AFT_PI2_8 };
    static _MM_ALIGN16 const float PFV5[4] = { -AFT_PI1_8,  AFT_PI1_8,
                                               -AFT_PI2_8,  AFT_PI2_8 };
    static _MM_ALIGN16 const float PFV6[4] = {  AFT_PI1_8,  AFT_PI3_8,
                                                      1.f,        1.f };
    static _MM_ALIGN16 const float PFV7[4] = { -AFT_PI3_8,  AFT_PI1_8,
                                                      0.f,        0.f };
    __m128  XMM0, XMM1, XMM2, XMM3, XMM4, XMM5, XMM6, XMM7;

    XMM0     = _mm_load_ps(x+16);
    XMM1     = _mm_load_ps(x+20);
    XMM2     = _mm_load_ps(x+24);
    XMM3     = _mm_load_ps(x+28);
    XMM4     = XMM0;
    XMM5     = XMM1;
    XMM6     = XMM2;
    XMM7     = XMM3;

    XMM0     = _mm_sub_ps(XMM0, PM128(x   ));
    XMM1     = _mm_sub_ps(XMM1, PM128(x+ 4));
    XMM2     = _mm_sub_ps(XMM2, PM128(x+ 8));
    XMM3     = _mm_sub_ps(XMM3, PM128(x+12));
    XMM4     = _mm_add_ps(XMM4, PM128(x   ));
    XMM5     = _mm_add_ps(XMM5, PM128(x+ 4));
    XMM6     = _mm_add_ps(XMM6, PM128(x+ 8));
    XMM7     = _mm_add_ps(XMM7, PM128(x+12));
    _mm_store_ps(x+16, XMM4);
    _mm_store_ps(x+20, XMM5);
    _mm_store_ps(x+24, XMM6);
    _mm_store_ps(x+28, XMM7);

    XMM4     = XMM0;
    XMM5     = XMM1;
    XMM6     = XMM2;
    XMM7     = XMM3;

    XMM0     = _mm_shuffle_ps(XMM0, XMM0, _MM_SHUFFLE(3,3,1,1));
    XMM4     = _mm_shuffle_ps(XMM4, XMM4, _MM_SHUFFLE(2,2,0,0));

    XMM1     = _mm_shuffle_ps(XMM1, XMM1, _MM_SHUFFLE(2,3,1,1));
    XMM5     = _mm_shuffle_ps(XMM5, XMM5, _MM_SHUFFLE(2,3,0,0));
    XMM2     = _mm_shuffle_ps(XMM2, XMM2, _MM_SHUFFLE(2,2,1,0));
    XMM6     = _mm_shuffle_ps(XMM6, XMM6, _MM_SHUFFLE(3,3,0,1));
    XMM3     = _mm_shuffle_ps(XMM3, XMM3, _MM_SHUFFLE(3,2,0,0));
    XMM7     = _mm_shuffle_ps(XMM7, XMM7, _MM_SHUFFLE(3,2,1,1));
    XMM0     = _mm_mul_ps(XMM0, PM128(PFV0));
    XMM4     = _mm_mul_ps(XMM4, PM128(PFV1));
    XMM1     = _mm_mul_ps(XMM1, PM128(PFV2));
    XMM5     = _mm_mul_ps(XMM5, PM128(PFV3));
    XMM2     = _mm_mul_ps(XMM2, PM128(PFV4));
    XMM6     = _mm_mul_ps(XMM6, PM128(PFV5));
    XMM3     = _mm_mul_ps(XMM3, PM128(PFV6));
    XMM7     = _mm_mul_ps(XMM7, PM128(PFV7));
    XMM0     = _mm_add_ps(XMM0, XMM4);
    XMM1     = _mm_add_ps(XMM1, XMM5);
    XMM2     = _mm_add_ps(XMM2, XMM6);
    XMM3     = _mm_add_ps(XMM3, XMM7);
    _mm_store_ps(x   , XMM0);
    _mm_store_ps(x+ 4, XMM1);
    _mm_store_ps(x+ 8, XMM2);
    _mm_store_ps(x+12, XMM3);

    mdct_butterfly_16_sse(x);
    mdct_butterfly_16_sse(x+16);
}