static void GF_FUNC_ALIGN VS_CC
proc_8bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width,
               int height, int stride, uint8_t *dstp, const uint8_t *srcp)
{
    uint8_t *p0 = buff + 16;
    uint8_t *p1 = p0 + bstride;
    uint8_t *p2 = p1 + bstride;
    uint8_t *p3 = p2 + bstride;
    uint8_t *p4 = p3 + bstride;
    uint8_t *orig = p0, *end = p4;

    line_copy8(p0, srcp + 2 * stride, width, 2);
    line_copy8(p1, srcp + stride, width, 2);
    line_copy8(p2, srcp, width, 2);
    srcp += stride;
    line_copy8(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);
    __m128i one = _mm_srli_epi16(all1, 15);
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);
    
    __m128i matrix_h[5];
    __m128i matrix_v[5];
    for (int i = 0; i < 5; i++) {
        matrix_h[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_h[i]), zero);
        matrix_v[i] = _mm_unpacklo_epi16(_mm_set1_epi16((int16_t)ch->m_v[i]), zero);
    }

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy8(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 16) {
            uint8_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                __m128i sum[4];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();
                sum[2] = _mm_setzero_si128();
                sum[3] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;

                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);
                    xmm2 = _mm_unpackhi_epi8(xmm0, zero);
                    xmm0 = _mm_unpacklo_epi8(xmm0, zero);

                    xmm1 = _mm_unpackhi_epi16(xmm0, zero);
                    xmm0 = _mm_unpacklo_epi16(xmm0, zero);
                    sum[0] = _mm_add_epi32(sum[0], _mm_madd_epi16(xmm0, matrix[i]));
                    sum[1] = _mm_add_epi32(sum[1], _mm_madd_epi16(xmm1, matrix[i]));

                    xmm1 = _mm_unpackhi_epi16(xmm2, zero);
                    xmm0 = _mm_unpacklo_epi16(xmm2, zero);
                    sum[2] = _mm_add_epi32(sum[2], _mm_madd_epi16(xmm0, matrix[i]));
                    sum[3] = _mm_add_epi32(sum[3], _mm_madd_epi16(xmm1, matrix[i]));
                }

                for (int i = 0; i < 4; i++) {
                    __m128 sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, j == 0 ? rdiv_v : rdiv_h);
                    if (j == 1) {
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    sum[i] = _mm_cvttps_epi32(sumfp);
                }

                sum[0] = _mm_packs_epi32(sum[0], sum[1]);
                sum[1] = _mm_packs_epi32(sum[2], sum[3]);

                if (!ch->saturate) {
                    for (int i = 0; i < 2; i++) {
                        __m128i mask = _mm_cmplt_epi16(sum[i], zero);
                        __m128i temp = _mm_add_epi16(one, _mm_xor_si128(sum[i], all1));
                        temp = _mm_and_si128(temp, mask);
                        sum[i] = _mm_andnot_si128(mask, sum[i]);
                        sum[i] = _mm_or_si128(sum[i], temp);
                    }
                }

                sum[0] = _mm_packus_epi16(sum[0], sum[1]);

                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}
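A scalar sketch of what the !ch->saturate branch above computes per signed 16-bit sum before the unsigned pack; the xor-with-all-ones plus one is a masked two's-complement negation, so negative sums come out as their absolute value (illustrative only):

#include <stdint.h>

/* Scalar equivalent of the masked negation above: positive sums pass through,
 * negative sums are replaced by their absolute value prior to _mm_packus_epi16. */
static inline int16_t unsaturated_sum(int16_t sum)
{
    return (sum < 0) ? (int16_t)(~sum + 1) : sum;
}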
Example #2
Vector4<float>&
Vector4<float>::addTo(const Vector4<float>& vector)
{
    return this->set(_mm_add_ps(this->asSSE(), vector.asSSE()));
}
// use MMX/SSE extensions
void dotprod_crcf_execute_mmx4(dotprod_crcf _q,
                               float complex * _x,
                               float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // first cut: ...
    __m128 v0, v1, v2, v3;  // input vectors
    __m128 h0, h1, h2, h3;  // coefficients vectors
    __m128 s0, s1, s2, s3;  // dot products [re, im, re, im]

    // load zeros into sum registers
    __m128 sum0 = _mm_setzero_ps();
    __m128 sum1 = _mm_setzero_ps();
    __m128 sum2 = _mm_setzero_ps();
    __m128 sum3 = _mm_setzero_ps();

    // r = 4*floor(n/16)
    unsigned int r = (n >> 4) << 2;

    //
    unsigned int i;
    for (i=0; i<r; i+=4) {
        // load inputs into register (unaligned)
        v0 = _mm_loadu_ps(&x[4*i+0]);
        v1 = _mm_loadu_ps(&x[4*i+4]);
        v2 = _mm_loadu_ps(&x[4*i+8]);
        v3 = _mm_loadu_ps(&x[4*i+12]);

        // load coefficients into register (aligned)
        h0 = _mm_load_ps(&_q->h[4*i+0]);
        h1 = _mm_load_ps(&_q->h[4*i+4]);
        h2 = _mm_load_ps(&_q->h[4*i+8]);
        h3 = _mm_load_ps(&_q->h[4*i+12]);

        // compute multiplication
        s0 = _mm_mul_ps(v0, h0);
        s1 = _mm_mul_ps(v1, h1);
        s2 = _mm_mul_ps(v2, h2);
        s3 = _mm_mul_ps(v3, h3);
        
        // parallel addition
        sum0 = _mm_add_ps( sum0, s0 );
        sum1 = _mm_add_ps( sum1, s1 );
        sum2 = _mm_add_ps( sum2, s2 );
        sum3 = _mm_add_ps( sum3, s3 );
    }

    // fold down
    sum0 = _mm_add_ps( sum0, sum1 );
    sum2 = _mm_add_ps( sum2, sum3 );
    sum0 = _mm_add_ps( sum0, sum2 );

    // aligned output array
    float w[4] __attribute__((aligned(16)));

    // unload packed array and perform manual sum
    _mm_store_ps(w, sum0);
    w[0] += w[2];
    w[1] += w[3];

    // cleanup (note: n _must_ be even)
    for (i=4*r; i<n; i+=2) {
        w[0] += x[i  ] * _q->h[i  ];
        w[1] += x[i+1] * _q->h[i+1];
    }

    // set return value
    *_y = w[0] + w[1]*_Complex_I;
}
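For reference, a plain scalar version of the dot product the unrolled loop above computes. This is a hypothetical helper, and it assumes (as the cleanup loop implies) that the coefficient array stores each real coefficient twice so it lines up with the interleaved {re, im} samples:

#include <complex.h>

/* Hypothetical scalar reference: h is real-valued but stored duplicated as
 * {h0, h0, h1, h1, ...} to match the interleaved layout of x. */
static void dotprod_crcf_reference(const float *h, const float complex *x,
                                   unsigned int num, float complex *y)
{
    float re = 0.0f, im = 0.0f;
    for (unsigned int k = 0; k < num; k++) {
        re += crealf(x[k]) * h[2*k  ];
        im += cimagf(x[k]) * h[2*k+1];
    }
    *y = re + im * _Complex_I;
}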
Example #4
int SymmColumnVec_32f_Symm_AVX(const float** src, const float* ky, float* dst, float delta, int width, int ksize2)
{
    int i = 0, k;
    const float *S, *S2;
    const __m128 d4 = _mm_set1_ps(delta);
    const __m256 d8 = _mm256_set1_ps(delta);

    for( ; i <= width - 16; i += 16 )
    {
        __m256 f = _mm256_set1_ps(ky[0]);
        __m256 s0, s1;
        __m256 x0;
        S = src[0] + i;
        s0 = _mm256_loadu_ps(S);
#if CV_FMA3
        s0 = _mm256_fmadd_ps(s0, f, d8);
#else
        s0 = _mm256_add_ps(_mm256_mul_ps(s0, f), d8);
#endif
        s1 = _mm256_loadu_ps(S+8);
#if CV_FMA3
        s1 = _mm256_fmadd_ps(s1, f, d8);
#else
        s1 = _mm256_add_ps(_mm256_mul_ps(s1, f), d8);
#endif

        for( k = 1; k <= ksize2; k++ )
        {
            S = src[k] + i;
            S2 = src[-k] + i;
            f = _mm256_set1_ps(ky[k]);
            x0 = _mm256_add_ps(_mm256_loadu_ps(S), _mm256_loadu_ps(S2));
#if CV_FMA3
            s0 = _mm256_fmadd_ps(x0, f, s0);
#else
            s0 = _mm256_add_ps(s0, _mm256_mul_ps(x0, f));
#endif
            x0 = _mm256_add_ps(_mm256_loadu_ps(S+8), _mm256_loadu_ps(S2+8));
#if CV_FMA3
            s1 = _mm256_fmadd_ps(x0, f, s1);
#else
            s1 = _mm256_add_ps(s1, _mm256_mul_ps(x0, f));
#endif
        }

        _mm256_storeu_ps(dst + i, s0);
        _mm256_storeu_ps(dst + i + 8, s1);
    }

    for( ; i <= width - 4; i += 4 )
    {
        __m128 f = _mm_set1_ps(ky[0]);
        __m128 x0, s0 = _mm_load_ps(src[0] + i);
        s0 = _mm_add_ps(_mm_mul_ps(s0, f), d4);

        for( k = 1; k <= ksize2; k++ )
        {
            f = _mm_set1_ps(ky[k]);
            S = src[k] + i;
            S2 = src[-k] + i;
            x0 = _mm_add_ps(_mm_load_ps(src[k]+i), _mm_load_ps(src[-k] + i));
            s0 = _mm_add_ps(s0, _mm_mul_ps(x0, f));
        }

        _mm_storeu_ps(dst + i, s0);
    }

    _mm256_zeroupper();
    return i;
}
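For clarity, a scalar sketch of the symmetric column filter the AVX and SSE loops above vectorize, i.e. dst[i] = delta + ky[0]*src[0][i] + sum over k of ky[k]*(src[k][i] + src[-k][i]) (hypothetical helper, not part of the original source):

/* Hypothetical scalar reference of the same symmetric column convolution. */
static void symm_column_scalar(const float **src, const float *ky, float *dst,
                               float delta, int width, int ksize2)
{
    for (int i = 0; i < width; i++) {
        float s = ky[0] * src[0][i] + delta;
        for (int k = 1; k <= ksize2; k++)
            s += ky[k] * (src[k][i] + src[-k][i]);
        dst[i] = s;
    }
}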
Example #5
			inline void Multiply(Matrix4x4<float> const &ma, Matrix4x4<float> &out) const {

#ifdef __SSE_AVAIL__
				__m128 m0 = _mm_load_ps(Row1);
				__m128 m1 = _mm_load_ps(Row2);
				__m128 m2 = _mm_load_ps(Row3);
				__m128 m3 = _mm_load_ps(Row4);

				__m128 brod1 = _mm_set1_ps(ma.Mat[0]);
				__m128 brod2 = _mm_set1_ps(ma.Mat[1]);
				__m128 brod3 = _mm_set1_ps(ma.Mat[2]);
				__m128 brod4 = _mm_set1_ps(ma.Mat[3]);
				__m128 row = _mm_add_ps(
					_mm_add_ps(
					_mm_mul_ps(brod1, m0),
					_mm_mul_ps(brod2, m1)),
					_mm_add_ps(
					_mm_mul_ps(brod3, m2),
					_mm_mul_ps(brod4, m3)));
					//out.ssem1 = row;
				_mm_store_ps(out.Row1, row);

				brod1 = _mm_set1_ps(ma.Mat[4]);
				brod2 = _mm_set1_ps(ma.Mat[5]);
				brod3 = _mm_set1_ps(ma.Mat[6]);
				brod4 = _mm_set1_ps(ma.Mat[7]);
				row = _mm_add_ps(
					_mm_add_ps(
					_mm_mul_ps(brod1, m0),
					_mm_mul_ps(brod2, m1)),
					_mm_add_ps(
					_mm_mul_ps(brod3, m2),
					_mm_mul_ps(brod4, m3)));
				_mm_store_ps(out.Row2, row);
				//out.ssem2 = row;

				brod1 = _mm_set1_ps(ma.Mat[8]);
				brod2 = _mm_set1_ps(ma.Mat[9]);
				brod3 = _mm_set1_ps(ma.Mat[10]);
				brod4 = _mm_set1_ps(ma.Mat[11]);
				row = _mm_add_ps(
					_mm_add_ps(
					_mm_mul_ps(brod1, m0),
					_mm_mul_ps(brod2, m1)),
					_mm_add_ps(
					_mm_mul_ps(brod3, m2),
					_mm_mul_ps(brod4, m3)));
				_mm_store_ps(out.Row3, row);
				//out.ssem3 = row;

				brod1 = _mm_set1_ps(ma.Mat[12]);
				brod2 = _mm_set1_ps(ma.Mat[13]);
				brod3 = _mm_set1_ps(ma.Mat[14]);
				brod4 = _mm_set1_ps(ma.Mat[15]);
				row = _mm_add_ps(
					_mm_add_ps(
					_mm_mul_ps(brod1, m0),
					_mm_mul_ps(brod2, m1)),
					_mm_add_ps(
					_mm_mul_ps(brod3, m2),
					_mm_mul_ps(brod4, m3)));
				_mm_store_ps(out.Row4, row);
				//out.ssem4 = row;
#else

#endif

			}
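A scalar sketch of what each vectorized block above computes: every output row is a linear combination of this matrix's rows, weighted by the corresponding row of ma.Mat (assuming row-major 4x4 storage; the helper below is illustrative only):

/* Illustrative scalar equivalent for one output row r (row-major 4x4 data):
 * out_row = ma[4r+0]*row0 + ma[4r+1]*row1 + ma[4r+2]*row2 + ma[4r+3]*row3 */
static void multiply_row_scalar(const float ma[16], const float rows[4][4],
                                int r, float out_row[4])
{
    for (int c = 0; c < 4; ++c) {
        out_row[c] = ma[4*r + 0] * rows[0][c]
                   + ma[4*r + 1] * rows[1][c]
                   + ma[4*r + 2] * rows[2][c]
                   + ma[4*r + 3] * rows[3][c];
    }
}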
Example #6
/* Function:  p7_Null2_ByExpectation()
 * Synopsis:  Calculate null2 model from posterior probabilities.
 * Incept:    SRE, Mon Aug 18 08:32:55 2008 [Janelia]
 *
 * Purpose:   Identical to <p7_GNull2_ByExpectation()> except that
 *            <om>, <pp> are SSE optimized versions of the profile
 *            and the residue posterior probability matrix. See 
 *            <p7_GNull2_ByExpectation()>  documentation.
 *            
 * Args:      om    - profile, in any mode, target length model set to <L>
 *            pp    - posterior prob matrix, for <om> against domain envelope <dsq+i-1> (offset)
 *            null2 - RETURN: null2 log odds scores per residue; <0..Kp-1>; caller allocated space
 */
int
p7_Null2_ByExpectation(const P7_OPROFILE *om, const P7_OMX *pp, float *null2)
{
  int      M    = om->M;
  int      Ld   = pp->L;
  int      Q    = p7O_NQF(M);
  float   *xmx  = pp->xmx;	/* enables use of XMXo(i,s) macro */
  float    norm;
  __m128  *rp;
  __m128   sv;
  float    xfactor;
  int      i,q,x;
  
  /* Calculate expected # of times that each emitting state was used
   * in generating the Ld residues in this domain.
   * The 0 row in <wrk> is used to hold these numbers.
   */
  memcpy(pp->dpf[0], pp->dpf[1], sizeof(__m128) * 3 * Q);
  XMXo(0,p7X_N) = XMXo(1,p7X_N);
  XMXo(0,p7X_C) = XMXo(1,p7X_C); /* 0.0 */
  XMXo(0,p7X_J) = XMXo(1,p7X_J); /* 0.0 */

  for (i = 2; i <= Ld; i++)
    {
      for (q = 0; q < Q; q++)
	{
	  pp->dpf[0][q*3 + p7X_M] = _mm_add_ps(pp->dpf[i][q*3 + p7X_M], pp->dpf[0][q*3 + p7X_M]);
	  pp->dpf[0][q*3 + p7X_I] = _mm_add_ps(pp->dpf[i][q*3 + p7X_I], pp->dpf[0][q*3 + p7X_I]);
	}
      XMXo(0,p7X_N) += XMXo(i,p7X_N);
      XMXo(0,p7X_C) += XMXo(i,p7X_C); 
      XMXo(0,p7X_J) += XMXo(i,p7X_J); 
    }

  /* Convert those expected #'s to frequencies, to use as posterior weights. */
  norm = 1.0 / (float) Ld;
  sv   = _mm_set1_ps(norm);
  for (q = 0; q < Q; q++)
    {
      pp->dpf[0][q*3 + p7X_M] = _mm_mul_ps(pp->dpf[0][q*3 + p7X_M], sv);
      pp->dpf[0][q*3 + p7X_I] = _mm_mul_ps(pp->dpf[0][q*3 + p7X_I], sv);
    }
  XMXo(0,p7X_N) *= norm;
  XMXo(0,p7X_C) *= norm;
  XMXo(0,p7X_J) *= norm;

  /* Calculate null2's emission odds, by taking posterior weighted sum
   * over all emission vectors used in paths explaining the domain.
   */
  xfactor = XMXo(0, p7X_N) + XMXo(0, p7X_C) + XMXo(0, p7X_J); 
  for (x = 0; x < om->abc->K; x++)
    {
      sv = _mm_setzero_ps();
      rp = om->rfv[x];
      for (q = 0; q < Q; q++)
	{
	  sv = _mm_add_ps(sv, _mm_mul_ps(pp->dpf[0][q*3 + p7X_M], *rp)); rp++;
	  sv = _mm_add_ps(sv,            pp->dpf[0][q*3 + p7X_I]);              /* insert odds implicitly 1.0 */
	  //	  sv = _mm_add_ps(sv, _mm_mul_ps(pp->dpf[0][q*3 + p7X_I], *rp)); rp++; 
	}
      esl_sse_hsum_ps(sv, &(null2[x]));
      null2[x] += xfactor;
    }
  /* now null2[x] = \frac{f_d(x)}{f_0(x)} for all x in alphabet,
   * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies
   * for this envelope.
   */

  /* make valid scores for all degeneracies, by averaging the odds ratios. */
  esl_abc_FAvgScVec(om->abc, null2);
  null2[om->abc->K]    = 1.0;        /* gap character    */
  null2[om->abc->Kp-2] = 1.0;	     /* nonresidue "*"   */
  null2[om->abc->Kp-1] = 1.0;	     /* missing data "~" */

  return eslOK;
}
Example #7
// 128 bits
inline vec4 operator+(vec4 a, vec4 b) { return _mm_add_ps(a, b); }
// use MMX/SSE extensions
//
// (a + jb)(c + jd) = (ac - bd) + j(ad + bc)
//
// mm_x  = { x[0].real, x[0].imag, x[1].real, x[1].imag }
// mm_hi = { h[0].real, h[0].real, h[1].real, h[1].real }
// mm_hq = { h[0].imag, h[0].imag, h[1].imag, h[1].imag }
//
// mm_y0 = mm_x * mm_hi
//       = { x[0].real * h[0].real,
//           x[0].imag * h[0].real,
//           x[1].real * h[1].real,
//           x[1].imag * h[1].real };
//
// mm_y1 = mm_x * mm_hq
//       = { x[0].real * h[0].imag,
//           x[0].imag * h[0].imag,
//           x[1].real * h[1].imag,
//           x[1].imag * h[1].imag };
//
void dotprod_cccf_execute_mmx(dotprod_cccf _q,
                              float complex * _x,
                              float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // temporary buffers
    __m128 v;   // input vector
    __m128 hi;  // coefficients vector (real)
    __m128 hq;  // coefficients vector (imag)
    __m128 ci;  // output multiplication (v * hi)
    __m128 cq;  // output multiplication (v * hq)

    // aligned output array
    float w[4] __attribute__((aligned(16))) = {0,0,0,0};

#if HAVE_PMMINTRIN_H
    // SSE3
    __m128 s;   // dot product
    __m128 sum = _mm_setzero_ps(); // load zeros into sum register
#else
    // no SSE3
    float wi[4] __attribute__((aligned(16)));
    float wq[4] __attribute__((aligned(16)));
#endif

    // t = 4*(floor(_n/4))
    unsigned int t = (n >> 2) << 2;

    //
    unsigned int i;
    for (i=0; i<t; i+=4) {
        // load inputs into register (unaligned)
        // {x[0].real, x[0].imag, x[1].real, x[1].imag}
        v = _mm_loadu_ps(&x[i]);

        // load coefficients into register (aligned)
        hi = _mm_load_ps(&_q->hi[i]);
        hq = _mm_load_ps(&_q->hq[i]);

        // compute parallel multiplications
        ci = _mm_mul_ps(v, hi);
        cq = _mm_mul_ps(v, hq);

        // shuffle values
        cq = _mm_shuffle_ps( cq, cq, _MM_SHUFFLE(2,3,0,1) );
        
#if HAVE_PMMINTRIN_H
        // SSE3: combine using addsub_ps()
        s = _mm_addsub_ps( ci, cq );

        // accumulate
        sum = _mm_add_ps(sum, s);
#else
        // no SSE3: combine using slow method
        // FIXME: implement slow method
        // unload values
        _mm_store_ps(wi, ci);
        _mm_store_ps(wq, cq);

        // accumulate
        w[0] += wi[0] - wq[0];
        w[1] += wi[1] + wq[1];
        w[2] += wi[2] - wq[2];
        w[3] += wi[3] + wq[3];
#endif
    }

#if HAVE_PMMINTRIN_H
    // unload packed array
    _mm_store_ps(w, sum);
#endif

    // add in-phase and quadrature components
    w[0] += w[2];   // I
    w[1] += w[3];   // Q

    //float complex total = *((float complex*)w);
    float complex total = w[0] + w[1] * _Complex_I;

    // cleanup
    for (i=t/2; i<_q->n; i++)
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );

    // set return value
    *_y = total;
}
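A minimal standalone sketch of the SSE3 combine step used above: after the shuffle, _mm_addsub_ps(ci, cq) yields {ci0-cq0, ci1+cq1, ci2-cq2, ci3+cq3}, which is exactly the {ac-bd, ad+bc} pattern per complex lane. Illustrative only, with a single product duplicated across the vector:

#include <pmmintrin.h>  /* SSE3 */
#include <stdio.h>

int main(void)
{
    /* (1 + 2j)(3 + 4j) = -5 + 10j */
    const float a = 1.0f, b = 2.0f, c = 3.0f, d = 4.0f;
    __m128 v  = _mm_set_ps(b, a, b, a);                    /* lanes 0..3 = {a,b,a,b} */
    __m128 hi = _mm_set1_ps(c);                            /* real coefficient       */
    __m128 hq = _mm_set1_ps(d);                            /* imaginary coefficient  */
    __m128 ci = _mm_mul_ps(v, hi);                         /* {ac, bc, ac, bc}       */
    __m128 cq = _mm_mul_ps(v, hq);                         /* {ad, bd, ad, bd}       */
    cq = _mm_shuffle_ps(cq, cq, _MM_SHUFFLE(2,3,0,1));     /* {bd, ad, bd, ad}       */
    __m128 s  = _mm_addsub_ps(ci, cq);                     /* {ac-bd, bc+ad, ...}    */
    float w[4];
    _mm_storeu_ps(w, s);
    printf("re=%f im=%f\n", w[0], w[1]);                   /* -5.0 and 10.0          */
    return 0;
}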
// use MMX/SSE extensions
void dotprod_cccf_execute_mmx4(dotprod_cccf _q,
                               float complex * _x,
                               float complex * _y)
{
    // type cast input as floating point array
    float * x = (float*) _x;

    // double effective length
    unsigned int n = 2*_q->n;

    // first cut: ...
    __m128 v0,  v1,  v2,  v3;   // input vectors
    __m128 hi0, hi1, hi2, hi3;  // coefficients vectors (real)
    __m128 hq0, hq1, hq2, hq3;  // coefficients vectors (imag)
    __m128 ci0, ci1, ci2, ci3;  // output multiplications (v * hi)
    __m128 cq0, cq1, cq2, cq3;  // output multiplications (v * hq)

    // load zeros into sum registers
    __m128 sumi = _mm_setzero_ps();
    __m128 sumq = _mm_setzero_ps();

    // r = 4*floor(n/16)
    unsigned int r = (n >> 4) << 2;

    //
    unsigned int i;
    for (i=0; i<r; i+=4) {
        // load inputs into register (unaligned)
        v0 = _mm_loadu_ps(&x[4*i+0]);
        v1 = _mm_loadu_ps(&x[4*i+4]);
        v2 = _mm_loadu_ps(&x[4*i+8]);
        v3 = _mm_loadu_ps(&x[4*i+12]);

        // load real coefficients into registers (aligned)
        hi0 = _mm_load_ps(&_q->hi[4*i+0]);
        hi1 = _mm_load_ps(&_q->hi[4*i+4]);
        hi2 = _mm_load_ps(&_q->hi[4*i+8]);
        hi3 = _mm_load_ps(&_q->hi[4*i+12]);

        // load imaginary coefficients into registers (aligned)
        hq0 = _mm_load_ps(&_q->hq[4*i+0]);
        hq1 = _mm_load_ps(&_q->hq[4*i+4]);
        hq2 = _mm_load_ps(&_q->hq[4*i+8]);
        hq3 = _mm_load_ps(&_q->hq[4*i+12]);
        
        // compute parallel multiplications (real)
        ci0 = _mm_mul_ps(v0, hi0);
        ci1 = _mm_mul_ps(v1, hi1);
        ci2 = _mm_mul_ps(v2, hi2);
        ci3 = _mm_mul_ps(v3, hi3);

        // compute parallel multiplications (imag)
        cq0 = _mm_mul_ps(v0, hq0);
        cq1 = _mm_mul_ps(v1, hq1);
        cq2 = _mm_mul_ps(v2, hq2);
        cq3 = _mm_mul_ps(v3, hq3);

        // accumulate
        sumi = _mm_add_ps(sumi, ci0);   sumq = _mm_add_ps(sumq, cq0);
        sumi = _mm_add_ps(sumi, ci1);   sumq = _mm_add_ps(sumq, cq1);
        sumi = _mm_add_ps(sumi, ci2);   sumq = _mm_add_ps(sumq, cq2);
        sumi = _mm_add_ps(sumi, ci3);   sumq = _mm_add_ps(sumq, cq3);
    }

    // shuffle values
    sumq = _mm_shuffle_ps( sumq, sumq, _MM_SHUFFLE(2,3,0,1) );

    // unload
    float wi[4] __attribute__((aligned(16)));
    float wq[4] __attribute__((aligned(16)));
    _mm_store_ps(wi, sumi);
    _mm_store_ps(wq, sumq);

    // fold down (add/sub)
    float complex total = 
        ((wi[0] - wq[0]) + (wi[2] - wq[2])) +
        ((wi[1] + wq[1]) + (wi[3] + wq[3])) * _Complex_I;

    // cleanup (note: n _must_ be even)
    // TODO : clean this method up
    for (i=2*r; i<_q->n; i++) {
        total += _x[i] * ( _q->hi[2*i] + _q->hq[2*i]*_Complex_I );
    }

    // set return value
    *_y = total;
}
Example #10
// ---------- local operators and operator-wrappers ----------
UNUSED static inline __m128
local_add_ps ( __m128 in1, __m128 in2 )
{
    return _mm_add_ps ( in1, in2 );
}
Example #11
static void display_sse2(int width, int height, float xmin, float xmax, float ymin, float ymax, int yofs, int ylim)
{
    int x, y;
    int xpos, ypos;
    float xscal = (xmax - xmin) / width;
    float yscal = (ymax - ymin) / height;

    unsigned counts[4];

#if (OPTIMIZED) && 0
    __m128 ci = (__m128){ ymin, ymin, ymin, ymin };
    __m128 di = (__m128){ yscal, yscal, yscal, yscal };
#if 0
    v4sf ci = { ymin,
                ymin,
                ymin,
                ymin };
    v4sf di = { yscal, yscal, yscal, yscal };
#endif
#endif

    for (y = yofs; y < ylim; y++) {
	{
            for (ypos = 0 ; ypos < height ; ypos++) {
                for (xpos = 0; xpos < width; xpos += 4) {
                    {
                        v4sf cr = { xmin + xpos * xscal,
                                    xmin + (xpos + 1) * xscal,
                                    xmin + (xpos + 2) * xscal,
                                    xmin + (xpos + 3) * xscal };
#if (!OPTIMIZED) || 1
                        v4sf ci = { ymin + y * yscal,
                                    ymin + y * yscal,
                                    ymin + y * yscal,
                                    ymin + y * yscal };
#endif

                        mandel_sse(cr, (v4sf)ci, counts);

                        ((unsigned *) g_x11.bitmap->data)[xpos + y * width] = cols[counts[0]];
                        ((unsigned *) g_x11.bitmap->data)[xpos + 1 + y * width] = cols[counts[1]];
                        ((unsigned *) g_x11.bitmap->data)[xpos + 2 + y * width] = cols[counts[2]];
                        ((unsigned *) g_x11.bitmap->data)[xpos + 3 + y * width] = cols[counts[3]];
                    }

                    /* Display it line-by-line for speed */
                    XPutImage(g_x11.dpy, g_x11.win, g_x11.gc, g_x11.bitmap,
                              0, y, 0, y,
                              width, 1);
                }
#if (OPTIMIZED) && 0
                ci = _mm_add_ps(ci, di);
#endif
            }
        }

        XFlush(g_x11.dpy);
    }

    return;
}
Example #12
static inline void   sacEvaluateModelSPRT(PROSAC_HEST* p){
	unsigned i;
	unsigned isInlier;
	double   lambda       = 1.0;
	double   lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
	double   lambdaAccept = ((   p->delta   ) / (    p->epsilon  ));
	float    distSq = p->maxD*p->maxD;
	float*   src = (float*)p->src;
	float*   dst = (float*)p->dst;
	float*   H   = p->H;
	
	
	p->inl      = 0;
	p->N_tested = 0;
	p->good     = 1;
	
	
	/* VECTOR */
	const __m128 distSqV=_mm_set1_ps(distSq);
	
	const __m128 H00=_mm_set1_ps(H[0]);
	const __m128 H01=_mm_set1_ps(H[1]);
	const __m128 H02=_mm_set1_ps(H[2]);
	const __m128 H10=_mm_set1_ps(H[4]);
	const __m128 H11=_mm_set1_ps(H[5]);
	const __m128 H12=_mm_set1_ps(H[6]);
	const __m128 H20=_mm_set1_ps(H[8]);
	const __m128 H21=_mm_set1_ps(H[9]);
	const __m128 H22=_mm_set1_ps(H[10]);
	
	for(i=0;i<(p->N-3) && p->good;i+=4){
		/* Backproject */
		__m128 x, y, X, Y, inter0, inter1, inter2, inter3;
		x=_mm_load_ps(src+2*i);
		y=_mm_load_ps(src+2*i+4);
		X=_mm_load_ps(dst+2*i);
		Y=_mm_load_ps(dst+2*i+4);
		
		inter0=_mm_unpacklo_ps(x,y);// y1 y0 x1 x0
		inter1=_mm_unpackhi_ps(x,y);// y3 y2 x3 x2
		inter2=_mm_unpacklo_ps(X,Y);// Y1 Y0 X1 X0
		inter3=_mm_unpackhi_ps(X,Y);// Y3 Y2 X3 X2
		
		x=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
		X=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		Y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
		
		__m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
		__m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
		__m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);
		
		__m128 recipZ = _mm_rcp_ps(reprojZ);
		reprojX = _mm_mul_ps(reprojX, recipZ);
		reprojY = _mm_mul_ps(reprojY, recipZ);
		//reprojX = _mm_div_ps(reprojX, reprojZ);
		//reprojY = _mm_div_ps(reprojY, reprojZ);
		
		reprojX = _mm_sub_ps(reprojX, X);
		reprojY = _mm_sub_ps(reprojY, Y);
		
		reprojX = _mm_mul_ps(reprojX, reprojX);
		reprojY = _mm_mul_ps(reprojY, reprojY);
		
		__m128 reprojDistV = _mm_add_ps(reprojX, reprojY);
		
		__m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
		int msk = _mm_movemask_ps(cmp);
		
		/* ... */
		/*                   0  1  2  3  4  5  6  7  8  9 10 11 12 13 14 15*/
		unsigned bitCnt[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
		p->inl     += bitCnt[msk];
		
		
		/* SPRT */
		lambda *= p->lambdaTBL[msk];
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	/* SCALAR */
	for(;i<p->N && p->good;i++){
		/* Backproject */
		float x=src[i*2],y=src[i*2+1];
		float X=dst[i*2],Y=dst[i*2+1];
		
		float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
		float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
		float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)
		
		//reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
		reprojX/=reprojZ;
		reprojY/=reprojZ;
		
		//Compute distance
		reprojX-=X;
		reprojY-=Y;
		reprojX*=reprojX;
		reprojY*=reprojY;
		float reprojDist = reprojX+reprojY;
		
		/* ... */
		isInlier    = reprojDist <= distSq;
		p->inl     += isInlier;
		
		
		/* SPRT */
		lambda *= isInlier ? lambdaAccept : lambdaReject;
		p->good = lambda <= p->A;
		/* If !p->good, the threshold A was exceeded, so we're rejecting */
	}
	
	
	p->N_tested = i;
}
Example #13
static inline int    sacIsSampleDegenerate(PROSAC_HEST* p){
	unsigned i0 = p->smpl[0], i1 = p->smpl[1], i2 = p->smpl[2], i3 = p->smpl[3];
	
	/**
	 * Pack the matches selected by the SAC algorithm.
	 * Must be packed  points[0:7]  = {srcx0, srcy0, srcx1, srcy1, srcx2, srcy2, srcx3, srcy3}
	 *                 points[8:15] = {dstx0, dsty0, dstx1, dsty1, dstx2, dsty2, dstx3, dsty3}
	 * Gather 4 points into the vector
	 */
	
	__m128 src10 = _mm_loadl_pi(src10, (__m64*)&p->src[i0]);
	src10        = _mm_loadh_pi(src10, (__m64*)&p->src[i1]);
	__m128 src32 = _mm_loadl_pi(src32, (__m64*)&p->src[i2]);
	src32        = _mm_loadh_pi(src32, (__m64*)&p->src[i3]);
	__m128 dst10 = _mm_loadl_pi(dst10, (__m64*)&p->dst[i0]);
	dst10        = _mm_loadh_pi(dst10, (__m64*)&p->dst[i1]);
	__m128 dst32 = _mm_loadl_pi(dst32, (__m64*)&p->dst[i2]);
	dst32        = _mm_loadh_pi(dst32, (__m64*)&p->dst[i3]);
	
	
	/**
	 * If the matches' source points have common x and y coordinates, abort.
	 */
	
	/**
	 * Check:
	 * packedPoints[0].x == packedPoints[2].x
	 * packedPoints[0].y == packedPoints[2].y
	 * packedPoints[1].x == packedPoints[3].x
	 * packedPoints[1].y == packedPoints[3].y
	 */
	
	__m128 chkEq0 = _mm_cmpeq_ps(src10, src32);
	
	/**
	 * Check:
	 * packedPoints[1].x == packedPoints[2].x
	 * packedPoints[1].y == packedPoints[2].y
	 * packedPoints[0].x == packedPoints[3].x
	 * packedPoints[0].y == packedPoints[3].y
	 */
	
	__m128 chkEq1 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src10, _MM_SHUFFLE(1, 0, 3, 2)), src32);
	
	/**
	 * Check:
	 * packedPoints[0].x == packedPoints[1].x
	 * packedPoints[0].y == packedPoints[1].y
	 * packedPoints[2].x == packedPoints[3].x
	 * packedPoints[2].y == packedPoints[3].y
	 */
	
	__m128 chkEq2 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src32, _MM_SHUFFLE(1, 0, 1, 0)),
	                             _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 2, 3, 2)));
	
	/* Verify */
	if(_mm_movemask_ps(_mm_or_ps(chkEq0, _mm_or_ps(chkEq1, chkEq2)))){
		return 1;
	}
	
	/* If the matches do not satisfy the strong geometric constraint, abort. */
	
	/**
	 * p6420x   = (p6.x, p4.x, p2.x, p0.x)
	 * p6420y   = (p6.y, p4.y, p2.y, p0.y)
	 * p7531x   = (p7.x, p5.x, p3.x, p1.x)
	 * p7531y   = (p7.y, p5.y, p3.y, p1.y)
	 * crosssd0 = p6420y - p7531y                     = (cross2d0, cross0d0, cross2s0, cross0s0)
	 * crosssd1 = p7531x - p6420x                     = (cross2d1, cross0d1, cross2s1, cross0s1)
	 * crosssd2 = p6420x * p7531y  -  p6420y * p7531x = (cross2d2, cross0d2, cross2s2, cross0s2)
	 * 
	 * shufcrosssd0 = (cross0d0, cross2d0, cross0s0, cross2s0)
	 * shufcrosssd1 = (cross0d1, cross2d1, cross0s1, cross2s1)
	 * shufcrosssd2 = (cross0d2, cross2d2, cross0s2, cross2s2)
	 * 
	 * dotsd0   = shufcrosssd0 * p6420x +
	 *            shufcrosssd1 * p6420y + 
	 *            shufcrosssd2
	 *          = (dotd0, dotd2, dots0, dots2)
	 * dotsd1   = shufcrosssd0 * p7531x +
	 *            shufcrosssd1 * p7531y + 
	 *            shufcrosssd2
	 *          = (dotd1, dotd3, dots1, dots3)
	 * 
	 * dots     = shufps(dotsd0, dotsd1, _MM_SHUFFLE(1, 0, 1, 0))
	 * dotd     = shufps(dotsd0, dotsd1, _MM_SHUFFLE(3, 2, 3, 2))
	 *            movmaskps(dots ^ dotd)
	 */
	
	__m128 p3210x       = _mm_shuffle_ps(src10,  src32,  _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p3210y       = _mm_shuffle_ps(src10,  src32,  _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p7654x       = _mm_shuffle_ps(dst10,  dst32,  _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p7654y       = _mm_shuffle_ps(dst10,  dst32,  _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p6420x       = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p6420y       = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(2, 0, 2, 0));
	__m128 p7531x       = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(3, 1, 3, 1));
	__m128 p7531y       = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(3, 1, 3, 1));
	
	__m128 crosssd0     = _mm_sub_ps(p6420y, p7531y);
	__m128 crosssd1     = _mm_sub_ps(p7531x, p6420x);
	__m128 crosssd2     = _mm_sub_ps(_mm_mul_ps(p6420x, p7531y), _mm_mul_ps(p6420y, p7531x));
	
	__m128 shufcrosssd0 = _mm_shuffle_ps(crosssd0, crosssd0, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 shufcrosssd1 = _mm_shuffle_ps(crosssd1, crosssd1, _MM_SHUFFLE(2, 3, 0, 1));
	__m128 shufcrosssd2 = _mm_shuffle_ps(crosssd2, crosssd2, _MM_SHUFFLE(2, 3, 0, 1));
	
	__m128 dotsd0       = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p6420x),
	                                            _mm_mul_ps(shufcrosssd1, p6420y)),
	                                 shufcrosssd2);
	__m128 dotsd1       = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p7531x),
	                                            _mm_mul_ps(shufcrosssd1, p7531y)),
	                                 shufcrosssd2);
	
	__m128 dots         = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(0, 1, 0, 1));
	__m128 dotd         = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(2, 3, 2, 3));
	
	//if(_mm_movemask_ps(_mm_cmpge_ps(_mm_setzero_ps(), _mm_mul_ps(dots, dotd)))){
	if(_mm_movemask_epi8(_mm_cmplt_epi32(_mm_xor_si128(_mm_cvtps_epi32(dots), _mm_cvtps_epi32(dotd)), _mm_setzero_si128()))){
		return 1;
	}
	
	
	/* Otherwise, proceed with evaluation */
	_mm_store_ps((float*)&p->pkdPts[0], src10);
	_mm_store_ps((float*)&p->pkdPts[2], src32);
	_mm_store_ps((float*)&p->pkdPts[4], dst10);
	_mm_store_ps((float*)&p->pkdPts[6], dst32);
	
	return 0;
}
Example #14
double mscore_tandem::dot(unsigned long *_v)
{
	float fScore = 0.0;
	unsigned long a = 0;
	unsigned long lCount = 0;
	long lType = 0;
	vector<MIType>::iterator itType = m_vmiType[m_lId].begin();
	// tType and tTypeSize were added in 2006.09.01 to correct a problem
	// created by VC++ 2005. This new version uses a strict bounds-checking
	// style for STL iterators, which causes a run-time error if an iterator
	// is incremented past .end().
	size_t tType = 0;
	const size_t tTypeSize = m_vmiType[m_lId].size();
	unsigned long *pType = NULL;
	if(!m_pplType[m_lId])	{
		m_pplType[m_lId] = new unsigned long[tTypeSize];
		pType = m_pplType[m_lId];
		while(tType < tTypeSize)	{
			pType[tType] = itType->m_lM;
			++itType;
			tType++;
		}
	}
	else	{
		pType = m_pplType[m_lId];
	}
	itType = m_vmiType[m_lId].begin();
	tType = 0;
	size_t tStep = (size_t)(0.5 + (double)tTypeSize/(double)m_lCount);
	if(tStep < 1)	{
		tStep = 1;
	}
	unsigned long lSeq = m_plSeq[a];
	while(lSeq != 0 && tType != tTypeSize)	{
		lType = 0;
		if(*pType < lSeq)	{
			lType = 1;
			// Generally lots more spectrum peaks than sequence peaks.  Trying
			// large steps first helps reduce performance hit for this operation.
			// This is where the iterator bounds checking failed in VC++ 2005.
			// By checking the size first, the iterator is not evaluated and
			// does not produce the failure.
			while (tType+tStep < tTypeSize && *(pType+tStep) < lSeq) {
				tType += tStep;
				pType += tStep;
			}
			do {
				tType++;
				pType++;
			} while(tType < tTypeSize && *(pType) < lSeq);
		}
		else if(*pType > lSeq)	{
			do {
				a++;
				lSeq = m_plSeq[a];
			} while(*pType > lSeq && lSeq != 0);
		}
		if(lSeq == 0 || tType == tTypeSize)	{
			break;
		}
		if(*pType == lSeq)	{
//			if((itType+tType)->m_fI > 0.0 && m_pfSeq[a] > 0.0)	{
				m_pafI[lCount] = (itType+tType)->m_fI;
				m_pafSeq[lCount] = m_pfSeq[a];
				lCount++;
//			}
		}
		if(lType)	{
			a++;
			lSeq = m_plSeq[a];
		}
		else	{
			tType++;
			pType++;
		}
	}
	*_v = lCount;
	if(lCount == 0)	{
		return fScore;
	}
#ifdef MSVC
	if(!m_uiSimd)	{
		for(a = 0; a < lCount; a++)	{
			fScore += m_pafI[a] * m_pafSeq[a];
		}
		return fScore;
	}
	if(lCount < 5)	{
		for(a = 0; a < lCount; a++)	{
			fScore += m_pafI[a] * m_pafSeq[a];
		}
		return fScore;
	}
	// pad the arrays with zeros out to a multiple of 4 floats
	for(a = 0; lCount % 4 != 0; a++)	{
		m_pafI[lCount] = 0.0;
		m_pafSeq[lCount] = 0.0;
		lCount++;
	}
	// create compatible pointers
	__m128* pSum = (__m128*) m_pafSum;
	__m128* pI = (__m128*) m_pafI;
	__m128* pS = (__m128*) m_pafSeq;
	__m128 Sum = _mm_set1_ps(0.0);
	__m128 S1 = _mm_set1_ps(0.0);
	unsigned int n = lCount/4;
	// perform intrinsic calls for SIMD registers

	for(a = 0; a < n; a++)	{
		S1 = _mm_mul_ps(*pI,*pS);
		Sum = _mm_add_ps(Sum,S1);
		pI++;
		pS++;
	}
	m_um128.m = Sum;
	fScore = m_um128.f[0] + m_um128.f[1] + m_um128.f[2] + m_um128.f[3];
#else
	for(a = 0; a < lCount; a++)	{
		fScore += m_pafI[a] * m_pafSeq[a];
	}
#endif
	return (fScore);
}
Example #15
// note: 'a' and 'b' are assumed to be __m128 arrays/pointers defined at file scope elsewhere
void test(__m128 *c) {
    *b = _mm_add_ps(a[0], c[0]);
    //printf( "%f,%f,%f,%f\n", b[0], b[0].m128_f32[1], b[0].m128_f32[2], b[0].m128_f32[3] );
}
Example #16
void __hv_biquad_f_win32(SignalBiquad *o, hv_bInf_t *_bIn, hv_bInf_t *_bX0, hv_bInf_t *_bX1, hv_bInf_t *_bX2, hv_bInf_t *_bY1, hv_bInf_t *_bY2, hv_bOutf_t bOut) {
  hv_bInf_t bIn = *_bIn;
  hv_bInf_t bX0 = *_bX0;
  hv_bInf_t bX1 = *_bX1;
  hv_bInf_t bX2 = *_bX2;
  hv_bInf_t bY1 = *_bY1;
  hv_bInf_t bY2 = *_bY2;
#else
void __hv_biquad_f(SignalBiquad *o, hv_bInf_t bIn, hv_bInf_t bX0, hv_bInf_t bX1, hv_bInf_t bX2, hv_bInf_t bY1, hv_bInf_t bY2, hv_bOutf_t bOut) {
#endif
#if HV_SIMD_AVX
  __m256 a = _mm256_mul_ps(bIn, bX0);
  __m256 b = _mm256_mul_ps(o->xm1, bX1);
  __m256 c = _mm256_mul_ps(o->xm2, bX2);
  __m256 d = _mm256_add_ps(a, b);
  __m256 e = _mm256_add_ps(c, d); // bIn*bX0 + o->x1*bX1 + o->x2*bX2
  float y0 = e[0] - o->ym1*bY1[0] - o->ym2*bY2[0];
  float y1 = e[1] - y0*bY1[1] - o->ym1*bY2[1];
  float y2 = e[2] - y1*bY1[2] - y0*bY2[2];
  float y3 = e[3] - y2*bY1[3] - y1*bY2[3];
  float y4 = e[4] - y3*bY1[4] - y2*bY2[4];
  float y5 = e[5] - y4*bY1[5] - y3*bY2[5];
  float y6 = e[6] - y5*bY1[6] - y4*bY2[6];
  float y7 = e[7] - y6*bY1[7] - y5*bY2[7];

  o->xm2 = o->xm1;
  o->xm1 = bIn;
  o->ym1 = y7;
  o->ym2 = y6;

  *bOut = _mm256_set_ps(y7, y6, y5, y4, y3, y2, y1, y0);
#elif HV_SIMD_SSE
  __m128 a = _mm_mul_ps(bIn, bX0);
  __m128 b = _mm_mul_ps(o->xm1, bX1);
  __m128 c = _mm_mul_ps(o->xm2, bX2);
  __m128 d = _mm_add_ps(a, b);
  __m128 e = _mm_add_ps(c, d);

  const float *const bbe = (float *) &e;
  const float *const bbY1 = (float *) &bY1;
  const float *const bbY2 = (float *) &bY2;

  float y0 = bbe[0] - o->ym1*bbY1[0] - o->ym2*bbY2[0];
  float y1 = bbe[1] - y0*bbY1[1] - o->ym1*bbY2[1];
  float y2 = bbe[2] - y1*bbY1[2] - y0*bbY2[2];
  float y3 = bbe[3] - y2*bbY1[3] - y1*bbY2[3];

  o->xm2 = o->xm1;
  o->xm1 = bIn;
  o->ym1 = y3;
  o->ym2 = y2;

  *bOut = _mm_set_ps(y3, y2, y1, y0);
#elif HV_SIMD_NEON
  float32x4_t a = vmulq_f32(bIn, bX0);
  float32x4_t b = vmulq_f32(o->xm1, bX1);
  float32x4_t c = vmulq_f32(o->xm2, bX2);
  float32x4_t d = vaddq_f32(a, b);
  float32x4_t e = vaddq_f32(c, d);
  float y0 = e[0] - o->ym1*bY1[0] - o->ym2*bY2[0];
  float y1 = e[1] - y0*bY1[1] - o->ym1*bY2[1];
  float y2 = e[2] - y1*bY1[2] - y0*bY2[2];
  float y3 = e[3] - y2*bY1[3] - y1*bY2[3];

  o->xm2 = o->xm1;
  o->xm1 = bIn;
  o->ym1 = y3;
  o->ym2 = y2;

  *bOut = (float32x4_t) {y0, y1, y2, y3};
#else
  const float y = bIn*bX0 + o->xm1*bX1 + o->xm2*bX2 - o->ym1*bY1 - o->ym2*bY2;
  o->xm2 = o->xm1; o->xm1 = bIn;
  o->ym2 = o->ym1; o->ym1 = y;
  *bOut = y;
#endif
}
Example #17
/* Function:  p7_Null2_ByTrace()
 * Synopsis:  Assign null2 scores to an envelope by the sampling method.
 * Incept:    SRE, Mon Aug 18 10:22:49 2008 [Janelia]
 *
 * Purpose:   Identical to <p7_GNull2_ByTrace()> except that
 *            <om>, <wrk> are SSE optimized versions of the profile
 *            and the residue posterior probability matrix. See 
 *            <p7_GNull2_ByTrace()>  documentation.
 */
int
p7_Null2_ByTrace(const P7_OPROFILE *om, const P7_TRACE *tr, int zstart, int zend, P7_OMX *wrk, float *null2)
{
  union { __m128 v; float p[4]; } u;
  int    Q  = p7O_NQF(om->M);
  int    Ld = 0;
  float *xmx = wrk->xmx;	/* enables use of XMXo macro */
  float  norm;
  float  xfactor;
  __m128 sv;
  __m128 *rp;
  int    q, r, s;
  int    x;
  int    z;

  /* We'll use the i=0 row in wrk for working space: dp[0][] and xmx[][0]. */
  for (q = 0; q < Q; q++)
    {
      wrk->dpf[0][q*3 + p7X_M] = _mm_setzero_ps();
      wrk->dpf[0][q*3 + p7X_I] = _mm_setzero_ps();
    }
  XMXo(0,p7X_N) =  0.0;
  XMXo(0,p7X_C) =  0.0;
  XMXo(0,p7X_J) =  0.0;

  /* Calculate emitting state usage in this particular trace segment */
  for (z = zstart; z <= zend; z++)
    {
      if (tr->i[z] == 0) continue; /* quick test for whether this trace elem emitted or not */
      Ld++;
      if (tr->k[z] > 0)	/* must be an M or I */
	{ /* surely there's an easier way? but our workspace is striped, interleaved quads... */
	  s = ( (tr->st[z] == p7T_M) ?  p7X_M : p7X_I);
	  q = p7X_NSCELLS * ( (tr->k[z] - 1) % Q) + s;
	  r = (tr->k[z] - 1) / Q;
	  u.v            = wrk->dpf[0][q];
	  u.p[r]        += 1.0;	/* all this to increment a count by one! */
	  wrk->dpf[0][q] = u.v;

	}
      else /* emitted an x_i with no k; must be an N,C,J */
	{
	  switch (tr->st[z]) {
	  case p7T_N: XMXo(0,p7X_N) += 1.0; break;
	  case p7T_C: XMXo(0,p7X_C) += 1.0; break;
	  case p7T_J: XMXo(0,p7X_J) += 1.0; break;
	  }
	}
    }
  norm = 1.0 / (float) Ld;
  sv = _mm_set1_ps(norm);
  for (q = 0; q < Q; q++)
    {
      wrk->dpf[0][q*3 + p7X_M] = _mm_mul_ps(wrk->dpf[0][q*3 + p7X_M], sv);
      wrk->dpf[0][q*3 + p7X_I] = _mm_mul_ps(wrk->dpf[0][q*3 + p7X_I], sv);
    }
  XMXo(0,p7X_N) *= norm;
  XMXo(0,p7X_C) *= norm;
  XMXo(0,p7X_J) *= norm;

  /* Calculate null2's emission odds, by taking posterior weighted sum
   * over all emission vectors used in paths explaining the domain.
   */
  xfactor =  XMXo(0,p7X_N) + XMXo(0,p7X_C) + XMXo(0,p7X_J);
  for (x = 0; x < om->abc->K; x++)
    {
      sv = _mm_setzero_ps();
      rp = om->rfv[x];
      for (q = 0; q < Q; q++)
	{
	  sv = _mm_add_ps(sv, _mm_mul_ps(wrk->dpf[0][q*3 + p7X_M], *rp)); rp++;
	  sv = _mm_add_ps(sv,            wrk->dpf[0][q*3 + p7X_I]); /* insert emission odds implicitly 1.0 */
	  //	  sv = _mm_add_ps(sv, _mm_mul_ps(wrk->dpf[0][q*3 + p7X_I], *rp)); rp++;
	}
      esl_sse_hsum_ps(sv, &(null2[x]));
      null2[x] += xfactor;
    }
  /* now null2[x] = \frac{f_d(x)}{f_0(x)} for all x in alphabet,
   * 0..K-1, where f_d(x) are the ad hoc "null2" residue frequencies
   * for this envelope.
   */

  /* make valid scores for all degeneracies, by averaging the odds ratios. */
  esl_abc_FAvgScVec(om->abc, null2);
  null2[om->abc->K]    = 1.0;        /* gap character    */
  null2[om->abc->Kp-2] = 1.0;	     /* nonresidue "*"   */
  null2[om->abc->Kp-1] = 1.0;	     /* missing data "~" */

  return eslOK;
}
void sgemm( int m, int n, int d, float *A, float *C )
{
    int n1 = n+1, nEnd = n/VERTICAL_ROLL*VERTICAL_ROLL;
    float *B = A, *D = C;
	#pragma omp parallel for
	 for (int j = 0; j < n; j++) {
		int jn1 = j*(n+1), jn = j*n; float *Cjn = D+jn;
		// for (int b = 0; b < m; b+= BLOCKSIZE) {
			for (int i = 0; i < nEnd; i+=VERTICAL_ROLL) {
			    float *Cjni = Cjn+i;
			    float *Cjni1 = Cjni + 4;
			    float *Cjni2 = Cjni + 8;
			    float *Cjni3 = Cjni + 12;
			    float *Cjni4 = Cjni + 16;
			    float *Cjni5 = Cjni + 20;
			    float *Cjni6 = Cjni + 24;
			    float *Cjni7 = Cjni + 28;

			    int i1 = i+4;
			    int i2 = i+8;
			    int i3 = i+12;
			    int i4 = i+16;
			    int i5 = i+20;
			    int i6 = i+24;
			    int i7 = i+28;

			    __m128 Cij = _mm_loadu_ps(Cjni);
			    __m128 Cij1 = _mm_loadu_ps(Cjni1);
			    __m128 Cij2 = _mm_loadu_ps(Cjni2);
			    __m128 Cij3 = _mm_loadu_ps(Cjni3);
			    __m128 Cij4 = _mm_loadu_ps(Cjni4);
			    __m128 Cij5 = _mm_loadu_ps(Cjni5);
			    __m128 Cij6 = _mm_loadu_ps(Cjni6);
			    __m128 Cij7 = _mm_loadu_ps(Cjni7);


			    // for (int k = b; k < b+BLOCKSIZE && k < m; k++) {
			    for (int k = 0; k < m; k++) {
					int k1 = k + 1; float *Akn = B+k*n;
					__m128 Ajk = _mm_load1_ps(Akn+jn1);

					__m128 Aik = _mm_loadu_ps(Akn+i);
					__m128 Ai1k = _mm_loadu_ps(Akn+i1);
					__m128 Ai2k = _mm_loadu_ps(Akn+i2);
					__m128 Ai3k = _mm_loadu_ps(Akn+i3);
					__m128 Ai4k = _mm_loadu_ps(Akn+i4);
					__m128 Ai5k = _mm_loadu_ps(Akn+i5);
					__m128 Ai6k = _mm_loadu_ps(Akn+i6);
					__m128 Ai7k = _mm_loadu_ps(Akn+i7);

					Cij = _mm_add_ps(Cij, _mm_mul_ps(Ajk, Aik));
					Cij1 = _mm_add_ps(Cij1, _mm_mul_ps(Ajk, Ai1k));
					Cij2 = _mm_add_ps(Cij2, _mm_mul_ps(Ajk, Ai2k));
					Cij3 = _mm_add_ps(Cij3, _mm_mul_ps(Ajk, Ai3k));
					Cij4 = _mm_add_ps(Cij4, _mm_mul_ps(Ajk, Ai4k));
					Cij5 = _mm_add_ps(Cij5, _mm_mul_ps(Ajk, Ai5k));
					Cij6 = _mm_add_ps(Cij6, _mm_mul_ps(Ajk, Ai6k));
					Cij7 = _mm_add_ps(Cij7, _mm_mul_ps(Ajk, Ai7k));
			    }
			    _mm_storeu_ps(Cjni, Cij);
			    _mm_storeu_ps(Cjni1, Cij1);
			    _mm_storeu_ps(Cjni2, Cij2);
			    _mm_storeu_ps(Cjni3, Cij3);
			    _mm_storeu_ps(Cjni4, Cij4);
			    _mm_storeu_ps(Cjni5, Cij5);
			    _mm_storeu_ps(Cjni6, Cij6);
			    _mm_storeu_ps(Cjni7, Cij7);
			}
		// }
    }
    if (n % VERTICAL_ROLL != 0 && (n - (nEnd) >= 4)) {
		#pragma omp parallel for
		for (int j = 0; j < n; j++) {
			for (int i = nEnd; i < n/4*4; i+=4) {
				float *addrCij = D+i+j*n;
				float *Ajn1 = B+j*n1;
				float *Ai = A+i;
				__m128 Cij = _mm_loadu_ps(addrCij);
				for (int k = 0; k < m; k++) {
				    int kn = k*n;				    
				    __m128 Ajk = _mm_load1_ps(Ajn1+k*n);
				    __m128 Aik = _mm_loadu_ps(Ai+k*n);
				    Cij = _mm_add_ps(Cij, _mm_mul_ps(Ajk, Aik));
				}
				_mm_storeu_ps(addrCij, Cij);
			}
		}
    }
    if ((n - nEnd) % 4 != 0) {
		#pragma omp parallel for
		for (int j = 0; j < n; j++) {
		    float *Ajn1 = B+j*n1;
		    for (int i = n/4*4; i < n; i++) {
			float *addrCij = D+i+j*n;
			float *Ajn1 = B+j*n1;
			float *Ai = B+i;
			__m128 Cij = _mm_loadu_ps(addrCij);
			for (int k = 0; k < m; k++) {
			    int kn = k*n;
			    __m128 Ajk = _mm_load1_ps(Ajn1+kn);
			    __m128 Aik = _mm_loadu_ps(Ai+kn);
			    Cij = _mm_add_ps(Cij, _mm_mul_ps(Ajk, Aik));
			}
			_mm_store_ss(addrCij, Cij);
		    }
		}	
	}	
}	
Example #19
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  dt_iop_soften_data_t *data = (dt_iop_soften_data_t *)piece->data;
  float *in  = (float *)ivoid;
  float *out = (float *)ovoid;
  const int ch = piece->colors;

  const float brightness = 1.0 / exp2f ( -data->brightness );
  const float saturation = data->saturation/100.0;
  /* create overexposed image and then blur */
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(in,out,roi_out) schedule(static)
#endif
  for(size_t k=0; k<(size_t)roi_out->width*roi_out->height; k++)
  {
    size_t index = ch*k;
    float h,s,l;
    rgb2hsl(&in[index],&h,&s,&l);
    s*=saturation;
    l*=brightness;
    hsl2rgb(&out[index],h,CLIP(s),CLIP(l));
  }

  const float w = piece->iwidth*piece->iscale;
  const float h = piece->iheight*piece->iscale;
  int mrad = sqrt( w*w + h*h) * 0.01;
  int rad = mrad*(fmin(100.0,data->size+1)/100.0);
  const int radius = MIN(mrad, ceilf(rad * roi_in->scale / piece->iscale));

  const int size = roi_out->width > roi_out->height ? roi_out->width : roi_out->height;

  for(int iteration=0; iteration<BOX_ITERATIONS; iteration++)
  {
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(out,roi_out) schedule(static)
#endif
    /* horizontal blur out into out */
    for(int y=0; y<roi_out->height; y++)
    {
      __m128 scanline[size];
      size_t index = (size_t)y * roi_out->width;
      __m128 L = _mm_setzero_ps();
      int hits = 0;
      for(int x=-radius; x<roi_out->width; x++)
      {
        int op = x - radius-1;
        int np = x+radius;
        if(op>=0)
        {
          L = _mm_sub_ps(L, _mm_load_ps(&out[(index+op)*ch]));
          hits--;
        }
        if(np < roi_out->width)
        {
          L =  _mm_add_ps(L, _mm_load_ps(&out[(index+np)*ch]));
          hits++;
        }
        if(x>=0)
          scanline[x] = _mm_div_ps(L, _mm_set_ps1(hits));
      }

      for (int x=0; x<roi_out->width; x++)
        _mm_store_ps(&out[(index+x)*ch], scanline[x]);
    }

    /* vertical pass on blurlightness */
    const int opoffs = -(radius+1)*roi_out->width;
    const int npoffs = (radius)*roi_out->width;
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(out,roi_out) schedule(static)
#endif
    for(int x=0; x < roi_out->width; x++)
    {
      __m128 scanline[size];
      __m128 L = _mm_setzero_ps();
      int hits=0;
      size_t index = (size_t)x - radius*roi_out->width;
      for(int y=-radius; y<roi_out->height; y++)
      {
        int op=y-radius-1;
        int np= y + radius;

        if(op>=0)
        {
          L = _mm_sub_ps(L, _mm_load_ps(&out[(index+opoffs)*ch]));
          hits--;
        }
        if(np < roi_out->height)
        {
          L = _mm_add_ps(L, _mm_load_ps(&out[(index+npoffs)*ch]));
          hits++;
        }
        if(y>=0)
          scanline[y] = _mm_div_ps(L, _mm_set_ps1(hits));
        index += roi_out->width;
      }

      for (int y=0; y<roi_out->height; y++)
        _mm_store_ps(&out[((size_t)y*roi_out->width+x)*ch], scanline[y]);
    }
  }


  const __m128 amount = _mm_set1_ps(data->amount/100.0);
  const __m128 amount_1 = _mm_set1_ps(1-(data->amount)/100.0);
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(roi_out, in, out, data) schedule(static)
#endif
  for(size_t k=0; k<(size_t)roi_out->width*roi_out->height; k++)
  {
    int index = ch*k;
    _mm_store_ps(&out[index],
                 _mm_add_ps(_mm_mul_ps(_mm_load_ps(&in[index]), amount_1),
                            _mm_mul_ps(MM_CLIP_PS(_mm_load_ps(&out[index])), amount)));
  }
}
Example #20
 /*!
  * \brief Perform a horizontal sum of the given vector.
  * \param in The input vector type
  * \return the horizontal sum of the vector
  */
 ETL_STATIC_INLINE(float) hadd(avx_simd_float in) {
     const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(in.value, 1), _mm256_castps256_ps128(in.value));
     const __m128 x64  = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
     const __m128 x32  = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
     return _mm_cvtss_f32(x32);
 }
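A minimal usage sketch of the same extract/movehl/shuffle reduction with raw intrinsics, outside the ETL wrapper type (illustrative only):

#include <immintrin.h>
#include <stdio.h>

/* Same reduction as hadd() above, on a raw __m256. */
static float hsum256(__m256 v)
{
    const __m128 x128 = _mm_add_ps(_mm256_extractf128_ps(v, 1), _mm256_castps256_ps128(v));
    const __m128 x64  = _mm_add_ps(x128, _mm_movehl_ps(x128, x128));
    const __m128 x32  = _mm_add_ss(x64, _mm_shuffle_ps(x64, x64, 0x55));
    return _mm_cvtss_f32(x32);
}

int main(void)
{
    __m256 v = _mm256_set_ps(8, 7, 6, 5, 4, 3, 2, 1);
    printf("%f\n", hsum256(v)); /* 36.0 */
    return 0;
}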
Example #21
void kernel_strmv_u_t_4_lib8(int kmax, float *A, int sda, float *x, float *y, int alg)
	{

/*	if(kmax<=0) */
/*		return;*/
	
	const int lda = 8;
/*	const int bs  = 8;*/
	
	__builtin_prefetch( A + 0*lda );
	__builtin_prefetch( A + 2*lda );

	int
		k;
	
	__m256
		zeros,
		ax_temp,
		a_00, a_01, a_02, a_03,
		x_0,
		y_0, y_1, y_2, y_3, y_4, y_5, y_6, y_7;
	
	zeros = _mm256_setzero_ps();

	y_0 = _mm256_setzero_ps();
	y_1 = _mm256_setzero_ps();
	y_2 = _mm256_setzero_ps();
	y_3 = _mm256_setzero_ps();
	y_4 = _mm256_setzero_ps();
	y_5 = _mm256_setzero_ps();
	y_6 = _mm256_setzero_ps();
	y_7 = _mm256_setzero_ps();
	
	k=0;
	for(; k<kmax-7; k+=8)
		{
		
		x_0 = _mm256_loadu_ps( &x[0] );

		__builtin_prefetch( A + sda*lda + 0*lda );
		__builtin_prefetch( A + sda*lda + 2*lda );

		a_00 = _mm256_load_ps( &A[0+lda*0] );
		ax_temp = _mm256_mul_ps( a_00, x_0 );
		y_0 = _mm256_add_ps( y_0, ax_temp );
		a_01 = _mm256_load_ps( &A[0+lda*1] );
		ax_temp = _mm256_mul_ps( a_01, x_0 );
		y_1 = _mm256_add_ps( y_1, ax_temp );
		a_02 = _mm256_load_ps( &A[0+lda*2] );
		ax_temp = _mm256_mul_ps( a_02, x_0 );
		y_2 = _mm256_add_ps( y_2, ax_temp );
		a_03 = _mm256_load_ps( &A[0+lda*3] );
		ax_temp = _mm256_mul_ps( a_03, x_0 );
		y_3 = _mm256_add_ps( y_3, ax_temp );
		
		A += sda*lda;
		x += 8;

		}

	x_0 = _mm256_loadu_ps( &x[0] );

	a_00 = _mm256_load_ps( &A[0+lda*0] );
	a_00 = _mm256_blend_ps( zeros, a_00, 0x01 );
	ax_temp = _mm256_mul_ps( a_00, x_0 );
	y_0 = _mm256_add_ps( y_0, ax_temp );
	a_01 = _mm256_load_ps( &A[0+lda*1] );
	a_01 = _mm256_blend_ps( zeros, a_01, 0x03 );
	ax_temp = _mm256_mul_ps( a_01, x_0 );
	y_1 = _mm256_add_ps( y_1, ax_temp );
	a_02 = _mm256_load_ps( &A[0+lda*2] );
	a_02 = _mm256_blend_ps( zeros, a_02, 0x07 );
	ax_temp = _mm256_mul_ps( a_02, x_0 );
	y_2 = _mm256_add_ps( y_2, ax_temp );
	a_03 = _mm256_load_ps( &A[0+lda*3] );
	a_03 = _mm256_blend_ps( zeros, a_03, 0x0f );
	ax_temp = _mm256_mul_ps( a_03, x_0 );
	y_3 = _mm256_add_ps( y_3, ax_temp );

	// reduction
	__m128
		z_0, z_1;

	y_0 = _mm256_hadd_ps(y_0, y_1);
	y_2 = _mm256_hadd_ps(y_2, y_3);

	y_0 = _mm256_hadd_ps(y_0, y_2);

	y_1 = _mm256_permute2f128_ps(y_0, y_0, 0x01);
	
	z_0 = _mm256_castps256_ps128(y_0);
	z_1 = _mm256_castps256_ps128(y_1);
	
	z_1 = _mm_add_ps(z_0, z_1);

	if(alg==0)
		{
		_mm_storeu_ps(&y[0], z_1);
		}
	else if(alg==1)
		{
		z_0 = _mm_loadu_ps( &y[0] );

		z_0 = _mm_add_ps(z_0, z_1);

		_mm_storeu_ps(&y[0], z_0);
		}
	else // alg==-1
		{
		z_0 = _mm_loadu_ps( &y[0] );

		z_0 = _mm_sub_ps(z_0, z_1);

		_mm_storeu_ps(&y[0], z_0);
		}

	}
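A minimal sketch of the _mm256_blend_ps masking used above: bit j of the immediate selects lane j from the second operand, so the masks 0x01/0x03/0x07/0x0f keep the first 1, 2, 3, or 4 lanes of the matrix column and zero the rest, which handles the triangular edge of the panel (illustrative only):

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
    __m256 zeros = _mm256_setzero_ps();
    __m256 col   = _mm256_set_ps(8, 7, 6, 5, 4, 3, 2, 1); /* lanes 0..7 = 1..8 */
    __m256 kept  = _mm256_blend_ps(zeros, col, 0x07);     /* keep lanes 0..2   */
    float w[8];
    _mm256_storeu_ps(w, kept);
    for (int i = 0; i < 8; i++)
        printf("%g ", w[i]);                               /* 1 2 3 0 0 0 0 0   */
    printf("\n");
    return 0;
}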
Example #22
	float Matrix4_M128::Inverse(Matrix4_M128 &mOut) const
	{
		__m128 Fac0;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(3, 3, 3, 3));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(2, 2, 2, 2));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(2, 2, 2, 2));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(3, 3, 3, 3));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac0 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 Fac1;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(3, 3, 3, 3));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(1, 1, 1, 1));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(1, 1, 1, 1));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(3, 3, 3, 3));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac1 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 Fac2;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(2, 2, 2, 2));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(1, 1, 1, 1));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(1, 1, 1, 1));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(2, 2, 2, 2));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac2 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 Fac3;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(3, 3, 3, 3));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(0, 0, 0, 0));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(0, 0, 0, 0));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(3, 3, 3, 3));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac3 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 Fac4;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(2, 2, 2, 2));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(0, 0, 0, 0));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(0, 0, 0, 0));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(2, 2, 2, 2));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac4 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 Fac5;
		{
			__m128 Swp0a = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(1, 1, 1, 1));
			__m128 Swp0b = _mm_shuffle_ps(C4, C3, _MM_SHUFFLE(0, 0, 0, 0));

			__m128 Swp00 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(0, 0, 0, 0));
			__m128 Swp01 = _mm_shuffle_ps(Swp0a, Swp0a, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp02 = _mm_shuffle_ps(Swp0b, Swp0b, _MM_SHUFFLE(2, 0, 0, 0));
			__m128 Swp03 = _mm_shuffle_ps(C3, C2, _MM_SHUFFLE(1, 1, 1, 1));

			__m128 Mul00 = _mm_mul_ps(Swp00, Swp01);
			__m128 Mul01 = _mm_mul_ps(Swp02, Swp03);
			Fac5 = _mm_sub_ps(Mul00, Mul01);
		}

		__m128 SignA = _mm_set_ps( 1.0f,-1.0f, 1.0f,-1.0f);
		__m128 SignB = _mm_set_ps(-1.0f, 1.0f,-1.0f, 1.0f);

		__m128 Temp0 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Vec0 = _mm_shuffle_ps(Temp0, Temp0, _MM_SHUFFLE(2, 2, 2, 0));

		__m128 Temp1 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(1, 1, 1, 1));
		__m128 Vec1 = _mm_shuffle_ps(Temp1, Temp1, _MM_SHUFFLE(2, 2, 2, 0));

		__m128 Temp2 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(2, 2, 2, 2));
		__m128 Vec2 = _mm_shuffle_ps(Temp2, Temp2, _MM_SHUFFLE(2, 2, 2, 0));

		__m128 Temp3 = _mm_shuffle_ps(C2, C1, _MM_SHUFFLE(3, 3, 3, 3));
		__m128 Vec3 = _mm_shuffle_ps(Temp3, Temp3, _MM_SHUFFLE(2, 2, 2, 0));

		__m128 Mul00 = _mm_mul_ps(Vec1, Fac0);
		__m128 Mul01 = _mm_mul_ps(Vec2, Fac1);
		__m128 Mul02 = _mm_mul_ps(Vec3, Fac2);
		__m128 Sub00 = _mm_sub_ps(Mul00, Mul01);
		__m128 Add00 = _mm_add_ps(Sub00, Mul02);
		__m128 Inv0 = _mm_mul_ps(SignB, Add00);

		__m128 Mul03 = _mm_mul_ps(Vec0, Fac0);
		__m128 Mul04 = _mm_mul_ps(Vec2, Fac3);
		__m128 Mul05 = _mm_mul_ps(Vec3, Fac4);
		__m128 Sub01 = _mm_sub_ps(Mul03, Mul04);
		__m128 Add01 = _mm_add_ps(Sub01, Mul05);
		__m128 Inv1 = _mm_mul_ps(SignA, Add01);

		__m128 Mul06 = _mm_mul_ps(Vec0, Fac1);
		__m128 Mul07 = _mm_mul_ps(Vec1, Fac3);
		__m128 Mul08 = _mm_mul_ps(Vec3, Fac5);
		__m128 Sub02 = _mm_sub_ps(Mul06, Mul07);
		__m128 Add02 = _mm_add_ps(Sub02, Mul08);
		__m128 Inv2 = _mm_mul_ps(SignB, Add02);

		__m128 Mul09 = _mm_mul_ps(Vec0, Fac2);
		__m128 Mul10 = _mm_mul_ps(Vec1, Fac4);
		__m128 Mul11 = _mm_mul_ps(Vec2, Fac5);
		__m128 Sub03 = _mm_sub_ps(Mul09, Mul10);
		__m128 Add03 = _mm_add_ps(Sub03, Mul11);
		__m128 Inv3 = _mm_mul_ps(SignA, Add03);

		__m128 Row0 = _mm_shuffle_ps(Inv0, Inv1, _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Row1 = _mm_shuffle_ps(Inv2, Inv3, _MM_SHUFFLE(0, 0, 0, 0));
		__m128 Row2 = _mm_shuffle_ps(Row0, Row1, _MM_SHUFFLE(2, 0, 2, 0));

		// Det0 = dot(C1, Row2)
		__m128 mul0 = _mm_mul_ps(C1, Row2);
		__m128 swp0 = _mm_shuffle_ps(mul0, mul0, _MM_SHUFFLE(2, 3, 0, 1));
		__m128 add0 = _mm_add_ps(mul0, swp0);
		__m128 swp1 = _mm_shuffle_ps(add0, add0, _MM_SHUFFLE(0, 1, 2, 3));
		__m128 Det0 = _mm_add_ps(add0, swp1);

		__m128 Rcp0 = _mm_div_ps(VecOne, Det0);

		mOut.C1 = _mm_mul_ps(Inv0, Rcp0);
		mOut.C2 = _mm_mul_ps(Inv1, Rcp0);
		mOut.C3 = _mm_mul_ps(Inv2, Rcp0);
		mOut.C4 = _mm_mul_ps(Inv3, Rcp0);

		float retVal;
		_mm_store_ss(&retVal, Det0);
		return retVal;
	}
void sgemm( int m, int n, float *A, float *C )
{
    __m128 a;
    __m128 a1;
    __m128 a2; 
    __m128 a3;
    __m128 a4;
    __m128 a5;
    
    __m128 b;
    __m128 b1;
    __m128 b2;
    __m128 b3;
    __m128 b4;
    __m128 b5;
    __m128 b6;
    __m128 b7;
    __m128 b8;
    __m128 b9;
    __m128 b10;
    __m128 b11;
    __m128 b12;
    /*
    __m128 b13;
    __m128 b14;
    __m128 b15;
    __m128 b16;
    __m128 b17;
    __m128 b18;
    __m128 b19;
    __m128 b20;
    */
    
    __m128 c;
    __m128 c1;
    __m128 c2;
    __m128 c3;
    __m128 c4;
    
    int i, j, k, l;
    int mod = m%4;
    int end = m/4 * 4;
    int total = n*m;
    float num[4];
    float* A_address;
    float* C_address;
    int m3 = 3 * m;
    int m2 = 2 * m;
    int end1 = total/m3 * m3;
#pragma omp parallel for private(a, a1, a2, a3, b, b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, c, c1, c2, c3, c4, i, j, k, l)
    for( i = 0; i < end; i += 4 ){
	for( k = 0; k < end; k += 4 ) {
	    c1 = _mm_setzero_ps();
	    c2 = _mm_setzero_ps();
	    c3 = _mm_setzero_ps();
	    c4 = _mm_setzero_ps();
	    float* A_address1 = A + i;
	    float* A_address2 = A + k;
	    float* A_address21 = A + k + 1;
	    for( j = 0; j < end1; j += m3, A_address1 += m3, A_address2 += m3, A_address21 += m3){
		a1 = _mm_loadu_ps(A_address1);
		a2 = _mm_loadu_ps(A_address1 + m);
		a3 = _mm_loadu_ps(A_address1 + m2);
		
		b1 = _mm_load1_ps(A_address2);
		b2 = _mm_load1_ps(A_address2 + m);
		b3 = _mm_load1_ps(A_address2 + m2);
		/*
		b4 = _mm_load1_ps(A_address2 + m3);
		b5 = _mm_load1_ps(A_address2 + m4);
		*/
		
		b4 = _mm_load1_ps(A_address21);
		b5 = _mm_load1_ps(A_address21 + m);
		b6 = _mm_load1_ps(A_address21 + m2);
		/*
		b9 = _mm_load1_ps(A_address21 + m3);
		b10 = _mm_load1_ps(A_address21 + m4);
		*/
		b7 = _mm_load1_ps(A + k + 2 + j);
		b8 = _mm_load1_ps(A + k + 2 + j + m);
		b9 = _mm_load1_ps(A + k + 2 + j + m2);
		/*
		b14 = _mm_load1_ps(A + k + 2 + j + m3);
		b15 = _mm_load1_ps(A + k + 2 + j + m4);
		*/
		
		b10 = _mm_load1_ps(A + k + 3 + j);
		b11 = _mm_load1_ps(A + k + 3 + j + m);
		b12 = _mm_load1_ps(A + k + 3 + j + m2);
		/*
		b19 = _mm_load1_ps(A + k + 3 + j + m3);
		b20 = _mm_load1_ps(A + k + 3 + j + m4);
		*/
		
		c1 = _mm_add_ps(c1, _mm_mul_ps(a1, b1));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a2, b2));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a3, b3));
		/*
		c1 = _mm_add_ps(c1, _mm_mul_ps(a4, b4));
		c1 = _mm_add_ps(c1, _mm_mul_ps(a5, b5));
		*/
		c2 = _mm_add_ps(c2, _mm_mul_ps(a1, b4));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a2, b5));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a3, b6));
		/*
		c2 = _mm_add_ps(c2, _mm_mul_ps(a4, b9));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a5, b10));
		*/
		c3 = _mm_add_ps(c3, _mm_mul_ps(a1, b7));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a2, b8));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a3, b9));
		/*
		c3 = _mm_add_ps(c3, _mm_mul_ps(a4, b14));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a5, b15));
		*/
		
		c4 = _mm_add_ps(c4, _mm_mul_ps(a1, b10));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a2, b11));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a3, b12));
		/*
		c4 = _mm_add_ps(c4, _mm_mul_ps(a4, b19));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a5, b20));
		*/
		
	    }
	    for( j = end1; j < total; j += m){
		a = _mm_loadu_ps(A + i + j);
		
		b1 = _mm_load1_ps(A + k + j);
		b2 = _mm_load1_ps(A + k + 1 + j);
		b3 = _mm_load1_ps(A + k + 2 + j);
		b4 = _mm_load1_ps(A + k + 3 + j);
		
		c1 = _mm_add_ps(c1, _mm_mul_ps(a, b1));
		c2 = _mm_add_ps(c2, _mm_mul_ps(a, b2));
		c3 = _mm_add_ps(c3, _mm_mul_ps(a, b3));
		c4 = _mm_add_ps(c4, _mm_mul_ps(a, b4));
	    }
	    _mm_storeu_ps(C + i + (k)*m, c1);
	    _mm_storeu_ps(C + i + (k+1)*m, c2);
	    _mm_storeu_ps(C + i + (k+2)*m, c3);
	    _mm_storeu_ps(C + i + (k+3)*m, c4);
	}
	for(k = end; k < m; k++){
	    float* A_address1 = A + i;
	    float* A_address2 = A + k;
	    c = _mm_setzero_ps();
	    for( j = 0; j < end1; j += m3, A_address1 += m3, A_address2 += m3){
		a1 = _mm_loadu_ps(A_address1);
		a2 = _mm_loadu_ps(A + i + j + m);
		a3 = _mm_loadu_ps(A + i + j + m2);
		
		b1 = _mm_load1_ps(A_address2);
		b2 = _mm_load1_ps(A + k + j + m);
		b3 = _mm_load1_ps(A + k + j + m2);
		
		c = _mm_add_ps(c, _mm_mul_ps(a1, b1));
		c = _mm_add_ps(c, _mm_mul_ps(a2, b2));
		c = _mm_add_ps(c, _mm_mul_ps(a3, b3));
	    }
	    for( j = end1; j < total; j += m){
		a = _mm_loadu_ps(A + i + j);
		
		b = _mm_load1_ps(A + k + j);
		
		c = _mm_add_ps(c, _mm_mul_ps(a, b));
	    }
	    _mm_storeu_ps(C + i + k*m, c);
	}
    }
    if (mod != 0){
	if (mod == 3){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address),*(A_address + 1),*(A_address + 2), 0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 3; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
	else if (mod == 2){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address),*(A_address + 1),0 ,0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 2; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
	else if (mod == 1){
	    for( i = end; i < m; i +=4 ){
		for( k = 0; k < m; k++ ) {
		    A_address = A + i;
		    c = _mm_setzero_ps();
		    for( j = 0; j < total; j += m ) {
			a = _mm_setr_ps(*(A_address), 0, 0, 0);
			b = _mm_load1_ps(A + k + j);
			c = _mm_add_ps(c, _mm_mul_ps(a, b));
			A_address += m;
		    }
		    _mm_storeu_ps(num, c);
		    for (l = 0; l < 1; l ++){
			*(C + i + k*m + l) = num[l];
		    }
		}
	    }
	}
    }
}	
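The blocked kernel above computes C(i, k) = Σ_j A(i, j)·A(k, j), i.e. C = A·Aᵀ, for an m×n matrix A stored with element (r, c) at A[r + c*m]; C is m×m with the same layout. A naive scalar reference of that contract can be useful for validating the SSE/OpenMP version (the name sgemm_ref is hypothetical):

/* Naive reference: C = A * A^T, column-major (A[r + c*m], C[r + c*m]). */
void sgemm_ref(int m, int n, const float *A, float *C)
{
    for (int k = 0; k < m; k++) {
        for (int i = 0; i < m; i++) {
            float acc = 0.0f;
            for (int j = 0; j < n; j++)
                acc += A[i + j * m] * A[k + j * m];
            C[i + k * m] = acc;
        }
    }
}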
Example #24
/* The body below is a 1-D FIR/row-convolution pass wrapped in a MATLAB MEX
 * entry point: dst[i] = sum_k src[i + k] * kf[k].  The identifiers coeff,
 * _src, _dst, width and length never appear in this snippet, so they are
 * presumably file-scope variables in the original source (see the sketch
 * after this function). */
void mexFunction(int nlhs, mxArray *plhs[],
                 int nrhs, const mxArray *prhs[])
{
        const float * kf   = coeff;
        float * src = _src;
        float * dst = _dst;
        int i = 0, k, nz = length;
        
        // float delta = 0.000001f;
        __m128 d4 = _mm_setzero_ps();
        
        float * S;
        
        __m128 s0, s1, s2, s3, 
               t0, t1, t2, t3;
        __m128 f;
        
        for(i = 0; i <= width - 16; i += 16 )
        {
            s0 = d4, s1 = d4, s2 = d4, s3 = d4;

            for( k = 0; k < nz; k++ )
            {
                f = _mm_load_ss(kf + k);
                f = _mm_shuffle_ps(f, f, 0);  // broadcast kf[k] into all four lanes
                S = src + i + k;

                t0 = _mm_loadu_ps(S);
                t1 = _mm_loadu_ps(S + 4);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
                s1 = _mm_add_ps(s1, _mm_mul_ps(t1, f));

                t0 = _mm_loadu_ps(S + 8);
                t1 = _mm_loadu_ps(S + 12);
                s2 = _mm_add_ps(s2, _mm_mul_ps(t0, f));
                s3 = _mm_add_ps(s3, _mm_mul_ps(t1, f));
            }

            _mm_storeu_ps(dst + i, s0);
            _mm_storeu_ps(dst + i + 4, s1);
            _mm_storeu_ps(dst + i + 8, s2);
            _mm_storeu_ps(dst + i + 12, s3);
        }
        for( ; i <= width - 4; i += 4 )
        {
            s0 = d4;

            for( k = 0; k < nz; k++ )
            {
                f = _mm_load_ss(kf + k);
                f = _mm_shuffle_ps(f, f, 0);
                t0 = _mm_loadu_ps(src + k + i);
                s0 = _mm_add_ps(s0, _mm_mul_ps(t0, f));
            }
            _mm_storeu_ps(dst + i, s0);
        }
        
        for (; i < width; i++)
        {
            for( k = 0; k < nz; k++ )
            {
                *(dst + i) += *(src + i + k) * *(kf + k); 
            }
        }

        return;
}
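The two vector loops overwrite dst, but the scalar tail accumulates with +=, so the routine implicitly assumes dst was zero-initialized. Since coeff, _src, _dst, width and length are not declared inside the snippet, here is a hedged sketch of the file-scope context it seems to assume (names taken from the snippet, everything else guessed):

/* Hypothetical file-scope context for the MEX body above; how these are
 * filled from prhs[] is not shown in the original. */
static const float *coeff;   /* filter taps; kf aliases this                     */
static int          length;  /* number of taps (nz)                              */
static float       *_src;    /* input row; needs width + length - 1 valid floats */
static float       *_dst;    /* output row; should be zeroed (see scalar tail)   */
static int          width;   /* number of outputs to produce                     */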
Example #25
			inline Matrix4x4<float> operator*(Matrix4x4<float> const &ma) const {
				Matrix4x4<float> val;

#ifdef __SSE_AVAIL__
				__m128 m0 = _mm_load_ps(Row1);
				__m128 m1 = _mm_load_ps(Row2);
				__m128 m2 = _mm_load_ps(Row3);
				__m128 m3 = _mm_load_ps(Row4);


				__m128 brod1 = _mm_set1_ps(ma.Mat[0]);
				__m128 brod2 = _mm_set1_ps(ma.Mat[1]);
				__m128 brod3 = _mm_set1_ps(ma.Mat[2]);
				__m128 brod4 = _mm_set1_ps(ma.Mat[3]);
				__m128 row = _mm_add_ps(
					_mm_add_ps(
					_mm_mul_ps(brod1, m0),
					_mm_mul_ps(brod2, m1)),
					_mm_add_ps(
					_mm_mul_ps(brod3, m2),
					_mm_mul_ps(brod4, m3)));
				_mm_store_ps(val.Row1, row);

				brod1 = _mm_set1_ps(ma.Mat[4]);
				brod2 = _mm_set1_ps(ma.Mat[5]);
				brod3 = _mm_set1_ps(ma.Mat[6]);
				brod4 = _mm_set1_ps(ma.Mat[7]);
				row = _mm_add_ps(
					_mm_add_ps(
					_mm_mul_ps(brod1, m0),
					_mm_mul_ps(brod2, m1)),
					_mm_add_ps(
					_mm_mul_ps(brod3, m2),
					_mm_mul_ps(brod4, m3)));
				_mm_store_ps(val.Row2, row);

				brod1 = _mm_set1_ps(ma.Mat[8]);
				brod2 = _mm_set1_ps(ma.Mat[9]);
				brod3 = _mm_set1_ps(ma.Mat[10]);
				brod4 = _mm_set1_ps(ma.Mat[11]);
				row = _mm_add_ps(
					_mm_add_ps(
					_mm_mul_ps(brod1, m0),
					_mm_mul_ps(brod2, m1)),
					_mm_add_ps(
					_mm_mul_ps(brod3, m2),
					_mm_mul_ps(brod4, m3)));
				_mm_store_ps(val.Row3, row);

				brod1 = _mm_set1_ps(ma.Mat[12]);
				brod2 = _mm_set1_ps(ma.Mat[13]);
				brod3 = _mm_set1_ps(ma.Mat[14]);
				brod4 = _mm_set1_ps(ma.Mat[15]);
				row = _mm_add_ps(
					_mm_add_ps(
					_mm_mul_ps(brod1, m0),
					_mm_mul_ps(brod2, m1)),
					_mm_add_ps(
					_mm_mul_ps(brod3, m2),
					_mm_mul_ps(brod4, m3)));
				_mm_store_ps(val.Row4, row);

				
#else
				/* The original leaves this branch empty, so a non-SSE build returns val
				   unchanged (whatever the default constructor produced); a scalar sketch
				   of the missing fallback follows this operator. */
#endif

				return val;
			}
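When __SSE_AVAIL__ is not defined, the #else branch above is empty and val is returned unchanged. Below is a self-contained scalar reference of what the SSE path computes, under the assumption that Mat is a row-major float[16] aliased by Row1..Row4 (which the mixed ma.Mat[...] / val.RowN access suggests); its loop nest could serve as the missing fallback with a, b, out replaced by ma.Mat, Mat and val.Mat:

// Scalar sketch of the product formed above: out[r][c] = sum_k a[r][k] * b[k][c],
// with a supplying the broadcast coefficients (ma.Mat) and b the rows (Row1..Row4).
static void mat4_mul_scalar(const float a[16], const float b[16], float out[16])
{
    for (int r = 0; r < 4; ++r)
        for (int c = 0; c < 4; ++c) {
            float acc = 0.0f;
            for (int k = 0; k < 4; ++k)
                acc += a[4 * r + k] * b[4 * k + c];
            out[4 * r + c] = acc;
        }
}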
Example #26
static void process_sinc(rarch_sinc_resampler_t *resamp, float *out_buffer)
{
    unsigned i;
    __m128 sum;
    __m128 sum_l             = _mm_setzero_ps();
    __m128 sum_r             = _mm_setzero_ps();

    const float *buffer_l    = resamp->buffer_l + resamp->ptr;
    const float *buffer_r    = resamp->buffer_r + resamp->ptr;

    unsigned taps            = resamp->taps;
    unsigned phase           = resamp->time >> SUBPHASE_BITS;
#if SINC_COEFF_LERP
    const float *phase_table = resamp->phase_table + phase * taps * 2;
    const float *delta_table = phase_table + taps;
    __m128 delta             = _mm_set1_ps((float)
                                           (resamp->time & SUBPHASE_MASK) * SUBPHASE_MOD);
#else
    const float *phase_table = resamp->phase_table + phase * taps;
#endif

    for (i = 0; i < taps; i += 4)
    {
        __m128 buf_l = _mm_loadu_ps(buffer_l + i);
        __m128 buf_r = _mm_loadu_ps(buffer_r + i);

#if SINC_COEFF_LERP
        __m128 deltas = _mm_load_ps(delta_table + i);
        __m128 _sinc  = _mm_add_ps(_mm_load_ps(phase_table + i),
                                   _mm_mul_ps(deltas, delta));
#else
        __m128 _sinc = _mm_load_ps(phase_table + i);
#endif
        sum_l        = _mm_add_ps(sum_l, _mm_mul_ps(buf_l, _sinc));
        sum_r        = _mm_add_ps(sum_r, _mm_mul_ps(buf_r, _sinc));
    }

    /* Them annoying shuffles.
     * sum_l = { l3, l2, l1, l0 }
     * sum_r = { r3, r2, r1, r0 }
     */

    sum = _mm_add_ps(_mm_shuffle_ps(sum_l, sum_r,
                                    _MM_SHUFFLE(1, 0, 1, 0)),
                     _mm_shuffle_ps(sum_l, sum_r, _MM_SHUFFLE(3, 2, 3, 2)));

    /* sum   = { r1, r0, l1, l0 } + { r3, r2, l3, l2 }
     * sum   = { R1, R0, L1, L0 }
     */

    sum = _mm_add_ps(_mm_shuffle_ps(sum, sum, _MM_SHUFFLE(3, 3, 1, 1)), sum);

    /* sum   = {R1, R1, L1, L1 } + { R1, R0, L1, L0 }
     * sum   = { X,  R,  X,  L }
     */

    /* Store L */
    _mm_store_ss(out_buffer + 0, sum);

    /* movehl { X, R, X, L } == { X, R, X, R } */
    _mm_store_ss(out_buffer + 1, _mm_movehl_ps(sum, sum));
}
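The final shuffle/add pair is a plain-SSE horizontal reduction of the interleaved left/right sums. On SSE3 targets the same reduction could be written with haddps; a hedged alternative sketch, shown only as an illustration since the routine above sticks to pre-SSE3 intrinsics:

/* SSE3 variant of the final L/R reduction (needs <pmmintrin.h>).
 * _mm_hadd_ps(sum_l, sum_r) = { l0+l1, l2+l3, r0+r1, r2+r3 }
 * hadd of that with itself  = { L, R, L, R }                   */
__m128 lr = _mm_hadd_ps(sum_l, sum_r);
lr = _mm_hadd_ps(lr, lr);
_mm_store_ss(out_buffer + 0, lr);                                               /* L */
_mm_store_ss(out_buffer + 1, _mm_shuffle_ps(lr, lr, _MM_SHUFFLE(1, 1, 1, 1)));  /* R */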
Example #27
Vector4<float>
Vector4<float>::add(const Vector4<float>& vector) const
{
    return Vector4<float>(_mm_add_ps(this->asSSE(), vector.asSSE()));
}
Example #28
inline wg_v4sf Recognizer::dtw4(float *s, unsigned int n, 
                                float *t0, unsigned int m0,
                                float *t1, unsigned int m1,
                                float *t2, unsigned int m2,
                                float *t3, unsigned int m3) {
    /*
    Compare an input sequence with 4 reference sequences.

    For one column of the DTW matrix, MIN4(m0,m1,m2,m3) cells are calculated
    using vector instructions. The rest of the cells are calculated
    sequentially.

    s: input sequence
    n: number of vectors in s

    t0..t3: reference sequences
    m0..m3: number of vectors in each reference sequence
    */
    unsigned int i, j, common;
    wg_v4sf cost;
    float costf;
    float *t_start0, *t_start1, *t_start2, *t_start3;
    wg_v4sf *tmp;
    wg_v4sf res;

    t_start0 = t0; t_start1 = t1; t_start2 = t2; t_start3 = t3;

    /* Initialize the edge cells */
    dtw1v[0].v = _mm_set_ps1(0);
    dtw2v[0].v = _mm_set_ps1(FLT_MAX);

    for (i=1; i < MAX4(m0,m1,m2,m3); i++)
        dtw1v[i].v = _mm_set_ps1(FLT_MAX);

    s += VEC_DIM_MAX;
   
    common = MIN4(m0,m1,m2,m3);

    /* Iterate over columns */
    for (i=1; i < n; i++) {
        t0 = t_start0 + VEC_DIM_MAX; t1 = t_start1 + VEC_DIM_MAX;
        t2 = t_start2 + VEC_DIM_MAX; t3 = t_start3 + VEC_DIM_MAX;

        /* Iterate over cells of that column */
        /* Process 4 cells at a time in parallel */
        for (j=1; j < common; j++) {
            cost = local_distance4(s, t0, t1, t2, t3);
            /* Inductive step */
            dtw2v[j].v = _mm_add_ps(cost.v,
                                MIN3VEC(dtw2v[j-1].v,dtw1v[j].v,dtw1v[j-1].v));

            t0 += VEC_DIM_MAX; t1 += VEC_DIM_MAX;
            t2 += VEC_DIM_MAX; t3 += VEC_DIM_MAX;
        }

        /* The remaining cells are calculated sequentially */
        DTW4_PROCESS_REMAINING(0, m0, t0);
        DTW4_PROCESS_REMAINING(1, m1, t1);
        DTW4_PROCESS_REMAINING(2, m2, t2);
        DTW4_PROCESS_REMAINING(3, m3, t3);

        SWAP(dtw1v,dtw2v,tmp);
        dtw2v[0].v = _mm_set_ps1(FLT_MAX);

        s += VEC_DIM_MAX;
    }

    res.s[0] = dtw1v[m0-1].s[0]; res.s[1] = dtw1v[m1-1].s[1];
    res.s[2] = dtw1v[m2-1].s[2]; res.s[3] = dtw1v[m3-1].s[3];
    
    return res;
}
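The inductive step above is the standard DTW recurrence dtw[i][j] = cost(i, j) + min(dtw[i][j-1], dtw[i-1][j], dtw[i-1][j-1]), evaluated for four reference sequences at once (one per SSE lane). A single-reference scalar sketch of the same recurrence and edge handling, with one float per frame and an absolute-difference cost standing in for whatever local_distance4 computes per lane (the real code uses VEC_DIM_MAX-dimensional frames):

#include <float.h>
#include <math.h>
#include <stdlib.h>

/* Scalar single-reference DTW using the same recurrence and edge
 * initialization as dtw4(): cell (0,0) is 0, the rest of column 0 starts at
 * FLT_MAX, and two columns are swapped per input frame. */
static float dtw1(const float *s, unsigned n, const float *t, unsigned m)
{
    float *prev = malloc(m * sizeof *prev);   /* plays the role of dtw1v */
    float *cur  = malloc(m * sizeof *cur);    /* plays the role of dtw2v */

    prev[0] = 0.0f;
    for (unsigned j = 1; j < m; j++)
        prev[j] = FLT_MAX;

    for (unsigned i = 1; i < n; i++) {
        cur[0] = FLT_MAX;
        for (unsigned j = 1; j < m; j++) {
            float cost = fabsf(s[i] - t[j]);                    /* stand-in cost */
            float best = fminf(cur[j - 1], fminf(prev[j], prev[j - 1]));
            cur[j] = cost + best;
        }
        float *tmp = prev; prev = cur; cur = tmp;               /* SWAP()        */
    }

    float res = prev[m - 1];
    free(prev);
    free(cur);
    return res;
}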
Example #29
int conv2D(float* in, float* out, int data_size_X, int data_size_Y,
                    float* kernel)
{   
    omp_set_num_threads(16);
    float t[9];
    kernel = memcpy(t, kernel, 36);   /* copy the 3x3 kernel (9 floats = 36 bytes) into a local buffer */
    #pragma omp parallel shared(kernel, out, in)
    {
        #pragma omp for
        for (int y = 1; y < data_size_Y-1; y++){
            int lastColumn = data_size_X-1;
            float first = 0, last = 0;
            for(int rowNum = -1; rowNum <= 1; rowNum++){
                float *firsttemp =  (y+rowNum)*data_size_X+in, *lasttemp =  (y+rowNum)*data_size_X+lastColumn+in;
                int kerPositionTemp =(1-rowNum)*3;
                for(int col = -1; col <= 1; col++){
                    float *kernelPosition= (1-col)+kerPositionTemp+kernel;
                    if(y+rowNum!=data_size_Y&&y+rowNum!=-1 ){
                        if(col!=-1 && col!=data_size_X) first += *(kernelPosition) * *(firsttemp+col);                    
                        if(lastColumn+col!=-1  && lastColumn+col!=data_size_X) last += *(kernelPosition) * *(lasttemp+col);
                    }
                }
            }
            int row = y*data_size_X;
            out[row] = first;
            out[lastColumn+row] = last;
        }
    }


    #pragma omp parallel shared(kernel, out, in)
    {
        #pragma omp for
        for (int x = 0;x<data_size_X;x++){
            int  lastLineNum=data_size_Y-1;
            float first = 0, last = 0;
            for(int j = -1; j <= 1; j++){

                int firstLineTemp = j*data_size_X+x,
                    secondLineTemp = (lastLineNum+j)*data_size_X+x,
                    kerPositionTemp =(1-j)*3;

                for(int i = -1; i <= 1; i++){
                    int kernelPosition= (1-i)+kerPositionTemp;
                    if(x+i>-1 && x+i<data_size_X ){
                        if(j!=-1 && j!=data_size_Y) first += kernel[kernelPosition] * in[i + firstLineTemp];                    
                        if(lastLineNum+j!=-1 && lastLineNum+j!=data_size_Y) last  += kernel[kernelPosition] * in[i + secondLineTemp];
                    }
                }
            }
            out[x] = first;
            out[x+lastLineNum*data_size_X] = last;
        }
    }  

    float tt[9];
    kernel = memcpy(tt, kernel, 36);   /* copy the kernel again into a second local buffer before the SSE pass */

omp_set_num_threads(16);
#pragma  omp parallel 
{
    #pragma omp for 
    for (int y = 1; y < data_size_Y-1; y++){   
        int row = y*data_size_X, x = 1;
        for (; x < (data_size_X-2)/32*32; x+=32){
            __m128 vector1 = _mm_set1_ps(0.0f);
            __m128 vector2=  vector1;
            __m128 vector3 = vector1;
            __m128 vector4 = vector1;
            __m128 vector5 = vector1;
            __m128 vector6 = vector1;
            __m128 vector7 = vector1;
            __m128 vector8 = vector1;
            for (int row = -1; row < 2; row++){
                int temp = (y+row)*data_size_X;
                int kerPositionTemp = (1-row)*3;
                for (int col = -1; col < 2; col++){                 
                    __m128 kerVal = _mm_set1_ps(kernel[(1-col)+kerPositionTemp]);
                    vector1 = _mm_add_ps (vector1, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp),kerVal));
                    vector2 = _mm_add_ps (vector2, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+4),kerVal));
                    vector3 = _mm_add_ps (vector3, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+8),kerVal));
                    vector4 = _mm_add_ps (vector4, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+12),kerVal));
                    vector5 = _mm_add_ps (vector5, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+16),kerVal));
                    vector6 = _mm_add_ps (vector6, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+20),kerVal));
                    vector7 = _mm_add_ps (vector7, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+24),kerVal));
                    vector8 = _mm_add_ps (vector8, _mm_mul_ps(_mm_loadu_ps(in+x+col+temp+28),kerVal));
                }

            }
            int ot = x + row;   /* row here is the outer y*data_size_X offset, not the inner kernel row */
            _mm_storeu_ps(out+ot, vector1);
            _mm_storeu_ps(out+ot+4, vector2);
            _mm_storeu_ps(out+ot+8, vector3);
            _mm_storeu_ps(out+ot+12, vector4);
            _mm_storeu_ps(out+ot+16, vector5);
            _mm_storeu_ps(out+ot+20, vector6);
            _mm_storeu_ps(out+ot+24, vector7);
            _mm_storeu_ps(out+ot+28, vector8);

        }
        for (;x<data_size_X-1; x++){
                for(int i = -1; i <= 1; i++){
                    for(int j = -1; j <= 1; j++){
                        out[x+y*data_size_X] +=
                        kernel[(1-i)+(1-j)*KERNX] * in[(x+i) + (y+j)*data_size_X];
                    }
                }
            }
        }
}

   


    return 1;
}
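conv2D applies a flipped 3×3 kernel (note the (1-i), (1-j) indexing), skipping out-of-bounds taps; the borders are handled by the two scalar OpenMP passes and the interior by the 32-pixel-wide SSE loop. The interior scalar tail accumulates with +=, so it assumes out was zero-initialized. A plain reference of the whole contract, handy for validating the OpenMP/SSE version (assumes KERNX == 3, consistent with the (1-j)*3 indexing above):

/* Naive full-image reference: 3x3 convolution with out-of-range taps skipped
 * (treated as zero).  Matches the per-pixel formula in the scalar tail above. */
void conv2D_ref(const float *in, float *out, int data_size_X, int data_size_Y,
                const float *kernel)
{
    for (int y = 0; y < data_size_Y; y++) {
        for (int x = 0; x < data_size_X; x++) {
            float acc = 0.0f;
            for (int j = -1; j <= 1; j++) {
                for (int i = -1; i <= 1; i++) {
                    if (x + i < 0 || x + i >= data_size_X ||
                        y + j < 0 || y + j >= data_size_Y)
                        continue;
                    acc += kernel[(1 - i) + (1 - j) * 3]
                         * in[(x + i) + (y + j) * data_size_X];
                }
            }
            out[x + y * data_size_X] = acc;
        }
    }
}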
Example #30
static void GF_FUNC_ALIGN VS_CC
proc_16bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width,
                int height, int stride, uint8_t *d, const uint8_t *s)
{
    const uint16_t *srcp = (uint16_t *)s;
    uint16_t *dstp = (uint16_t *)d;
    stride /= 2;
    bstride /= 2;

    uint16_t *p0 = (uint16_t *)buff + 8;
    uint16_t *p1 = p0 + bstride;
    uint16_t *p2 = p1 + bstride;
    uint16_t *p3 = p2 + bstride;
    uint16_t *p4 = p3 + bstride;
    uint16_t *orig = p0, *end = p4;

    line_copy16(p0, srcp + 2 * stride, width, 2);
    line_copy16(p1, srcp + stride, width, 2);
    line_copy16(p2, srcp, width, 2);
    srcp += stride;
    line_copy16(p3, srcp, width, 2);

    __m128i zero = _mm_setzero_si128();
    __m128i all1 = _mm_cmpeq_epi32(zero, zero);
    __m128i one = _mm_srli_epi32(all1, 31);
    __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h);
    __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v);
    __m128 bias = _mm_set1_ps((float)ch->bias);

    __m128i matrix_h[5];
    __m128i matrix_v[5];
    /* The 16-bit path stores each coefficient as |m| plus a sign flag, because
       the widening multiply below is unsigned (mullo/mulhi_epu16); negative taps
       are applied by negating the 32-bit product afterwards. */
    int sign_h[5];
    int sign_v[5];
    for (int i = 0; i < 5; i++) {
        sign_h[i] = ch->m_h[i] < 0 ? 1 : 0;
        sign_v[i] = ch->m_v[i] < 0 ? 1 : 0;
        uint16_t val = sign_h[i] ? (uint16_t)(ch->m_h[i] * -1) : (uint16_t)ch->m_h[i];
        matrix_h[i] = _mm_set1_epi16((int16_t)val);
        val = sign_v[i] ? (uint16_t)(ch->m_v[i] * -1) : (uint16_t)ch->m_v[i];
        matrix_v[i] = _mm_set1_epi16((int16_t)val);
    }

    for (int y = 0; y < height; y++) {
        srcp += stride * (y < height - 2 ? 1 : -1);
        line_copy16(p4, srcp, width, 2);

        for (int x = 0; x < width; x += 8) {
            uint16_t *array[] = {
                p0 + x, p1 + x, p2 + x, p3 + x, p4 + x,
                p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2
            };

            for (int j = 0; j < 2; j++) {
                __m128i *matrix = j == 0 ? matrix_v : matrix_h;
                int *sign = j == 0 ? sign_v : sign_h;
                __m128 rdiv = j == 0 ? rdiv_v : rdiv_h;
                __m128i sum[2];
                sum[0] = _mm_setzero_si128();
                sum[1] = _mm_setzero_si128();

                for (int i = 0; i < 5; i++) {
                    __m128i xmm0, xmm1, xmm2;

                    xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]);

                    xmm1 = _mm_mullo_epi16(xmm0, matrix[i]);
                    xmm0 = _mm_mulhi_epu16(xmm0, matrix[i]);
                    xmm2 = _mm_unpacklo_epi16(xmm1, xmm0);
                    xmm0 = _mm_unpackhi_epi16(xmm1, xmm0);

                    if (sign[i]) {
                        xmm2 = _mm_add_epi32(one, _mm_xor_si128(xmm2, all1));
                        xmm0 = _mm_add_epi32(one, _mm_xor_si128(xmm0, all1));
                    }
                    sum[0] = _mm_add_epi32(sum[0], xmm2);
                    sum[1] = _mm_add_epi32(sum[1], xmm0);
                }

                for (int i = 0; i < 2; i++) {
                    __m128 sumfp;
                    __m128i mask, temp;
                    sumfp = _mm_cvtepi32_ps(sum[i]);
                    sumfp = _mm_mul_ps(sumfp, rdiv);
                    if (j == 1) {
                        sumfp = _mm_add_ps(sumfp, bias);
                    }
                    sum[i] = _mm_cvttps_epi32(sumfp);

                    temp = _mm_srli_epi32(all1, 16);
                    mask = _mm_cmplt_epi32(sum[i], temp);
                    sum[i] = _mm_or_si128(_mm_and_si128(sum[i], mask),
                                          _mm_andnot_si128(mask, temp));
                    mask = _mm_cmpgt_epi32(sum[i], zero);
                    if (ch->saturate) {
                        sum[i] = _mm_and_si128(mask, sum[i]);
                    } else {
                        temp = _mm_add_epi32(one, _mm_xor_si128(sum[i], all1));
                        sum[i] = _mm_or_si128(_mm_and_si128(mask, sum[i]),
                                              _mm_andnot_si128(mask, temp));
                    }
                }

                sum[0] = mm_cast_epi32(sum[0], sum[1]);

                /* j == 0 writes the vertical result to dstp; the horizontal pass
                   (j == 1) then reads it back as its centre tap via array[7]. */
                _mm_store_si128((__m128i *)(dstp + x), sum[0]);
            }
        }
        dstp += stride;
        p0 = p1;
        p1 = p2;
        p2 = p3;
        p3 = p4;
        p4 = (p4 == end) ? orig : p4 + bstride;
    }
}