Пример #1
// Applies a fixed-level threshold to single-precision float matrix data.
//
// _src   : input matrix; its data pointer is read as float.
// _dst   : output matrix; its data pointer is written as float.
// thresh : level every element is compared against.
// maxval : value written for "on" elements in THRESH_BINARY / THRESH_BINARY_INV.
// type   : thresholding mode; the visible cases are THRESH_BINARY,
//          THRESH_BINARY_INV, THRESH_TRUNC, THRESH_TOZERO and THRESH_TOZERO_INV.
//
// NOTE(review): no braces, break statements or #endif lines are visible in
// this excerpt, so the scoping described below is read from indentation only.
static void
thresh_32f( const Mat& _src, Mat& _dst, float thresh, float maxval, int type )
    int i, j;
    Size roi = _src.size();
    // Fold the channel count into the width so a row is handled as a flat run of floats.
    roi.width *= _src.channels();
    const float* src = (const float*)_src.data;
    float* dst = (float*)_dst.data;
    // Row strides converted from bytes to float elements.
    size_t src_step = _src.step/sizeof(src[0]);
    size_t dst_step = _dst.step/sizeof(dst[0]);

#if CV_SSE2
    // Runtime CPU check; the flag gates the vectorised loops below.
    volatile bool useSIMD = checkHardwareSupport(CV_CPU_SSE);

    // When both matrices are continuous the ROI is flattened into one long row.
    if( _src.isContinuous() && _dst.isContinuous() )
        roi.width *= roi.height;
        roi.height = 1;

    // Platform-specific implementation; what happens when it succeeds is not visible here.
    if (tegra::thresh_32f(_src, _dst, roi.width, roi.height, thresh, maxval, type))

#if defined(HAVE_IPP)
    // IPP path: strides are converted back to bytes for the ippi* calls.
    // NOTE(review): only the THRESH_TRUNC case label is visible; the labels for
    // the LTVal and GTVal calls and the handling of their results are missing.
    IppiSize sz = { roi.width, roi.height };
    switch( type )
    case THRESH_TRUNC:
        if (0 <= ippiThreshold_GT_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh))
        if (0 <= ippiThreshold_LTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh+FLT_EPSILON, 0))
        if (0 <= ippiThreshold_GTVal_32f_C1R(src, (int)src_step*sizeof(src[0]), dst, (int)dst_step*sizeof(dst[0]), sz, thresh, 0))

    // Generic path. Every case walks the rows, handles 8 floats per iteration
    // with two unaligned 4-wide SSE loads when useSIMD is set, and finishes the
    // remaining elements of the row with a scalar loop.
    switch( type )
        // dst = src > thresh ? maxval : 0
        case THRESH_BINARY:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
                j = 0;
#if CV_SSE2
                if( useSIMD )
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        // The compare yields an all-ones/all-zeros lane mask ...
                        v0 = _mm_cmpgt_ps( v0, thresh4 );
                        v1 = _mm_cmpgt_ps( v1, thresh4 );
                        // ... and ANDing it with maxval4 leaves maxval or 0 in each lane.
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] > thresh ? maxval : 0;

        // dst = src <= thresh ? maxval : 0
        case THRESH_BINARY_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
                j = 0;
#if CV_SSE2
                if( useSIMD )
                    __m128 thresh4 = _mm_set1_ps(thresh), maxval4 = _mm_set1_ps(maxval);
                    for( ; j <= roi.width - 8; j += 8 )
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_cmple_ps( v0, thresh4 );
                        v1 = _mm_cmple_ps( v1, thresh4 );
                        v0 = _mm_and_ps( v0, maxval4 );
                        v1 = _mm_and_ps( v1, maxval4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );

                for( ; j < roi.width; j++ )
                    dst[j] = src[j] <= thresh ? maxval : 0;

        // dst = min(src, thresh)
        case THRESH_TRUNC:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
                j = 0;
#if CV_SSE2
                if( useSIMD )
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_min_ps( v0, thresh4 );
                        v1 = _mm_min_ps( v1, thresh4 );
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );

                for( ; j < roi.width; j++ )
                    dst[j] = std::min(src[j], thresh);

        // dst = src > thresh ? src : 0
        case THRESH_TOZERO:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
                j = 0;
#if CV_SSE2
                if( useSIMD )
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        // Keep the source value only in lanes where the compare mask is set.
                        v0 = _mm_and_ps(v0, _mm_cmpgt_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmpgt_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );

                for( ; j < roi.width; j++ )
                    float v = src[j];
                    dst[j] = v > thresh ? v : 0;

        // dst = src <= thresh ? src : 0
        case THRESH_TOZERO_INV:
            for( i = 0; i < roi.height; i++, src += src_step, dst += dst_step )
                j = 0;
#if CV_SSE2
                if( useSIMD )
                    __m128 thresh4 = _mm_set1_ps(thresh);
                    for( ; j <= roi.width - 8; j += 8 )
                        __m128 v0, v1;
                        v0 = _mm_loadu_ps( src + j );
                        v1 = _mm_loadu_ps( src + j + 4 );
                        v0 = _mm_and_ps(v0, _mm_cmple_ps(v0, thresh4));
                        v1 = _mm_and_ps(v1, _mm_cmple_ps(v1, thresh4));
                        _mm_storeu_ps( dst + j, v0 );
                        _mm_storeu_ps( dst + j + 4, v1 );
                for( ; j < roi.width; j++ )
                    float v = src[j];
                    dst[j] = v <= thresh ? v : 0;
            // Reports a bad-argument error; presumably the default case for an
            // unrecognised type (its label is not visible in this excerpt).
            return CV_Error( CV_StsBadArg, "" );
Пример #2
// Handles one 4-float SIMD group located at (x, y): adds its total-variation
// contribution to *tv, accumulates the matching derivative terms into every
// channel's obj_gradient, and stores the forward gradients in temp[0]/temp[1].
//
// w, h      : forwarded unchanged to p() for every address computation.
// nchannel  : number of entries in auxs.
//             NOTE(review): g_xs/g_ys hold only 3 vectors, so this presumably
//             must be <= 3 - confirm against callers.
// auxs      : per-channel buffers (fdata, obj_gradient, temp[0], temp[1]).
// x, y      : coordinates passed to p(); presumably the group covers pixels
//             x..x+3 of row y - TODO confirm against p()'s definition.
// tv        : running total that this call adds to.
//
// The code applies arithmetic operators and [] directly to __m128 values, which
// relies on compiler vector extensions. The _mm_load_ps/_mm_store_ps calls
// require their addresses to be 16-byte aligned.
// NOTE(review): closing braces are not visible in this excerpt; loop extents
// are read from indentation.
static void compute_step_tv_inner_simd(unsigned w, unsigned h, unsigned nchannel, struct aux auxs[nchannel], unsigned x, unsigned y, double *tv) {
        const __m128 minf = _mm_set_ps1(INFINITY);
        const __m128 mzero = _mm_set_ps1(0.);

        __m128 g_xs[3] = {0};
        __m128 g_ys[3] = {0};
        for(unsigned c = 0; c < nchannel; c++) {
                struct aux *aux = &auxs[c];
                __m128 here = _mm_load_ps(p(aux->fdata, x, y, w, h));
                // forward gradient x
                g_xs[c] = _mm_loadu_ps(p(aux->fdata, x+1, y, w, h)) - here;
                // forward gradient y
                g_ys[c] = _mm_loadu_ps(p(aux->fdata, x, y+1, w, h)) - here;
        // norm: square root of the squared gradients summed over all channels
        __m128 g_norm = mzero;
        for(unsigned c = 0; c < nchannel; c++) {
                g_norm += SQR(g_xs[c]);
                g_norm += SQR(g_ys[c]);
        g_norm = _mm_sqrt_ps(g_norm);

        // Each of the four lanes contributes alpha * norm to the total.
        float alpha = 1./sqrtf(nchannel);
        *tv += alpha * g_norm[0];
        *tv += alpha * g_norm[1];
        *tv += alpha * g_norm[2];
        *tv += alpha * g_norm[3];

        __m128 malpha = _mm_set_ps1(alpha);

        // set zeroes to infinity, so the divisions below give 0 instead of 0/0
        g_norm = _mm_or_ps(g_norm, _mm_and_ps(minf, _mm_cmpeq_ps(g_norm, mzero)));

        // compute derivatives
        for(unsigned c = 0; c < nchannel; c++) {
                __m128 g_x = g_xs[c];
                __m128 g_y = g_ys[c];
                struct aux *aux = &auxs[c];

                // N.B. for numerical stability and same exact result as the c version,
                // we must calculate the objective gradient at x+1 before x
                        // The x+1 vector overlaps the one at x, hence the unaligned access.
                        float *pobj_r = p(aux->obj_gradient, x+1, y, w, h);
                        __m128 obj_r = _mm_loadu_ps(pobj_r);
                        obj_r += malpha * g_x / g_norm;
                        _mm_storeu_ps(pobj_r, obj_r);

                        float *pobj = p(aux->obj_gradient, x, y, w, h);
                        __m128 obj = _mm_load_ps(pobj);
                        obj += malpha * -(g_x + g_y) / g_norm;
                        _mm_store_ps(pobj, obj);

                        // Aligned access: assumes the address at (x, y+1) is 16-byte aligned too.
                        float *pobj_b = p(aux->obj_gradient, x, y+1, w, h);
                        __m128 obj_b = _mm_load_ps(pobj_b);
                        obj_b += malpha * g_y / g_norm;
                        _mm_store_ps(pobj_b, obj_b);
        // store the forward gradients for later use
        for(unsigned c = 0; c < nchannel; c++) {
                struct aux *aux = &auxs[c];
                _mm_store_ps(p(aux->temp[0], x, y, w, h), g_xs[c]);
                _mm_store_ps(p(aux->temp[1], x, y, w, h), g_ys[c]);
Пример #3
/* Computes exp() for four packed single-precision floats.
 *
 * xPtr: points to the four inputs. It is dereferenced through an __m128
 *       pointer, so it presumably has to be 16-byte aligned (TODO confirm
 *       against the definition of v4sfu).
 * Returns the four results as an __m128.
 *
 * Method, as written below: clamp x, split it into n*log(2) + g with
 * n = floor(x*LOG2EF + 0.5), evaluate a degree-5 polynomial in g, then
 * multiply by 2^n built directly from the float exponent bits.
 *
 * NOTE(review): in this excerpt the MMX (non-SSE2) and SSE2 variants of the
 * floor step and of the 2^n step appear back to back with no #else/#endif
 * between them, and pow2n is declared twice; the preprocessor structure
 * seems to have been lost.
 */
__m128 exp_ps(v4sfu *xPtr) {
   __m128 x=*((__m128 *)xPtr);
   __m128 tmp = _mm_setzero_ps(), fx;
#ifdef USE_SSE2
   __m128i emm0;
   __m64 mm0, mm1;
   __m128 one = *(__m128*)_ps_1;

   /* clamp the argument to the range bounded by _ps_exp_lo and _ps_exp_hi */
   x = _mm_min_ps(x, *(__m128*)_ps_exp_hi);
   x = _mm_max_ps(x, *(__m128*)_ps_exp_lo);

   /* express exp(x) as exp(g + n*log(2)) */
   fx = _mm_mul_ps(x, *(__m128*)_ps_cephes_LOG2EF);
   fx = _mm_add_ps(fx, *(__m128*)_ps_0p5);

   /* how to perform a floorf with SSE: just below */
#ifndef USE_SSE2
   /* step 1 : cast to int */
   tmp = _mm_movehl_ps(tmp, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(tmp);
   /* step 2 : cast back to float */
   tmp = _mm_cvtpi32x2_ps(mm0, mm1);
   emm0 = _mm_cvttps_epi32(fx);
   tmp  = _mm_cvtepi32_ps(emm0);
   /* if greater, subtract 1 (truncation rounds negative values towards zero) */
   __m128 mask = _mm_cmpgt_ps(tmp, fx);    
   mask = _mm_and_ps(mask, one);
   fx = _mm_sub_ps(tmp, mask);

   /* remove n*log(2) in two steps, using the C1 and C2 constants */
   tmp = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C1);
   __m128 z = _mm_mul_ps(fx, *(__m128*)_ps_cephes_exp_C2);
   x = _mm_sub_ps(x, tmp);
   x = _mm_sub_ps(x, z);

   z = _mm_mul_ps(x,x);

   /* Horner evaluation of the polynomial p0..p5, then y = poly*x^2 + x + 1 */
   __m128 y = *(__m128*)_ps_cephes_exp_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_exp_p5);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, x);
   y = _mm_add_ps(y, one);

   /* build 2^n: add the bias 0x7f and shift into the float exponent field */
#ifndef USE_SSE2
   z = _mm_movehl_ps(z, fx);
   mm0 = _mm_cvttps_pi32(fx);
   mm1 = _mm_cvttps_pi32(z);
   mm0 = _mm_add_pi32(mm0, *(__m64*)_pi32_0x7f);
   mm1 = _mm_add_pi32(mm1, *(__m64*)_pi32_0x7f);
   mm0 = _mm_slli_pi32(mm0, 23); 
   mm1 = _mm_slli_pi32(mm1, 23);

   __m128 pow2n; 
   COPY_MM_TO_XMM(mm0, mm1, pow2n);
   emm0 = _mm_cvttps_epi32(fx);
   emm0 = _mm_add_epi32(emm0, *(__m128i*)_pi32_0x7f);
   emm0 = _mm_slli_epi32(emm0, 23);
   __m128 pow2n = _mm_castsi128_ps(emm0);
   /* exp(x) = exp(g) * 2^n */
   y = _mm_mul_ps(y, pow2n);
   return y;
Пример #4
// Converts the pixels in ivoid and writes the result to ovoid, row by row.
//
// self     : module instance; not referenced by any line visible here.
// piece    : supplies the module data (piece->data) and the channel count
//            (piece->colors).
// ivoid    : input buffer, accessed as float with ch floats per pixel.
// ovoid    : output buffer, same layout.
// roi_in   : its width is used as the input row length in the matrix path.
// roi_out  : output width and height; drives every loop.
//
// Two code paths are visible:
//   * "cmatrix": dt_Lab_to_XYZ_SSE followed by a 3x3 matrix product, then a
//     per-channel LUT / extrapolation pass ("apply profile");
//   * "xform": cmsDoTransform per row, optionally followed by per-lane gamut
//     marking.
// NOTE(review): the conditions choosing between these paths, the braces and the
// #endif lines are not visible in this excerpt, and gamutcheck is computed but
// not referenced by any visible line.
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->softproof_enabled == DT_SOFTPROOF_GAMUTCHECK);

    //fprintf(stderr,"Using cmatrix codepath\n");
    // convert to rgb using matrix
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
    for(int j=0; j<roi_out->height; j++)

      float *in  = (float*)ivoid + (size_t)ch*roi_in->width *j;
      float *out = (float*)ovoid + (size_t)ch*roi_out->width*j;
      // m0/m1/m2 hold columns 0/1/2 of the row-major 3x3 cmatrix; the top lane is 0.
      const __m128 m0 = _mm_set_ps(0.0f,d->cmatrix[6],d->cmatrix[3],d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f,d->cmatrix[7],d->cmatrix[4],d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f,d->cmatrix[8],d->cmatrix[5],d->cmatrix[2]);

      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
        // Aligned load: each input pixel must sit on a 16-byte boundary.
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        // Matrix-vector product: t = m0*X + m1*Y + m2*Z, with each component broadcast.
        // NOTE(review): no store of t to out is visible in this excerpt.
        const __m128 t = _mm_add_ps(_mm_mul_ps(m0,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(0,0,0,0))),_mm_add_ps(_mm_mul_ps(m1,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(1,1,1,1))),_mm_mul_ps(m2,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(2,2,2,2)))));

    // apply profile
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
    for(int j=0; j<roi_out->height; j++)

      float *in  = (float*)ivoid + (size_t)ch*roi_in->width *j;
      float *out = (float*)ovoid + (size_t)ch*roi_out->width*j;

      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
        // This inner i (channel index 0..2) shadows the pixel index of the enclosing loop.
        for(int i=0; i<3; i++)
          // A LUT whose first entry is negative is skipped (presumably "no curve" - confirm).
          if (d->lut[i][0] >= 0.0f)
            // Values below 1.0 are looked up in the LUT; the rest use the fitted extrapolation.
            out[i] = (out[i] < 1.0f) ? lerp_lut(d->lut[i], out[i]) : dt_iop_eval_exp(d->unbounded_coeffs[i], out[i]);
    //fprintf(stderr,"Using xform codepath\n");
    // Replacement lanes, lowest first: (0, 1, 1, 0).
    const __m128 outofgamutpixel = _mm_set_ps(0.0f, 1.0f, 1.0f, 0.0f);
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(ivoid, ovoid, roi_out)
    for (int k=0; k<roi_out->height; k++)
      const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
      float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

        // Direct transform of one row straight into the output buffer.
        cmsDoTransform(d->xform, in, out, roi_out->width);
      } else {
        // Transform into a scratch row first, then mark negative lanes.
        // NOTE(review): no matching release of rgb is visible in this excerpt.
        void *rgb = dt_alloc_align(16, 4*sizeof(float)*roi_out->width);
        cmsDoTransform(d->xform, in, rgb, roi_out->width);
        float *rgbptr = (float *)rgb;
        for (int j=0; j<roi_out->width; j++,rgbptr+=4,out+=4)
          const __m128 pixel = _mm_load_ps(rgbptr);
          // Per-lane mask: set where the component is >= 0.
          const __m128 ingamut = _mm_cmpge_ps(pixel, _mm_setzero_ps());
          // Lane-wise select: keep non-negative lanes, take outofgamutpixel's lane otherwise.
          const __m128 result = _mm_or_ps(_mm_andnot_ps(ingamut, outofgamutpixel),
                                          _mm_and_ps(ingamut, pixel));
          _mm_stream_ps(out, result);

    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
Пример #5
/* Function:  esl_sse_expf()
 * Synopsis:  <r[z] = exp x[z]>
 * Incept:    SRE, Fri Dec 14 14:46:27 2007 [Janelia]
 * Purpose:   Given a vector <x> containing four floats, returns a
 *            vector <r> in which each element <r[z] = expf(x[z])>.
 *            Valid for all IEEE754 floats $x_z$.
 * Returns:   the result vector <r>; elements with x > maxlogf are set
 *            to +infinity and elements with x <= minlogf are set to 0.
 * Xref:      J2/71
 *            J10/62: bugfix, minlogf/maxlogf range was too wide;
 *                    (k+127) must be >=0 and <=255, so (k+127)<<23
 *                    is a valid IEEE754 float, without touching
 *                    the sign bit. Pommier had this right in the
 *                    first place, and I didn't understand.
 * Note:      Derived from an SSE1 implementation by Julian
 *            Pommier. Converted to SSE2.
 *            Note on maxlogf/minlogf, which are close to but not
 *            exactly 127.5/log2 [J10/63]. We need -127<=k<=128, so
 *            k+127 is 0..255, a valid IEEE754 8-bit exponent
 *            (0..255), so the bit pattern (k+127)<<23 is IEEE754
 *            single-precision for 2^k.  If k=-127, we get IEEE754 0.
 *            If k=128, we get IEEE754 +inf.  If k<-127, k+127 is
 *            negative and we get screwed up.  If k>128, k+127
 *            overflows the 8-bit exponent and sets the sign bit.  So
 *            for x' (base 2) < -127.5 we must definitely return e^x ~
 *            0; for x' < -126.5 we're going to calculate 0 anyway
 *            (because k=floor(-126.5-epsilon+0.5) = -127).  So any
 *            minlogf between -126.5 log2 ... -127.5 log2 will suffice
 *            as the cutoff. Ditto for 126.5 log2 .. 127.5log2.
 *            That's 87.68312 .. 88.3762655.  I think Pommier's
 *            thinking is, you don't want to get too close to the
 *            edges, lest fp roundoff error screw you (he may have
 *            considered 1 ulp carefully, I can't tell), but otherwise
 *            you may as well put your bounds close to the outer edge;
 *            so
 *              maxlogf =  127.5 log(2) - epsilon
 *              minlogf = -127.5 log(2) + epsilon
 *            for an epsilon that happens to be ~ 3e-6.
esl_sse_expf(__m128 x)
    static float cephes_p[6] = { 1.9875691500E-4f, 1.3981999507E-3f, 8.3334519073E-3f,
                                 4.1665795894E-2f, 1.6666665459E-1f, 5.0000001201E-1f
    static float cephes_c[2] = { 0.693359375f,    -2.12194440e-4f };
    static float maxlogf     =  88.3762626647949f;  /* 127.5 log(2) - epsilon. above this, 0.5+x/log2 gives k>128 and breaks 2^k "float" construction, because (k+127)<<23 must be a valid IEEE754 exponent 0..255 */
    static float minlogf     = -88.3762626647949f;  /*-127.5 log(2) + epsilon. below this, 0.5+x/log2 gives k<-127 and breaks 2^k, see above */
    __m128i k;
    __m128  mask, tmp, fx, z, y, minmask, maxmask;

    /* handle out-of-range and special conditions: the masks are computed from the
     * original x now and applied at the very end, after x has been overwritten */
    maxmask = _mm_cmpgt_ps(x, _mm_set1_ps(maxlogf));
    minmask = _mm_cmple_ps(x, _mm_set1_ps(minlogf));

    /* range reduction: exp(x) = 2^k e^f = exp(f + k log 2); k = floorf(0.5 + x / log2): */
    fx = _mm_mul_ps(x,  _mm_set1_ps(eslCONST_LOG2R));
    fx = _mm_add_ps(fx, _mm_set1_ps(0.5f));

    /* floorf() with SSE:  */
    k    = _mm_cvttps_epi32(fx);	              /* cast to int with truncation                  */
    tmp  = _mm_cvtepi32_ps(k);	              /* cast back to float                           */
    mask = _mm_cmpgt_ps(tmp, fx);               /* if it increased (i.e. if it was negative...) */
    mask = _mm_and_ps(mask, _mm_set1_ps(1.0f)); /* ...without a conditional branch...           */
    fx   = _mm_sub_ps(tmp, mask);	              /* then subtract one.                           */
    k    = _mm_cvttps_epi32(fx);	              /* k is now ready for the 2^k part.             */

    /* polynomial approx for e^f for f in range [-0.5, 0.5] */
    /* f = x - k*log(2), with log(2) applied as the two parts cephes_c[0] and cephes_c[1] */
    tmp = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[0]));
    z   = _mm_mul_ps(fx, _mm_set1_ps(cephes_c[1]));
    x   = _mm_sub_ps(x, tmp);
    x   = _mm_sub_ps(x, z);
    z   = _mm_mul_ps(x, x);

    /* Horner evaluation of cephes_p[0..5], then y = poly*f^2 + f + 1 */
    y =               _mm_set1_ps(cephes_p[0]);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4]));
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5]));
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, x);
    y = _mm_add_ps(y, _mm_set1_ps(1.0f));

    /* build 2^k by hand, by creating a IEEE754 float */
    k  = _mm_add_epi32(k, _mm_set1_epi32(127));
    k  = _mm_slli_epi32(k, 23);
    fx = _mm_castsi128_ps(k);

    /* put 2^k e^f together (fx = 2^k,  y = e^f) and we're done */
    y = _mm_mul_ps(y, fx);

    /* special/range cleanup */
    y = esl_sse_select_ps(y, _mm_set1_ps(eslINFINITY), maxmask); /* exp(x) = inf for x > log(2^128)  */
    y = esl_sse_select_ps(y, _mm_set1_ps(0.0f),        minmask); /* exp(x) = 0   for x < log(2^-149) */
    return y;
Пример #6
// logical operators
// Lane-wise bitwise AND of two packed single-precision vectors.
RETf AND( const __m128 x, const __m128 y )
{
  const __m128 conj = _mm_and_ps( x, y );
  return conj;
}
Пример #7
	// Finds the closest intersection between a ray and a set of spheres.
	//
	// ray            : ray to test; its direction is treated as normalized
	//                  (the quadratic's leading coefficient is taken to be 1).
	// spheresIndices : indices into `spheres` selecting the spheres to test.
	// spheres        : sphere data stored as separate coordinate/radius arrays.
	// Returns an IntersectionData whose `intersection` flag tells whether
	// anything was hit and whose `tIntersection` is the smallest non-negative
	// ray parameter found (numeric_limits<float>::max() when nothing was hit).
	//
	// Spheres are processed four at a time with SSE; the remaining
	// size() % 4 spheres go through intersectSingleSphere().
	//
	// Fixes relative to the previous revision:
	//   * the fourth radius lane read index i + 2 a second time instead of i + 3;
	//   * b and c were only initialised once before the batch loop, so every
	//     batch after the first inherited the previous batch's coefficients.
	//
	// NOTE(review): braces and some control lines (e.g. the test on canUseSIMD)
	// are not visible in this excerpt; scoping is read from indentation.
	// NOTE(review): `radiuses` is loaded but no visible line uses it - the
	// constant term c would normally also subtract radius^2; confirm against
	// the complete source.
	IntersectionData intersectRaySpheres(const Ray& ray, const vector<int>& spheresIndices,
			const Spheres& spheres)
		const int maxSpheresToCheck = 4;
		IntersectionData result;
		result.intersection = false;
		result.tIntersection = numeric_limits<float>::max();

		// remainder < size() holds exactly when there are at least four indices.
		int remainder = spheresIndices.size() % maxSpheresToCheck;
		bool canUseSIMD = (remainder < spheresIndices.size());

		int nonSIMDStartPos = 0;

			const int spheresToSIMDCheck = spheresIndices.size() - remainder;
			nonSIMDStartPos = spheresToSIMDCheck;
			//Vec4Float a = _mm_set1_ps(1.f); when rayDir is normalized a is 1
			Vec4Float b = _mm_set1_ps(0.f);
			Vec4Float c = b;
			Vec4Float D = c;

			Vec4Float centerCoords[3], radiuses;

			for(int i = 0; i < spheresToSIMDCheck; i += 4)
				// Start every batch of four spheres from clean coefficients.
				b = _mm_set1_ps(0.f);
				c = b;
				for(int j = 0; j < 3; ++j)
					centerCoords[j] = _mm_set_ps(
							spheres.centerCoords[j][spheresIndices[i]], spheres.centerCoords[j][spheresIndices[i + 1]],
							spheres.centerCoords[j][spheresIndices[i + 2]], spheres.centerCoords[j][spheresIndices[i + 3]]

					radiuses = _mm_set_ps(
							spheres.radiuses[spheresIndices[i]], spheres.radiuses[spheresIndices[i + 1]],
							spheres.radiuses[spheresIndices[i + 2]], spheres.radiuses[spheresIndices[i + 3]]

					// Accumulate the quadratic coefficients one axis at a time.
					b += 2.f * ray.direction.coords[j] * (ray.origin.coords[j] - centerCoords[j]);
					c += (ray.origin.coords[j] - centerCoords[j]) * (ray.origin.coords[j] - centerCoords[j]);
				// Discriminant of t^2 + b*t + c = 0.
				D = b * b - 4.f * c;

				// Lanes with a negative discriminant yield NaN from the square
				// root, so the comparisons below reject them.
				Vec4Float mask = _mm_cmpge_ps(D, _mm_set_ps1(0.f));
				Vec4Float squareRootD = _mm_sqrt_ps(D);
				D = _mm_and_ps(squareRootD, mask);

				Vec4Float t1, t2;
				t1 = _mm_or_ps((-b - squareRootD) * 0.5f, _mm_andnot_ps(mask, D));
				t2 = _mm_or_ps((-b + squareRootD) * 0.5f, _mm_andnot_ps(mask, D));

				// Pick the smallest non-negative root across the four lanes.
				float tRes = result.tIntersection;
				for(int j = 0; j < 4; ++j)
					if(t1[j] >= 0 && t1[j] < tRes)
						tRes = t1[j];
					if(t2[j] >= 0 && t2[j] < tRes)
						tRes = t2[j];

				if(tRes	< result.tIntersection)

					result.intersection = true;
					result.tIntersection = tRes;

			// Scalar path for the spheres left over after the 4-wide batches.
			for(int i = nonSIMDStartPos; i < spheresIndices.size(); ++i)
				IntersectionData data;
				int idx = spheresIndices[i];
				Sphere sphere;
				sphere.center.x = spheres.centerCoords[0][idx];
				sphere.center.y = spheres.centerCoords[1][idx];
				sphere.center.z = spheres.centerCoords[2][idx];
				sphere.radius = spheres.radiuses[idx];
				data = intersectSingleSphere(ray, sphere);

				if(data.intersection && data.tIntersection < result.tIntersection)
					result = data;

			return result;
Пример #8
// SSE2 implementation of the colour conversion from ivoid to ovoid.
//
// self     : module instance, forwarded to process_fastpath_apply_tonecurves().
// piece    : supplies the module data (piece->data), the channel count
//            (piece->colors) and piece->pipe->mask_display.
// ivoid    : input buffer, accessed as float with ch floats per pixel.
// ovoid    : output buffer, same layout.
// roi_in   : its width is used as the input row length in the matrix path.
// roi_out  : output width and height; drives every loop.
//
// Visible paths: a plain copy when d->type is DT_COLORSPACE_LAB; a matrix path
// when d->cmatrix[0] is not NaN (Lab -> XYZ -> 3x3 matrix, then tone curves);
// otherwise a cmsDoTransform path followed by gamut marking.
// NOTE(review): braces, #endif lines and the test on gamutcheck are not
// visible in this excerpt.
void process_sse2(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
                  void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->mode == DT_PROFILE_GAMUTCHECK);

  if(d->type == DT_COLORSPACE_LAB)
    // The copy size assumes 4 floats per pixel rather than using ch.
    memcpy(ovoid, ivoid, sizeof(float)*4*roi_out->width*roi_out->height);
  else if(!isnan(d->cmatrix[0]))
// fprintf(stderr,"Using cmatrix codepath\n");
// convert to rgb using matrix
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
    for(int j = 0; j < roi_out->height; j++)

      float *in = (float *)ivoid + (size_t)ch * roi_in->width * j;
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;
      // m0/m1/m2 hold columns 0/1/2 of the row-major 3x3 cmatrix; the top lane is 0.
      const __m128 m0 = _mm_set_ps(0.0f, d->cmatrix[6], d->cmatrix[3], d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f, d->cmatrix[7], d->cmatrix[4], d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f, d->cmatrix[8], d->cmatrix[5], d->cmatrix[2]);

      for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
        // Aligned load: each input pixel must sit on a 16-byte boundary.
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        // Matrix-vector product: t = m0*X + m1*Y + m2*Z, with each component broadcast.
        const __m128 t
            = _mm_add_ps(_mm_mul_ps(m0, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(0, 0, 0, 0))),
                         _mm_add_ps(_mm_mul_ps(m1, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(1, 1, 1, 1))),
                                    _mm_mul_ps(m2, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(2, 2, 2, 2)))));

        // Non-temporal store of the converted pixel.
        _mm_stream_ps(out, t);

    process_fastpath_apply_tonecurves(self, piece, ivoid, ovoid, roi_in, roi_out);
    // fprintf(stderr,"Using xform codepath\n");
    // Replacement pixel, lowest lane first: (0, 1, 1, 0).
    const __m128 outofgamutpixel = _mm_set_ps(0.0f, 1.0f, 1.0f, 0.0f);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
    for(int k = 0; k < roi_out->height; k++)
      const float *in = ((float *)ivoid) + (size_t)ch * k * roi_out->width;
      float *out = ((float *)ovoid) + (size_t)ch * k * roi_out->width;

      cmsDoTransform(d->xform, in, out, roi_out->width);

        // In-place pass over the transformed row.
        for(int j = 0; j < roi_out->width; j++, out += 4)
          const __m128 pixel = _mm_load_ps(out);
          // Despite its name, this mask is set in lanes that are OUT of range:
          // lanes 0..2 are flagged when negative; lane 3 is compared against
          // -FLT_MAX, so it is effectively never flagged.
          __m128 ingamut = _mm_cmplt_ps(pixel, _mm_set_ps(-FLT_MAX, 0.0f, 0.0f, 0.0f));

          // Two unpack/OR rounds OR all four lanes together and broadcast the
          // result, so one flagged lane marks the whole pixel.
          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));
          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));

          // Whole-pixel select: flagged pixels become outofgamutpixel, others are kept.
          const __m128 result
              = _mm_or_ps(_mm_and_ps(ingamut, outofgamutpixel), _mm_andnot_ps(ingamut, pixel));
          _mm_stream_ps(out, result);

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
Пример #9
/* Natural logarithm computed for 4 floats simultaneously.
   Returns NaN for x <= 0.
   x: four packed single-precision inputs; the result uses the same lane layout.
__m128 log_ps(__m128 x) {
    typedef __m128 v4sf;
    typedef __m128i v4si;

    v4si emm0;
    v4sf one = constants::ps_1.ps;

    // Remember which lanes are <= 0; they are forced to NaN at the very end.
    v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

    x = _mm_max_ps(x, constants::min_norm_pos.ps);  // cut off denormalized stuff

    // Shift the biased exponent field down into the low bits.
    emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
    // keep only the fractional part
    // (presumably inv_mant_mask clears the exponent bits; OR-ing in 0.5 then
    // gives a mantissa value in [0.5, 1) - confirm against the constants)
    x = _mm_and_ps(x, constants::inv_mant_mask.ps);
    x = _mm_or_ps(x,  constants::ps_0p5.ps);

    // Remove the exponent bias and convert to float.
    emm0 = _mm_sub_epi32(emm0, constants::pi32_0x7f.pi);
    v4sf e = _mm_cvtepi32_ps(emm0);

    e = _mm_add_ps(e, one);

    // The statements after the pseudo-code below implement it without
    // branches: mask selects the lanes with x < SQRTHF, tmp holds x in those
    // lanes (0 elsewhere), so x - 1 + tmp equals x + x - 1 or x - 1.
    /* part2:
       if( x < SQRTHF ) {
         e -= 1;
         x = x + x - 1.0;
       } else { x = x - 1.0; }
    v4sf mask = _mm_cmplt_ps(x, constants::cephes_SQRTHF.ps);
    v4sf tmp = _mm_and_ps(x, mask);
    x = _mm_sub_ps(x, one);
    e = _mm_sub_ps(e, _mm_and_ps(one, mask));
    x = _mm_add_ps(x, tmp);

    v4sf z = _mm_mul_ps(x,x);

    // Horner evaluation of the polynomial with coefficients p0..p8.
    v4sf y = constants::cephes_log_p0.ps;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p1.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p2.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p3.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p4.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p5.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p6.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p7.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p8.ps);
    y = _mm_mul_ps(y, x);

    y = _mm_mul_ps(y, z);

    // The exponent contribution is added in two parts (q1, then q2).
    tmp = _mm_mul_ps(e, constants::cephes_log_q1.ps);
    y = _mm_add_ps(y, tmp);

    tmp = _mm_mul_ps(z, constants::ps_0p5.ps);
    y = _mm_sub_ps(y, tmp);

    tmp = _mm_mul_ps(e, constants::cephes_log_q2.ps);
    x = _mm_add_ps(x, y);
    x = _mm_add_ps(x, tmp);
    // OR-ing with an all-ones mask produces a NaN bit pattern in the invalid lanes.
    x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
    return x;
// Transforms the AABB vertices to screen space once every frame
// Also performs a coarse depth pre-test
//
// xformedPos       : output array, receives one projected vertex per box corner
//                    (AABB_VERTICES entries are written).
// cumulativeMatrix : four __m128 values; entries 0..2 are scaled by the x/y/z
//                    of each corner and entry 3 is added as the offset.
// pDepthSummary    : per-block depth summary, indexed as
//                    by * (SCREENW/BLOCK_SIZE) + bx.
// Returns ePT_VISIBLE when any vertex fails the z <= w test, ePT_INVISIBLE when
// the clamped screen bounds are empty or the box is behind every covered
// block, and ePT_UNSURE otherwise.
//
// The code multiplies and adds __m128 values with plain operators, which
// relies on compiler vector extensions or overloads, and it reads
// __m128i::m128i_i32, an MSVC-specific member.
// NOTE(review): braces and the condition guarding `return ePT_UNSURE` are not
// visible in this excerpt.
PreTestResult TransformedAABBoxAVX::TransformAndPreTestAABBox(__m128 xformedPos[], const __m128 cumulativeMatrix[4], const float *pDepthSummary)
	// w ends up being garbage, but it doesn't matter - we ignore it anyway.
	__m128 vCenter = _mm_loadu_ps(&mBBCenter.x);
	__m128 vHalf   = _mm_loadu_ps(&mBBHalf.x);

	__m128 vMin    = _mm_sub_ps(vCenter, vHalf);
	__m128 vMax    = _mm_add_ps(vCenter, vHalf);

	// transforms
	// Shuffle masks 0x00/0x55/0xaa broadcast the x/y/z component; the products
	// for the min and max corner are precomputed so each vertex is three adds.
	__m128 xRow[2], yRow[2], zRow[2];
	xRow[0] = _mm_shuffle_ps(vMin, vMin, 0x00) * cumulativeMatrix[0];
	xRow[1] = _mm_shuffle_ps(vMax, vMax, 0x00) * cumulativeMatrix[0];
	yRow[0] = _mm_shuffle_ps(vMin, vMin, 0x55) * cumulativeMatrix[1];
	yRow[1] = _mm_shuffle_ps(vMax, vMax, 0x55) * cumulativeMatrix[1];
	zRow[0] = _mm_shuffle_ps(vMin, vMin, 0xaa) * cumulativeMatrix[2];
	zRow[1] = _mm_shuffle_ps(vMax, vMax, 0xaa) * cumulativeMatrix[2];

	// Starts as all-ones and is ANDed with every vertex's z test.
	__m128 zAllIn = _mm_castsi128_ps(_mm_set1_epi32(~0));
	__m128 screenMin = _mm_set1_ps(FLT_MAX);
	__m128 screenMax = _mm_set1_ps(-FLT_MAX);

	for(UINT i = 0; i < AABB_VERTICES; i++)
		// Transform the vertex
		__m128 vert = cumulativeMatrix[3];
		vert += xRow[sBBxInd[i]];
		vert += yRow[sBByInd[i]];
		vert += zRow[sBBzInd[i]];

		// We have inverted z; z is in front of near plane iff z <= w.
		__m128 vertZ = _mm_shuffle_ps(vert, vert, 0xaa); // vert.zzzz
		__m128 vertW = _mm_shuffle_ps(vert, vert, 0xff); // vert.wwww
		__m128 zIn = _mm_cmple_ps(vertZ, vertW);
		zAllIn = _mm_and_ps(zAllIn, zIn);

		// project
		xformedPos[i] = _mm_div_ps(vert, vertW);
	    // update bounds
	    screenMin = _mm_min_ps(screenMin, xformedPos[i]);
	    screenMax = _mm_max_ps(screenMax, xformedPos[i]);

	// if any of the verts are z-clipped, we (conservatively) say the box is in
	if(_mm_movemask_ps(zAllIn) != 0xf)
		return ePT_VISIBLE;

	// Clip against screen bounds
	screenMin = _mm_max_ps(screenMin, _mm_setr_ps(0.0f, 0.0f, 0.0f, -FLT_MAX));
	screenMax = _mm_min_ps(screenMax, _mm_setr_ps((float) (SCREENW - 1), (float) (SCREENH - 1), 1.0f, FLT_MAX));

	// Quick rejection test
	// Any lane with max < min means the clamped bounds are empty.
	if(_mm_movemask_ps(_mm_cmplt_ps(screenMax, screenMin)))
		return ePT_INVISIBLE;

	// Prepare integer bounds
	__m128 minMaxXY = _mm_shuffle_ps(screenMin, screenMax, 0x44); // minX,minY,maxX,maxY
	__m128i minMaxXYi = _mm_cvtps_epi32(minMaxXY);
	// Arithmetic shift by 3 divides by 8, turning pixel coordinates into block coordinates.
	__m128i minMaxXYis = _mm_srai_epi32(minMaxXYi, 3);

	__m128 maxZ = _mm_shuffle_ps(screenMax, screenMax, 0xaa);

	// Traverse all 8x8 blocks covered by 2d screen-space BBox;
	// if we know for sure that this box is behind the geometry we know is there,
	// we can stop.
	int rX0 = minMaxXYis.m128i_i32[0];
	int rY0 = minMaxXYis.m128i_i32[1];
	int rX1 = minMaxXYis.m128i_i32[2];
	int rY1 = minMaxXYis.m128i_i32[3];

	__m128 anyCloser = _mm_setzero_ps();
	for(int by = rY0; by <= rY1; by++)
		const float *srcRow = pDepthSummary + by * (SCREENW/BLOCK_SIZE);

		// If for any 8x8 block, maxZ is not less than (=behind) summarized
		// min Z, box might be visible.
		for(int bx = rX0; bx <= rX1; bx++)
			// Scalar compare: only the lowest lane of the result is meaningful.
			anyCloser = _mm_or_ps(anyCloser, _mm_cmpnlt_ss(maxZ, _mm_load_ss(&srcRow[bx])));

			// NOTE(review): presumably guarded by a test on anyCloser that is
			// not visible here - confirm against the complete source.
			return ePT_UNSURE; // okay, box might be in

	// If we get here, we know for sure that the box is fully behind the stuff in the
	// depth buffer.
	return ePT_INVISIBLE;
Пример #11
 */static inline __m128
_mm_abs_ps(__m128 t)
    // Returns the lane-wise absolute value of t by clearing each float's sign bit.
    // Despite its name, signmask holds 0x7fffffff, i.e. every bit EXCEPT the
    // sign bit. It is aligned to SSE_ALIGNMENT so it can be read back through an
    // __m128 pointer.
    static const uint32_t signmask[4] __attribute__((aligned(SSE_ALIGNMENT))) = { 0x7fffffff, 0x7fffffff, 0x7fffffff, 0x7fffffff};
    return _mm_and_ps(*(__m128*)signmask, t);
Пример #12
bool AABB::IntersectLineAABB_SSE(const float4 &rayPos, const float4 &rayDir, float tNear, float tFar) const
	assume(tNear <= tFar && "AABB::IntersectLineAABB: User gave a degenerate line as input for the intersection test!");
	/* For reference, this is the C++ form of the vectorized SSE code below.

	float4 recipDir = rayDir.RecipFast4();
	float4 t1 = (aabbMinPoint - rayPos).Mul(recipDir);
	float4 t2 = (aabbMaxPoint - rayPos).Mul(recipDir);
	float4 near = t1.Min(t2);
	float4 far = t1.Max(t2);
	float4 rayDirAbs = rayDir.Abs();

	if (rayDirAbs.x > 1e-4f) // ray is parallel to plane in question
		tNear = Max(near.x, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.x, tFar); // tFar tracks the distance to exit the AABB.
	else if (rayPos.x < aabbMinPoint.x || rayPos.x > aabbMaxPoint.x) // early-out if the ray can't possibly enter the box.
		return false;

	if (rayDirAbs.y > 1e-4f) // ray is parallel to plane in question
		tNear = Max(near.y, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.y, tFar); // tFar tracks the distance to exit the AABB.
	else if (rayPos.y < aabbMinPoint.y || rayPos.y > aabbMaxPoint.y) // early-out if the ray can't possibly enter the box.
		return false;

	if (rayDirAbs.z > 1e-4f) // ray is parallel to plane in question
		tNear = Max(near.z, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.z, tFar); // tFar tracks the distance to exit the AABB.
	else if (rayPos.z < aabbMinPoint.z || rayPos.z > aabbMaxPoint.z) // early-out if the ray can't possibly enter the box.
		return false;

	return tNear < tFar;

	__m128 recipDir = _mm_rcp_ps(rayDir.v);
	// Note: The above performs an approximate reciprocal (11 bits of precision).
	// For a full precision reciprocal, perform a div:
//	__m128 recipDir = _mm_div_ps(_mm_set1_ps(1.f), rayDir.v);

	__m128 t1 = _mm_mul_ps(_mm_sub_ps(MinPoint_SSE(), rayPos.v), recipDir);
	__m128 t2 = _mm_mul_ps(_mm_sub_ps(MaxPoint_SSE(), rayPos.v), recipDir);

	__m128 nearD = _mm_min_ps(t1, t2); // [0 n3 n2 n1]
	__m128 farD = _mm_max_ps(t1, t2);  // [0 f3 f2 f1]

	// Check if the ray direction is parallel to any of the cardinal axes, and if so,
	// mask those [near, far] ranges away from the hit test computations.
	__m128 rayDirAbs = abs_ps(rayDir.v);

	const __m128 epsilon = _mm_set1_ps(1e-4f);
	// zeroDirections[i] will be nonzero for each axis i the ray is parallel to.
	__m128 zeroDirections = _mm_cmple_ps(rayDirAbs, epsilon);

	const __m128 floatInf = _mm_set1_ps(FLOAT_INF);
	const __m128 floatNegInf = _mm_set1_ps(-FLOAT_INF);

	// If the ray is parallel to one of the axes, replace the slab range for that axis
	// with [-inf, inf] range instead. (which is a no-op in the comparisons below)
	nearD = cmov_ps(nearD, floatNegInf, zeroDirections);
	farD = cmov_ps(farD , floatInf, zeroDirections);

	// Next, we need to compute horizontally max(nearD[0], nearD[1], nearD[2]) and min(farD[0], farD[1], farD[2])
	// to see if there is an overlap in the hit ranges.
	__m128 v1 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(0, 0, 0, 0)); // [f1 f1 n1 n1]
	__m128 v2 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(1, 1, 1, 1)); // [f2 f2 n2 n2]
	__m128 v3 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(2, 2, 2, 2)); // [f3 f3 n3 n3]
	nearD = _mm_max_ps(v1, _mm_max_ps(v2, v3));
	farD = _mm_min_ps(v1, _mm_min_ps(v2, v3));
	farD = _mm_shuffle_ps(farD, farD, _MM_SHUFFLE(3, 3, 3, 3)); // Unpack the result from high offset in the register.
	nearD = _mm_max_ps(nearD, _mm_set_ss(tNear));
	farD = _mm_min_ps(farD, _mm_set_ss(tFar));

	// Finally, test if the ranges overlap.
	__m128 rangeIntersects = _mm_cmple_ss(nearD, farD);

	// To store out out the interval of intersection, uncomment the following:
	// These are disabled, since without these, the whole function runs without a single memory store,
	// which has been profiled to be very fast! Uncommenting these causes an order-of-magnitude slowdown.
	// For now, using the SSE version only where the tNear and tFar ranges are not interesting.
//	_mm_store_ss(&tNear, nearD);
//	_mm_store_ss(&tFar, farD);

	// To avoid false positives, need to have an additional rejection test for each cardinal axis the ray direction
	// is parallel to.
	__m128 out2 = _mm_cmplt_ps(rayPos.v, MinPoint_SSE());
	__m128 out3 = _mm_cmpgt_ps(rayPos.v, MaxPoint_SSE());
	out2 = _mm_or_ps(out2, out3);
	zeroDirections = _mm_and_ps(zeroDirections, out2);

	__m128 yOut = _mm_shuffle_ps(zeroDirections, zeroDirections, _MM_SHUFFLE(1,1,1,1));
	__m128 zOut = _mm_shuffle_ps(zeroDirections, zeroDirections, _MM_SHUFFLE(2,2,2,2));

	zeroDirections = _mm_or_ps(_mm_or_ps(zeroDirections, yOut), zOut);
	// Intersection occurs if the slab ranges had positive overlap and if the test was not rejected by the ray being
	// parallel to some cardinal axis.
	__m128 intersects = _mm_andnot_ps(zeroDirections, rangeIntersects);
	__m128 epsilonMasked = _mm_and_ps(epsilon, intersects);
	return _mm_comieq_ss(epsilon, epsilonMasked) != 0;
Пример #13
 // Lane-wise square root that never sees a non-positive input: every lane of 'value' that is not
 // strictly greater than zero (including NaN lanes, for which the comparison is false) is replaced
 // by 1.0f before the root is taken, so such lanes yield 1.0f.
 SIMD_INLINE __m128 ValidSqrt(__m128 value)
 {
     const __m128 positive = _mm_cmpgt_ps(value, _mm_set1_ps(0.0f)); // all-ones in lanes where value > 0
     const __m128 kept = _mm_and_ps(positive, value);                // original value in positive lanes, 0 elsewhere
     const __m128 substituted = _mm_andnot_ps(positive, _mm_set1_ps(1.0f)); // 1.0f in the remaining lanes
     return _mm_sqrt_ps(_mm_or_ps(kept, substituted));
 }
Пример #14
// Generates rings of vertices along a cubic Bezier curve running from turtle0 to turtle1 and
// appends them to destMesh.
// Returns false (nothing drawn) when forceDraw is not set and the shape is no longer than one
// part length; returns true otherwise.
// NOTE(review): this excerpt is missing closing braces and several statements (for example the
// end of the mside expression and the calls that emit the triangles), so it is documented as-is.
eBool eLSystem::drawShapes(eMesh& destMesh, tDrawState& state, const tTurtleState& turtle0, const tTurtleState& turtle1, eF32 shapeLen, eF32 stexY0, eF32 stexY1, eBool forceDraw, eU32 numParts) {
    // Length of one drawn part, interpolated by 'detail' ('detail' is not declared in this function).
    eF32 partLen = eLerp(this->m_sizePar * (eF32)numParts, 0.0001f, detail);
//	eF32 partLen = eLerp(eF32_MAX, 0.0001f, detail);
	// Guard the division below against a zero or negative part length.
	if(partLen <= 0.0f)
		partLen = eALMOST_ZERO;
	eF32 numToDrawF = (eF32)shapeLen / partLen;
	if(!forceDraw) {
		if(numToDrawF <= 1.0f)
			return false;
	// Number of ring segments; presumably clamped to [1, m_gen_rings] - confirm eClamp's argument order.
	eU32 numDraw = eCeil(eClamp(1.0f, numToDrawF, (eF32)m_gen_rings)); 

	eU32		numFaces = numDraw * m_gen_edges * 2;
	eU32		faceNr = 0;

    ePROFILER_ZONE("L-System - Draw Shapes");

    // Bezier control points: the end points are the turtle positions, the inner points are offset
    // by a third of shapeLen along vector 2 of each turtle's rotation.
    __declspec(align(16)) const eVector3  control0 = turtle0.position;
	__declspec(align(16)) const eVector3  control1 = control0   + turtle0.rotation.getVector(2) * 0.333333f * shapeLen;
    __declspec(align(16)) const eVector3  control3 = turtle1.position;
	__declspec(align(16)) const eVector3  control2 = control3   - turtle1.rotation.getVector(2) * 0.333333f * shapeLen;

    // Ring radius scale at the start and end of the shape.
    eF32 rscale0 = turtle0.size * turtle0.width;
	eF32 rscale1 = turtle1.size * turtle1.width;
	for(eU32 d = 0; d < numDraw; d++) {
		// Curve parameters at the start and end of segment d.
		eF32 t0 = ((eF32)d / (eF32)numDraw);
		eF32 t1 = ((eF32)(d + 1) / (eF32)numDraw);
		// r == 0 handles the ring at t0, r == 1 the ring at t1.
		for(eU32 r = 0; r <= 1; r++) {
			eF32 tt = (r == 0) ? t0 : t1;
			eF32 rscale = eLerp(rscale0, rscale1, tt);
			// The start ring is only generated when no previous ring exists that could be reused.
			if((r != 0) || (state.lastVertices->size() == 0)) {
                // create ring vertices
                __declspec(align(16)) eVector3 position;
                __declspec(align(16)) eVector3 normal;

                // calculate bezier curve position
                // (De Casteljau evaluation: three lerps, then two, then one.)
                // The aligned loads read four floats starting at .x - assumes the eVector3 storage
                // allows a 16-byte read; TODO confirm.
                __m128 mt = _mm_set1_ps(tt);
                __m128 mtinv = _mm_set1_ps(1.0f - tt);
                __m128 mcp0 = _mm_load_ps(&control0.x);
                __m128 mcp1 = _mm_load_ps(&control1.x);
                __m128 m0 = _mm_add_ps(_mm_mul_ps(mcp0, mtinv), _mm_mul_ps(mcp1, mt));
                __m128 mcp2 = _mm_load_ps(&control2.x);
                __m128 m1 = _mm_add_ps(_mm_mul_ps(mcp1, mtinv), _mm_mul_ps(mcp2, mt));
                __m128 mm0 = _mm_add_ps(_mm_mul_ps(m0, mtinv), _mm_mul_ps(m1, mt));
                __m128 mcp3 = _mm_load_ps(&control3.x);
                __m128 m2 = _mm_add_ps(_mm_mul_ps(mcp2, mtinv), _mm_mul_ps(mcp3, mt));
                __m128 mm1 = _mm_add_ps(_mm_mul_ps(m1, mtinv), _mm_mul_ps(m2, mt));
                __m128 bezCurvePosition = _mm_add_ps(_mm_mul_ps(mm0, mtinv), _mm_mul_ps(mm1, mt));

                // calculate bezier tangent
                // NOTE(review): _mm_set_ps takes float arguments, so these integer literals are
                // converted to float values rather than used as bit patterns; confirm that the
                // resulting mask really clears only the w lane as the name suggests.
                __m128 vec3mask = _mm_set_ps(0x0,0xFFFFFFFF,0xFFFFFFFF, 0xFFFFFFFF);
                __m128 mrestangent = _mm_and_ps(_mm_sub_ps(mm1, mm0), vec3mask);

                // Normalize the tangent: the two horizontal adds (SSE3) sum the squared lanes, and
                // the approximate reciprocal square root of that sum is broadcast and multiplied in.
                __m128 mdot = _mm_mul_ps(mrestangent, mrestangent);
                __m128 mdotagg = _mm_hadd_ps(mdot, mdot);
                __m128 recipsqrt = _mm_rsqrt_ss( _mm_hadd_ps(mdotagg, mdotagg) );
                __m128 tangentnorm = _mm_mul_ps(mrestangent, _mm_shuffle_ps(recipsqrt, recipsqrt, _MM_SHUFFLE(0,0,0,0)));

                // get look vector on axis 2 (ringRot.getVector(2))
                // The quaternion is loaded unaligned as four floats; the lane shuffling below depends
                // on eQuat's member order, which is not visible here.
                eQuat       ringRot = turtle0.rotation.slerp(tt, turtle1.rotation);
                __m128 mRingRot = _mm_loadu_ps((eF32*)&ringRot);
                __m128 rrmulparts = _mm_mul_ps(mRingRot, _mm_shuffle_ps(mRingRot, mRingRot, _MM_SHUFFLE(0,1,3,2)));

                __m128 ringRotSqr = _mm_mul_ps(mRingRot, mRingRot);
                __m128 mrdotagg = _mm_hadd_ps(rrmulparts, ringRotSqr);
                __m128 mrdotaggshuf = _mm_shuffle_ps(mrdotagg, mrdotagg, _MM_SHUFFLE(0,2,0,2));
                __m128 mrrotz = _mm_hsub_ps(rrmulparts, mrdotaggshuf);
                __m128 rrecipsqrt = _mm_rsqrt_ss( _mm_hadd_ps(mrdotagg, mrdotagg) );

                __m128 maxisparts = _mm_shuffle_ps(mrrotz,mrdotagg, _MM_SHUFFLE(0,2,0,2)); // -Y-X
                __m128 maxisparts2 = _mm_add_ps(maxisparts, maxisparts); // -Y*2-X*2
                __m128 maxispartsfinal = _mm_shuffle_ps(mrrotz,maxisparts2,_MM_SHUFFLE(0,0,2,0)); //ZZY*2X*2
                __m128 mlook = _mm_and_ps(_mm_mul_ps(maxispartsfinal, _mm_shuffle_ps(rrecipsqrt, rrecipsqrt, _MM_SHUFFLE(0,0,0,0))), vec3mask);

                // calculate side vector (look ^ tangent)
                // Cross product written as two shuffled multiplies and a subtract.
                // NOTE(review): the closing ");" of this expression is missing from the excerpt.
                __m128 mside = _mm_sub_ps(
                    _mm_mul_ps(_mm_shuffle_ps(mlook, mlook, _MM_SHUFFLE(3, 0, 2, 1)), _mm_shuffle_ps(tangentnorm, tangentnorm, _MM_SHUFFLE(3, 1, 0, 2))),
                    _mm_mul_ps(_mm_shuffle_ps(mlook, mlook, _MM_SHUFFLE(3, 1, 0, 2)), _mm_shuffle_ps(tangentnorm, tangentnorm, _MM_SHUFFLE(3, 0, 2, 1)))

                // normalize side vector
                mdot = _mm_mul_ps(mside, mside);
                mdotagg = _mm_hadd_ps(mdot, mdot);
                __m128 dotsum = _mm_hadd_ps(mdotagg, mdotagg);
                // m128_f32 is the MSVC-specific member for reading a lane of an __m128.
                const eF32 sideLenSqr = dotsum.m128_f32[0];
                // Skip the correction when look and tangent are (nearly) parallel and the side vector degenerates.
                if(sideLenSqr > eALMOST_ZERO) {
                    recipsqrt = _mm_rsqrt_ss( dotsum );
                    __m128 sidenorm = _mm_mul_ps(mside, _mm_shuffle_ps(recipsqrt, recipsqrt, _MM_SHUFFLE(0,0,0,0)));

                    // calc dot product (look * tangent)
                    // NOTE(review): the code multiplies mlook by sidenorm, not by tangentnorm as the
                    // comment above says; confirm which operand is intended.
                    __m128 dotprod = _mm_mul_ps(mlook, sidenorm);
                    __m128 dph0 = _mm_hadd_ps(dotprod, dotprod);
                    __m128 dph1 = _mm_hadd_ps(dph0, dph0);
                    // Clamp before eACos so rounding error cannot push the argument outside [-1, 1].
                    const eF32 dot = eClamp(-1.0f, dph1.m128_f32[0], 1.0f);
		            eF32 alpha = eACos(dot) * (1.0f / (2.0f * ePI));

                    // Rotate the ring orientation about the side axis by alpha.
                    eQuat rotation(sidenorm, alpha);
		            ringRot = rotation * ringRot;

				// Vectors 0 and 1 of the ring orientation matrix span the plane of the ring.
				eMatrix4x4 curveMat(ringRot);
                __declspec(align(16)) eVector3 ringX = curveMat.getVector(0);
                __declspec(align(16)) eVector3 ringY = curveMat.getVector(1);

				eF32 texY = eLerp(stexY0, stexY1, tt);
                const eF32 texXStep = 1.0f / m_gen_edges;
                eVector2 texPos(0, texY);
                __m128 mRingX = _mm_load_ps(&ringX.x);
                __m128 mRingY = _mm_load_ps(&ringY.x);
                __m128 mScale = _mm_set1_ps(rscale);

				// The table is read as consecutive (sin, cos) pairs; the inclusive bound yields
				// m_gen_edges + 1 vertices per ring.
				for(eU32 e = 0; e <= m_gen_edges * 2; e += 2) {
                    __m128 msin = _mm_set1_ps(m_gen_edge_sinCosTable[e]);
                    __m128 mcos = _mm_set1_ps(m_gen_edge_sinCosTable[e+1]);
                    // Direction from the curve to the vertex within the ring plane.
                    __m128 mnormal = _mm_add_ps(_mm_mul_ps(mRingX, msin), _mm_mul_ps(mRingY, mcos));
                    _mm_store_ps(&normal.x, mnormal);
                    // Vertex position: curve point plus that direction scaled by the ring radius.
                    __m128 mposition = _mm_add_ps(bezCurvePosition, _mm_mul_ps(mnormal, mScale));
                    _mm_store_ps(&position.x, mposition);

                    state.curVertices->append(destMesh.addVertex(position, normal, texPos));
                    texPos.x += texXStep;

				// connect triangles
				if(r != 0) {
					eF32 texY0 = eLerp(stexY0, stexY1, t0);
					eF32 texY1 = eLerp(stexY0, stexY1, t1);
					// NOTE(review): only fragments of the triangle-emitting call(s) survive below.
					for(eU32 e = 0; e < m_gen_edges; e++) {
							                 (*state.curVertices)[e + 1], 
											 (*state.lastVertices)[e + 1],
							                 (*state.lastVertices)[e + 1], 

				// The ring just built becomes the previous ring; swap in the spare buffer for the next one.
				state.lastVertices = state.curVertices;
				eSwap(state.curVertices, state.curTempVertices);
	return true;
Пример #15
// Writes the input image to the output while replacing pixels that are over- or under-exposed
// with the marker colors of the selected color scheme.
//   ivoid / ovoid - input and output float buffers with piece->colors floats per pixel
//   roi_out       - provides the width and height that are processed
// NOTE(review): the return type, the braces and the "#endif" of the OpenMP guard are missing from
// this excerpt, so the control flow below is documented as far as it can be read.
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out)
  dt_develop_t *dev = self->dev;

  const int ch = piece->colors;

  // FIXME: turn off the module instead?
  // Pass the image through unchanged when the indicator is disabled or no GUI is attached.
  // NOTE(review): presumably followed by an early return that is not visible here - confirm.
  if(!dev->overexposed.enabled || !dev->gui_attached)
    memcpy(ovoid, ivoid, (size_t)roi_out->width*roi_out->height*sizeof(float)*ch);

  // _mm_set_ps lists the highest lane first, so lane 3 holds FLT_MAX and lanes 0..2 hold the
  // threshold divided by 100.
  // FLT_MAX in lane 3 means the ">= upper" test below cannot succeed for that lane (short of the
  // value being FLT_MAX or larger).
  const __m128 upper = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f);
  // Here FLT_MAX in lane 3 makes the "<= lower" test succeed for that lane for any finite value,
  // so it does not veto the all-lanes AND reduction further down.
  const __m128 lower = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f);

  // Replacement colors: entry [0] marks overexposure, entry [1] underexposure.
  const int colorscheme = dev->overexposed.colorscheme;
  const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]);
  const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]);

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(ovoid) schedule(static)
  // One image row per iteration.
  for(int k=0; k<roi_out->height; k++)
    const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
    float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

    // The pointers advance by 4 floats per pixel regardless of ch - assumes ch == 4; TODO confirm.
    for (int j=0; j<roi_out->width; j++,in+=4,out+=4)
      // Aligned load: 'in' must be 16-byte aligned.
      const __m128 pixel = _mm_load_ps(in);

      // Overexposed if ANY lane is >= upper: the two unpack/OR rounds OR all four lane masks
      // together and broadcast the result to every lane.
      __m128 isoe = _mm_cmpge_ps(pixel, upper);
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));

      // Underexposed only if ALL lanes are <= lower: same reduction, using AND.
      __m128 isue = _mm_cmple_ps(pixel, lower);
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));

      // Branch-free select: upper_color where overexposed, the original pixel elsewhere.
      __m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel),
                                _mm_and_ps(isoe, upper_color));

      // Applied second, so the underexposure color takes precedence if both masks are set.
      result = _mm_or_ps(_mm_andnot_ps(isue, result),
                         _mm_and_ps(isue, lower_color));

      // Non-temporal (cache-bypassing) store; 'out' must be 16-byte aligned.
      _mm_stream_ps(out, result);

    // Presumably restores the input's alpha channel in the output, judging by the helper's name - confirm.
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
Пример #16
// Test driver for the SSE1 intrinsics: each intrinsic is called on fixed inputs and its result is
// compared against hard-coded expected values through the aeq / aeqi / aeq64 / Assert helpers,
// which are defined outside this excerpt, as are the get_*() functions that supply the test data.
// Vector contents in the comments and in the expected-value lists are written highest lane first
// (the same order as the arguments of _mm_set_ps), so the last value is lane 0.
// NOTE(review): none of the #ifdef / #ifndef directives below has a matching #endif in this
// excerpt, and the function's braces are missing; the intended extent of each conditional section
// has to be confirmed against the original file.
int main()
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr2 must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr2 must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
	// SSE1 Load instructions:
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
	// (_mm_rcp_* and _mm_rsqrt_* are approximations, hence the inexact expected values.)
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	// Packed compares yield an all-ones (0xFFFFFFFF) or all-zeros mask per lane; the scalar (_ss)
	// forms do so for lane 0 only and pass the upper three lanes of a through.
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.

	// The comi tests compare lane 0 only: a's lowest element is 2, b's is 4.
	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP
	// exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	// Lane 0 of c is 4.5 and of e is 3.5; both are expected to round to 4 (round-half-to-even).
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	// These are only called; none of the returned values is checked in the visible code.
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	unsigned int csr = _mm_getcsr();
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of signed int16s.
	/*M64*/aeq64(   _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of signed int16s.
	/*M64*/aeq64(   _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s.
	/*M64*/aeq64(  _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(       _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f); // Two low result lanes picked from a, two high lanes from b.
	aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f); // Interleave the two highest elements of a and b.
	aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f); // Interleave the two lowest elements of a and b.

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	// NOTE(review): as written, the "tests failed" message is printed only when numFailures is 0
	// and nothing is printed when there are failures; a success message and an "else" appear to be
	// missing between these two lines - confirm against the original file.
	if (numFailures == 0)
		printf("%d tests failed!\n", numFailures);
Пример #17
// Bitwise AND of two float4 values: hands both operands' `data` members to
// _mm_and_ps and wraps the 128-bit result in a new float4.
// NOTE(review): an overloaded operator&& always evaluates both operands (no
// short-circuit). Presumably this is meant for combining comparison masks --
// confirm against the callers.
inline float4 operator&&(const float4& a, const float4& b)
	return float4(_mm_and_ps(a.data, b.data));
Пример #18
/* evaluation of 4 sines at onces, using only SSE2.

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   Performance is also surprisingly good, 1.33 times faster than the
   macos vsinf SSE2 function, and 1.5 times faster than the
   __vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
   too bad for an SSE1 function (with no special tuning) !
   However the latter libraries probably have a much better handling of NaN,
   Inf, denormalized and other special arguments..

   On my core 1 duo, the execution of this function takes approximately 95 cycles.

   From what I have observed on the experiments with Intel AMath lib, switching to an
   SSE2 version would improve the perf by only 10%.

   Since it is based on SSE intrinsics, it has to be compiled at -O2 to
   deliver full speed.
__m128 sin_ps(__m128 x) { // any x
    typedef __m128 v4sf;
    typedef __m128i v4si;

    v4sf xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

    v4si emm0, emm2;
    sign_bit = x;
    /* take the absolute value */
    x = _mm_and_ps(x, constants::inv_mant_mask.ps);
    /* extract the sign bit (upper one) */
    sign_bit = _mm_and_ps(sign_bit, constants::sign_mask.ps);

    /* scale by 4/Pi */
    y = _mm_mul_ps(x, constants::cephes_FOPI.ps);

    /* store the integer part of y in mm0 */
    emm2 = _mm_cvttps_epi32(y);
    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, constants::pi32_1.pi);
    emm2 = _mm_and_si128(emm2, constants::pi32_inv1.pi);
    y = _mm_cvtepi32_ps(emm2);
    /* get the swap sign flag */
    emm0 = _mm_and_si128(emm2, constants::pi32_4.pi);
    emm0 = _mm_slli_epi32(emm0, 29);
    /* get the polynom selection mask
       there is one polynom for 0 <= x <= Pi/4
       and another one for Pi/4<x<=Pi/2

       Both branches will be computed.
    emm2 = _mm_and_si128(emm2, constants::pi32_2.pi);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

    v4sf swap_sign_bit = _mm_castsi128_ps(emm0);
    v4sf poly_mask = _mm_castsi128_ps(emm2);
    sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

    /* The magic pass: "******"
    x = ((x - y * DP1) - y * DP2) - y * DP3; */
    xmm1 = constants::minus_cephes_DP1.ps;
    xmm2 = constants::minus_cephes_DP2.ps;
    xmm3 = constants::minus_cephes_DP3.ps;
    xmm1 = _mm_mul_ps(y, xmm1);
    xmm2 = _mm_mul_ps(y, xmm2);
    xmm3 = _mm_mul_ps(y, xmm3);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    /* Evaluate the first polynom  (0 <= x <= Pi/4) */
    y = constants::coscof_p0.ps;
    v4sf z = _mm_mul_ps(x,x);

    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p1.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p2.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    v4sf tmp = _mm_mul_ps(z, constants::ps_0p5.ps);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, constants::ps_1.ps);

    /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

    v4sf y2 = constants::sincof_p0.ps;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p1.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p2.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynoms */
    xmm3 = poly_mask;
    y2 = _mm_and_ps(xmm3, y2); //, xmm3);
    y = _mm_andnot_ps(xmm3, y);
    y = _mm_add_ps(y,y2);
    /* update the sign */
    y = _mm_xor_ps(y, sign_bit);

    return y;
Пример #19
/* A vectorized version of the Voigt function using X86 SSE instructions */
void my_voigt(const float *damping, const float *frequency_offset, float *voigt_value, int N)   
    // coefficients of the rational approximation formula
    // to the complementary error function
    const __m128 A0 = _mm_set1_ps(122.607931777104326f);
    const __m128 A1 = _mm_set1_ps(214.382388694706425f);
    const __m128 A2 = _mm_set1_ps(181.928533092181549f);
    const __m128 A3 = _mm_set1_ps(93.155580458138441f);
    const __m128 A4 = _mm_set1_ps(30.180142196210589f);
    const __m128 A5 = _mm_set1_ps(5.912626209773153f);
    const __m128 A6 = _mm_set1_ps(0.564189583562615f);
    const __m128 B0 = _mm_set1_ps(122.60793177387535f);
    const __m128 B1 = _mm_set1_ps(352.730625110963558f);
    const __m128 B2 = _mm_set1_ps(457.334478783897737f);
    const __m128 B3 = _mm_set1_ps(348.703917719495792f);
    const __m128 B4 = _mm_set1_ps(170.354001821091472f);
    const __m128 B5 = _mm_set1_ps(53.992906912940207f);
    const __m128 B6 = _mm_set1_ps(10.479857114260399f);

    __m128 ivsigno;
    __m128 V;
    __m128 Z1_real;
    __m128 Z1_imag;
    __m128 Z2_real;
    __m128 Z2_imag;
    __m128 Z3_real;
    __m128 Z3_imag;
    __m128 Z4_real;
    __m128 Z4_imag;
    __m128 Z5_real;
    __m128 Z5_imag;
    __m128 Z6_real;
    __m128 Z6_imag;
    __m128 ZZ1_real;
    __m128 ZZ1_imag;
    __m128 ZZ2_real;
    __m128 ZZ2_imag;
    __m128 ZZ3_real;
    __m128 ZZ3_imag;
    __m128 ZZ4_real;
    __m128 ZZ4_imag;
    __m128 ZZ5_real;
    __m128 ZZ5_imag;
    __m128 ZZ6_real;
    __m128 ZZ6_imag;
    __m128 ZZ7_real;
    __m128 ZZ7_imag;
    __m128 division_factor;
    __m128 ZZZ_real;
    __m128 damp;
    __m128 offs;
    __m128 vval;
    __m128 one = _mm_set1_ps(1.0f); 
    __m128 zero = _mm_set1_ps(0.0f);
    __m128 mone = _mm_set1_ps(-1.0f);
    __m128 half = _mm_set1_ps(-0.5f);
    __m128 mask;

    float *stmp = (float *) _mm_malloc(4*sizeof(float), 16);

    int i;
    for(i=0; i<N; i+=VECLEN){
        _mm_prefetch((const char *)&damping[i+64], _MM_HINT_T0);
        _mm_prefetch((const char *)&frequency_offset[i+64], _MM_HINT_T0);
        damp = _mm_load_ps(&damping[i]);
        offs = _mm_load_ps(&frequency_offset[i]);
        mask = _mm_cmplt_ps(offs, zero);
        ivsigno = _mm_add_ps(_mm_and_ps(mask,mone),_mm_andnot_ps(mask,one));
        V = _mm_mul_ps(ivsigno, offs);       

        Z1_real = _mm_add_ps(_mm_mul_ps(A6, damp), A5);
        Z1_imag = _mm_mul_ps(A6, V);
        Z2_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(Z1_real, damp), _mm_mul_ps(Z1_imag, V)), A4);
        Z2_imag = _mm_add_ps(_mm_mul_ps(Z1_real, V), _mm_mul_ps(Z1_imag, damp));
        Z3_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(Z2_real, damp), _mm_mul_ps(Z2_imag, V)), A3);
        Z3_imag = _mm_add_ps(_mm_mul_ps(Z2_real, V), _mm_mul_ps(Z2_imag, damp));
        Z4_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(Z3_real, damp), _mm_mul_ps(Z3_imag, V)), A2);
        Z4_imag = _mm_add_ps(_mm_mul_ps(Z3_real, V), _mm_mul_ps(Z3_imag, damp));
        Z5_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(Z4_real, damp), _mm_mul_ps(Z4_imag, V)), A1);
        Z5_imag = _mm_add_ps(_mm_mul_ps(Z4_real, V), _mm_mul_ps(Z4_imag, damp));
        Z6_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(Z5_real, damp), _mm_mul_ps(Z5_imag, V)), A0);
        Z6_imag = _mm_add_ps(_mm_mul_ps(Z5_real, V), _mm_mul_ps(Z5_imag, damp));
        ZZ1_real = _mm_add_ps(damp, B6);          
        ZZ1_imag = V;                    
        ZZ2_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ1_real, damp), _mm_mul_ps(ZZ1_imag, V)), B5); 
        ZZ2_imag = _mm_add_ps(_mm_mul_ps(ZZ1_real, V), _mm_mul_ps(ZZ1_imag, damp)); 
        ZZ3_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ2_real, damp), _mm_mul_ps(ZZ2_imag, V)), B4); 
        ZZ3_imag = _mm_add_ps(_mm_mul_ps(ZZ2_real, V), _mm_mul_ps(ZZ2_imag, damp)); 
        ZZ4_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ3_real, damp), _mm_mul_ps(ZZ3_imag, V)), B3); 
        ZZ4_imag = _mm_add_ps(_mm_mul_ps(ZZ3_real, V), _mm_mul_ps(ZZ3_imag, damp)); 
        ZZ5_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ4_real, damp), _mm_mul_ps(ZZ4_imag, V)), B2); 
        ZZ5_imag = _mm_add_ps(_mm_mul_ps(ZZ4_real, V), _mm_mul_ps(ZZ4_imag, damp)); 
        ZZ6_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ5_real, damp), _mm_mul_ps(ZZ5_imag, V)), B1); 
        ZZ6_imag = _mm_add_ps(_mm_mul_ps(ZZ5_real, V), _mm_mul_ps(ZZ5_imag, damp)); 
        ZZ7_real = _mm_add_ps(_mm_sub_ps(_mm_mul_ps(ZZ6_real, damp), _mm_mul_ps(ZZ6_imag, V)), B0); 
        ZZ7_imag = _mm_add_ps(_mm_mul_ps(ZZ6_real, V), _mm_mul_ps(ZZ6_imag, damp)); 
        division_factor = _mm_div_ps(one, _mm_add_ps(_mm_mul_ps(ZZ7_real, ZZ7_real), _mm_mul_ps(ZZ7_imag, ZZ7_imag)));
        ZZZ_real = _mm_mul_ps((_mm_add_ps(_mm_mul_ps(Z6_real, ZZ7_real), _mm_mul_ps(Z6_imag, ZZ7_imag))), division_factor); 

        _mm_stream_ps(&voigt_value[i], ZZZ_real);
Пример #20
/* since sin_ps and cos_ps are almost identical, sincos_ps could replace both of them..
   it is almost as fast, and gives you a free cosine with your sine

   x  four packed floats (input angles).
   s  out: receives the four sines.
   c  out: receives the four cosines.
*/
void sincos_ps(__m128 x, __m128* s, __m128* c) {
    typedef __m128 v4sf;
    typedef __m128i v4si;

    /* extract the sign bit (upper one) */
    v4sf sign_bit_sin = _mm_and_ps(x, constants::sign_mask.ps);
    /* take the absolute value */
    x = _mm_and_ps(x, constants::inv_sign_mask.ps);

    /* scale by 4/Pi */
    v4sf y = _mm_mul_ps(x, constants::cephes_FOPI.ps);

    /* store the integer part of y in emm2 */
    v4si emm2 = _mm_cvttps_epi32(y);

    /* j=(j+1) & (~1) (see the cephes sources) */
    emm2 = _mm_add_epi32(emm2, constants::pi32_1.pi);
    emm2 = _mm_and_si128(emm2, constants::pi32_inv1.pi);
    y = _mm_cvtepi32_ps(emm2);

    /* keep an untouched copy of j for the cosine sign computed further down */
    v4si emm4 = emm2;

    /* get the swap sign flag for the sine */
    v4si emm0 = _mm_and_si128(emm2, constants::pi32_4.pi);
    emm0 = _mm_slli_epi32(emm0, 29);
    const v4sf swap_sign_bit_sin = _mm_castsi128_ps(emm0);

    /* get the polynom selection mask for the sine */
    emm2 = _mm_and_si128(emm2, constants::pi32_2.pi);
    emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());
    const v4sf poly_mask = _mm_castsi128_ps(emm2);

    /* The magic pass: extended precision modular arithmetic
       x = ((x - y * DP1) - y * DP2) - y * DP3;
       (the constants are stored already negated, hence the additions) */
    const v4sf xmm1 = _mm_mul_ps(y, constants::minus_cephes_DP1.ps);
    const v4sf xmm2 = _mm_mul_ps(y, constants::minus_cephes_DP2.ps);
    const v4sf xmm3 = _mm_mul_ps(y, constants::minus_cephes_DP3.ps);
    x = _mm_add_ps(x, xmm1);
    x = _mm_add_ps(x, xmm2);
    x = _mm_add_ps(x, xmm3);

    /* sign of the cosine: (~(j - 2)) & 4, moved up to the float sign position */
    emm4 = _mm_sub_epi32(emm4, constants::pi32_2.pi);
    emm4 = _mm_andnot_si128(emm4, constants::pi32_4.pi);
    emm4 = _mm_slli_epi32(emm4, 29);
    const v4sf sign_bit_cos = _mm_castsi128_ps(emm4);

    sign_bit_sin = _mm_xor_ps(sign_bit_sin, swap_sign_bit_sin);

    /* Evaluate the first polynom (cosine coefficients) */
    const v4sf z = _mm_mul_ps(x, x);
    y = constants::coscof_p0.ps;
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p1.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, constants::coscof_p2.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_mul_ps(y, z);
    const v4sf tmp = _mm_mul_ps(z, constants::ps_0p5.ps);
    y = _mm_sub_ps(y, tmp);
    y = _mm_add_ps(y, constants::ps_1.ps);

    /* Evaluate the second polynom (sine coefficients) */
    v4sf y2 = constants::sincof_p0.ps;
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p1.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_add_ps(y2, constants::sincof_p2.ps);
    y2 = _mm_mul_ps(y2, z);
    y2 = _mm_mul_ps(y2, x);
    y2 = _mm_add_ps(y2, x);

    /* select the correct result from the two polynoms: per lane, the sine
       takes the polynom chosen by poly_mask and the cosine gets whatever is
       left once that part has been subtracted out */
    const v4sf ysin2 = _mm_and_ps(poly_mask, y2);
    const v4sf ysin1 = _mm_andnot_ps(poly_mask, y);
    y2 = _mm_sub_ps(y2, ysin2);
    y = _mm_sub_ps(y, ysin1);

    const v4sf sin_abs = _mm_add_ps(ysin1, ysin2);
    const v4sf cos_abs = _mm_add_ps(y, y2);

    /* update the sign */
    *s = _mm_xor_ps(sin_abs, sign_bit_sin);
    *c = _mm_xor_ps(cos_abs, sign_bit_cos);
}
Пример #21
/* motion templates */

/*
 * Updates a motion-history image in place from a silhouette mask.
 *
 * For every pixel: where the silhouette byte is non-zero the history value
 * becomes `timestamp`; otherwise a value below (timestamp - mhi_duration) is
 * reset to 0 and any other value is left unchanged.
 *
 * silhouette    array converted with cvGetMat; must satisfy CV_IS_MASK_ARR.
 * mhimg         array converted with cvGetMat; must be CV_32FC1 and have the
 *               same size as the silhouette. Modified in place.
 * timestamp     value written for pixels covered by the silhouette
 *               (narrowed to float).
 * mhi_duration  values older than timestamp - mhi_duration are cleared.
 *
 * Violated preconditions are reported through CV_Error.
 * NOTE(review): braces and #endif lines are absent from this listing.
 */
CV_IMPL void
cvUpdateMotionHistory( const void* silhouette, void* mhimg,
                       double timestamp, double mhi_duration )
    CvMat  silhstub, *silh = cvGetMat(silhouette, &silhstub);
    CvMat  mhistub, *mhi = cvGetMat(mhimg, &mhistub);

    if( !CV_IS_MASK_ARR( silh ))
        CV_Error( CV_StsBadMask, "" );

    if( CV_MAT_TYPE( mhi->type ) != CV_32FC1 )
        CV_Error( CV_StsUnsupportedFormat, "" );

    if( !CV_ARE_SIZES_EQ( mhi, silh ))
        CV_Error( CV_StsUnmatchedSizes, "" );

    CvSize size = cvGetMatSize( mhi );

    // When both matrices are continuous, treat the image as one long row.
    if( CV_IS_MAT_CONT( mhi->type & silh->type ))
        size.width *= size.height;
        size.height = 1;

    float ts = (float)timestamp;
    // Oldest timestamp that is still kept in the history.
    float delbound = (float)(timestamp - mhi_duration);
    int x, y;
#if CV_SSE2
    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);

    for( y = 0; y < size.height; y++ )
        const uchar* silhData = silh->data.ptr + silh->step*y;
        float* mhiData = (float*)(mhi->data.ptr + mhi->step*y);
        x = 0;

#if CV_SSE2
        if( useSIMD )
            __m128 ts4 = _mm_set1_ps(ts), db4 = _mm_set1_ps(delbound);
            // Eight pixels per iteration.
            for( ; x <= size.width - 8; x += 8 )
                __m128i z = _mm_setzero_si128();
                // Widen 8 silhouette bytes to 16 bits, then to two groups of four floats.
                __m128i s = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(silhData + x)), z);
                __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(s, z)), s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s, z));
                __m128 v0 = _mm_loadu_ps(mhiData + x), v1 = _mm_loadu_ps(mhiData + x + 4);
                __m128 fz = _mm_setzero_ps();

                // Keep history values that are >= delbound, zero the rest.
                v0 = _mm_and_ps(v0, _mm_cmpge_ps(v0, db4));
                v1 = _mm_and_ps(v1, _mm_cmpge_ps(v1, db4));

                // m = (v ^ ts) where the silhouette is non-zero, 0 elsewhere ...
                __m128 m0 = _mm_and_ps(_mm_xor_ps(v0, ts4), _mm_cmpneq_ps(s0, fz));
                __m128 m1 = _mm_and_ps(_mm_xor_ps(v1, ts4), _mm_cmpneq_ps(s1, fz));

                // ... so v ^ m yields ts under the silhouette and v elsewhere.
                v0 = _mm_xor_ps(v0, m0);
                v1 = _mm_xor_ps(v1, m1);

                _mm_storeu_ps(mhiData + x, v0);
                _mm_storeu_ps(mhiData + x + 4, v1);

        // Scalar loop: handles the remaining pixels (or all of them without SIMD).
        for( ; x < size.width; x++ )
            float val = mhiData[x];
            val = silhData[x] ? ts : val < delbound ? 0 : val;
            mhiData[x] = val;
Пример #22
// Returns a support vertex of this shape for direction `localDir`, dispatching
// on m_shapeType and casting `this` to the concrete shape class instead of
// going through a virtual call. The sections below, in order, handle: a shape
// that always yields (0,0,0), box, triangle, cylinder, capsule, convex point
// cloud and convex hull; any other type is forwarded to
// localGetSupportingVertexWithoutMargin.
// NOTE(review): the `case` labels, braces, `break`s and #else/#endif lines of
// this function are not present in this listing; section boundaries are
// inferred from the casts.
btVector3 btConvexShape::localGetSupportVertexWithoutMarginNonVirtual (const btVector3& localDir) const
	switch (m_shapeType)
		return btVector3(0,0,0);
		// Box: per component, +/- halfExtents chosen from the sign of localDir.
		btBoxShape* convexShape = (btBoxShape*)this;
		const btVector3& halfExtents = convexShape->getImplicitShapeDimensions();

#if defined( __APPLE__ ) && (defined( BT_USE_SSE )||defined( BT_USE_NEON ))
    #if defined( BT_USE_SSE )
            // -0.0f has only the sign bit set: isolate localDir's sign bits and xor them onto halfExtents.
            return btVector3( _mm_xor_ps( _mm_and_ps( localDir.mVec128, (__m128){-0.0f, -0.0f, -0.0f, -0.0f }), halfExtents.mVec128 ));
    #elif defined( BT_USE_NEON )
            return btVector3( (float32x4_t) (((uint32x4_t) localDir.mVec128 & (uint32x4_t){ 0x80000000, 0x80000000, 0x80000000, 0x80000000}) ^ (uint32x4_t) halfExtents.mVec128 ));
        #error unknown vector arch
		// Scalar version; presumably btFsels(a, b, c) yields b for a >= 0 and c otherwise -- confirm.
		return btVector3(btFsels(localDir.x(), halfExtents.x(), -halfExtents.x()),
			btFsels(localDir.y(), halfExtents.y(), -halfExtents.y()),
			btFsels(localDir.z(), halfExtents.z(), -halfExtents.z()));
		// Triangle: return the vertex selected by maxAxis() of the three dot products with dir.
		btTriangleShape* triangleShape = (btTriangleShape*)this;
		btVector3 dir(localDir.getX(),localDir.getY(),localDir.getZ());
		btVector3* vertices = &triangleShape->m_vertices1[0];
        btVector3 dots = dir.dot3(vertices[0], vertices[1], vertices[2]);
		btVector3 sup = vertices[dots.maxAxis()];
		return btVector3(sup.getX(),sup.getY(),sup.getZ());
		// Cylinder.
		btCylinderShape* cylShape = (btCylinderShape*)this;
		//mapping of halfextents/dimension onto radius/height depends on how cylinder local orientation is (upAxis)

		btVector3 halfExtents = cylShape->getImplicitShapeDimensions();
		btVector3 v(localDir.getX(),localDir.getY(),localDir.getZ());
		int cylinderUpAxis = cylShape->getUpAxis();
		int XX(1),YY(0),ZZ(2);

		// YY is the up axis; XX and ZZ index the two radial axes.
		switch (cylinderUpAxis)
		case 0:
			XX = 1;
			YY = 0;
			ZZ = 2;
		case 1:
			XX = 0;
			YY = 1;
			ZZ = 2;	
		case 2:
			XX = 0;
			YY = 2;
			ZZ = 1;

		btScalar radius = halfExtents[XX];
		btScalar halfHeight = halfExtents[cylinderUpAxis];

		btVector3 tmp;
		btScalar d ;

		// s is the length of the direction's radial (XX/ZZ) part.
		btScalar s = btSqrt(v[XX] * v[XX] + v[ZZ] * v[ZZ]);
		if (s != btScalar(0.0))
			// Scale the radial part to the cylinder radius; the cap follows the sign of v[YY].
			d = radius / s;  
			tmp[XX] = v[XX] * d;
			tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
			tmp[ZZ] = v[ZZ] * d;
			return btVector3(tmp.getX(),tmp.getY(),tmp.getZ());
		} else {
			// Direction has no radial part: pick the rim point on the XX axis.
			tmp[XX] = radius;
			tmp[YY] = v[YY] < 0.0 ? -halfHeight : halfHeight;
			tmp[ZZ] = btScalar(0.0);
			return btVector3(tmp.getX(),tmp.getY(),tmp.getZ());
		// Capsule.
		btVector3 vec0(localDir.getX(),localDir.getY(),localDir.getZ());

		btCapsuleShape* capsuleShape = (btCapsuleShape*)this;
		btScalar halfHeight = capsuleShape->getHalfHeight();
		int capsuleUpAxis = capsuleShape->getUpAxis();

		btScalar radius = capsuleShape->getRadius();
		btVector3 supVec(0,0,0);

		btScalar maxDot(btScalar(-BT_LARGE_FLOAT));

		btVector3 vec = vec0;
		btScalar lenSqr = vec.length2();
		// NOTE(review): the body of the near-zero-length branch is not present in this listing.
		if (lenSqr < btScalar(0.0001))
		} else
			// Normalise the direction.
			btScalar rlen = btScalar(1.) / btSqrt(lenSqr );
			vec *= rlen;
		btVector3 vtx;
		btScalar newDot;
			// Candidate around the +halfHeight end of the up axis.
			btVector3 pos(0,0,0);
			pos[capsuleUpAxis] = halfHeight;

			//vtx = pos +vec*(radius);
			vtx = pos +vec*(radius) - vec * capsuleShape->getMarginNV();
			newDot = vec.dot(vtx);

			if (newDot > maxDot)
				maxDot = newDot;
				supVec = vtx;
			// Candidate around the -halfHeight end; kept only if its dot product is larger.
			btVector3 pos(0,0,0);
			pos[capsuleUpAxis] = -halfHeight;

			//vtx = pos +vec*(radius);
			vtx = pos +vec*(radius) - vec * capsuleShape->getMarginNV();
			newDot = vec.dot(vtx);
			if (newDot > maxDot)
				maxDot = newDot;
				supVec = vtx;
		return btVector3(supVec.getX(),supVec.getY(),supVec.getZ());	
		// Convex point cloud: delegate to convexHullSupport on the unscaled points.
		btConvexPointCloudShape* convexPointCloudShape = (btConvexPointCloudShape*)this;
		btVector3* points = convexPointCloudShape->getUnscaledPoints ();
		int numPoints = convexPointCloudShape->getNumPoints ();
		return convexHullSupport (localDir, points, numPoints,convexPointCloudShape->getLocalScalingNV());
		// Convex hull: same delegation.
		btConvexHullShape* convexHullShape = (btConvexHullShape*)this;
		btVector3* points = convexHullShape->getUnscaledPoints();
		int numPoints = convexHullShape->getNumPoints ();
		return convexHullSupport (localDir, points, numPoints,convexHullShape->getLocalScalingNV());
#ifndef __SPU__
		// Outside SPU builds, fall back to the regular member function.
		return this->localGetSupportingVertexWithoutMargin (localDir);
		btAssert (0);

	// should never reach here
	btAssert (0);
	return btVector3 (btScalar(0.0f), btScalar(0.0f), btScalar(0.0f));
Пример #23
// Builds the lattice data for a feature matrix.
//
// `feature` holds one feature vector per column: N_ is set from cols() and d_
// from rows(). Features are processed in blocks of `blocksize` columns (the
// number of floats in one __m128). For every feature and every one of the
// d_+1 `remainder` values the function fills offset_ (index returned by the
// hash table for the vertex key), rank_ and barycentric_. Afterwards M_ is set
// to the hash-table size and blur_neighbors_ receives, for each of the d_+1
// axes and each lattice point, the hash-table indices of its two neighbours.
//
// NOTE(review): loop/branch braces and the #else/#endif lines of the
// __SSE4_1__ conditionals are not present in this listing.
void Permutohedral::init ( const MatrixXf & feature )
    // Compute the lattice coordinates for each feature [there is going to be a lot of magic here]
    N_ = feature.cols();
    d_ = feature.rows();
    HashTable hash_table( d_, N_/**(d_+1)*/ );

    const int blocksize = sizeof(__m128) / sizeof(float);
    const __m128 invdplus1   = _mm_set1_ps( 1.0f / (d_+1) );
    const __m128 dplus1      = _mm_set1_ps( d_+1 );
    const __m128 Zero        = _mm_set1_ps( 0 );
    const __m128 One         = _mm_set1_ps( 1 );

    // Allocate the class memory
    // (sized for N_+16 features, i.e. with slack beyond N_ for the last, partially filled block)
    offset_.resize( (d_+1)*(N_+16) );
    std::fill( offset_.begin(), offset_.end(), 0 );
    barycentric_.resize( (d_+1)*(N_+16) );
    std::fill( barycentric_.begin(), barycentric_.end(), 0 );
    rank_.resize( (d_+1)*(N_+16) );

    // Allocate the local memory
    // (16-byte aligned because the buffers are accessed as __m128)
    __m128 * scale_factor = (__m128*) _mm_malloc( (d_  )*sizeof(__m128) , 16 );
    __m128 * f            = (__m128*) _mm_malloc( (d_  )*sizeof(__m128) , 16 );
    __m128 * elevated     = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 );
    __m128 * rem0         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 );
    __m128 * rank         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 );
    float * barycentric = new float[(d_+2)*blocksize];
    short * canonical = new short[(d_+1)*(d_+1)];
    short * key = new short[d_+1];

    // Compute the canonical simplex
    // (row i holds i in its first d_-i+1 entries and i-(d_+1) in the rest)
    for( int i=0; i<=d_; i++ ){
        for( int j=0; j<=d_-i; j++ )
            canonical[i*(d_+1)+j] = i;
        for( int j=d_-i+1; j<=d_; j++ )
            canonical[i*(d_+1)+j] = i - (d_+1);

    // Expected standard deviation of our filter (p.6 in [Adams etal 2010])
    float inv_std_dev = sqrt(2.0 / 3.0)*(d_+1);
    // Compute the diagonal part of E (p.5 in [Adams etal 2010])
    for( int i=0; i<d_; i++ )
        scale_factor[i] = _mm_set1_ps( 1.0 / sqrt( (i+2)*(i+1) ) * inv_std_dev );

    // Setup the SSE rounding
    // (without SSE4.1: save the control/status register and switch to round-to-nearest)
#ifndef __SSE4_1__
    const unsigned int old_rounding = _mm_getcsr();
    _mm_setcsr( (old_rounding&~_MM_ROUND_MASK) | _MM_ROUND_NEAREST );

    // Compute the simplex each feature lies in
    for( int k=0; k<N_; k+=blocksize ){
        // Load the feature from memory
        // (lane i of f[j] is component j of feature k+i; lanes past N_ are zero-filled)
        float * ff = (float*)f;
        for( int j=0; j<d_; j++ )
            for( int i=0; i<blocksize; i++ )
                ff[ j*blocksize + i ] = k+i < N_ ? feature(j,k+i) : 0.0;

        // Elevate the feature ( y = Ep, see p.5 in [Adams etal 2010])

        // sm contains the sum of 1..n of our feature vector
        __m128 sm = Zero;
        for( int j=d_; j>0; j-- ){
            __m128 cf = f[j-1]*scale_factor[j-1];
            elevated[j] = sm - _mm_set1_ps(j)*cf;
            sm += cf;
        elevated[0] = sm;

        // Find the closest 0-colored simplex through rounding
        __m128 sum = Zero;
        for( int i=0; i<=d_; i++ ){
            __m128 v = invdplus1 * elevated[i];
#ifdef __SSE4_1__
            v = _mm_round_ps( v, _MM_FROUND_TO_NEAREST_INT );
            // Round trip through int32; presumably the fallback used when SSE4.1 is unavailable -- confirm.
            v = _mm_cvtepi32_ps( _mm_cvtps_epi32( v ) );
            rem0[i] = v*dplus1;
            sum += v;

        // Find the simplex we are in and store it in rank (where rank describes what position coordinate i has in the sorted order of the features values)
        for( int i=0; i<=d_; i++ )
            rank[i] = Zero;
        for( int i=0; i<d_; i++ ){
            __m128 di = elevated[i] - rem0[i];
            for( int j=i+1; j<=d_; j++ ){
                __m128 dj = elevated[j] - rem0[j];
                // c is 1.0 in lanes where di < dj and 0.0 elsewhere
                __m128 c = _mm_and_ps( One, _mm_cmplt_ps( di, dj ) );
                rank[i] += c;
                rank[j] += One-c;

        // If the point doesn't lie on the plane (sum != 0) bring it back
        for( int i=0; i<=d_; i++ ){
            rank[i] += sum;
            // Wrap rank back into [0, d_+1): add d_+1 where negative, subtract it where >= d_+1
            __m128 add = _mm_and_ps( dplus1, _mm_cmplt_ps( rank[i], Zero ) );
            __m128 sub = _mm_and_ps( dplus1, _mm_cmpge_ps( rank[i], dplus1 ) );
            rank[i] += add-sub;
            rem0[i] += add-sub;

        // Compute the barycentric coordinates (p.10 in [Adams etal 2010])
        for( int i=0; i<(d_+2)*blocksize; i++ )
            barycentric[ i ] = 0;
        for( int i=0; i<=d_; i++ ){
            __m128 v = (elevated[i] - rem0[i])*invdplus1;

            // Didn't figure out how to SSE this
            float * fv = (float*)&v;
            float * frank = (float*)&rank[i];
            for( int j=0; j<blocksize; j++ ){
                int p = d_-frank[j];
                barycentric[j*(d_+2)+p  ] += fv[j];
                barycentric[j*(d_+2)+p+1] -= fv[j];

        // The rest is not SSE'd
        for( int j=0; j<blocksize; j++ ){
            // Wrap around
            barycentric[j*(d_+2)+0]+= 1 + barycentric[j*(d_+2)+d_+1];

            float * frank = (float*)rank;
            float * frem0 = (float*)rem0;
            // Compute all vertices and their offset
            for( int remainder=0; remainder<=d_; remainder++ ){
                for( int i=0; i<d_; i++ ){
                    key[i] = frem0[i*blocksize+j] + canonical[ remainder*(d_+1) + (int)frank[i*blocksize+j] ];
                // find(key, true): presumably inserts the key when it is missing -- confirm against HashTable
                offset_[ (j+k)*(d_+1)+remainder ] = hash_table.find( key, true );
                rank_[ (j+k)*(d_+1)+remainder ] = frank[remainder*blocksize+j];
                barycentric_[ (j+k)*(d_+1)+remainder ] = barycentric[ j*(d_+2)+remainder ];
    _mm_free( scale_factor );
    _mm_free( f );
    _mm_free( elevated );
    _mm_free( rem0 );
    _mm_free( rank );
    delete [] barycentric;
    delete [] canonical;
    delete [] key;

    // Reset the SSE rounding
#ifndef __SSE4_1__
    _mm_setcsr( old_rounding );

    // This is normally fast enough so no SSE needed here
    // Find the Neighbors of each lattice point

    // Get the number of vertices in the lattice
    M_ = hash_table.size();

    // Create the neighborhood structure
    blur_neighbors_.resize( (d_+1)*M_ );

    short * n1 = new short[d_+1];
    short * n2 = new short[d_+1];

    // For each of d+1 axes,
    for( int j = 0; j <= d_; j++ ){
        for( int i=0; i<M_; i++ ){
            const short * key = hash_table.getKey( i );
            // Neighbour keys: every coordinate shifted by -1 (n1) / +1 (n2) ...
            for( int k=0; k<d_; k++ ){
                n1[k] = key[k] - 1;
                n2[k] = key[k] + 1;
            // ... except along axis j, which is shifted by +d_ (n1) / -d_ (n2).
            n1[j] = key[j] + d_;
            n2[j] = key[j] - d_;

            blur_neighbors_[j*M_+i].n1 = hash_table.find( n1 );
            blur_neighbors_[j*M_+i].n2 = hash_table.find( n2 );
    delete[] n1;
    delete[] n2;
Пример #24
	// ----------------------------------------------------------
	//  Name:   matrix::Inverse
	//  Desc:   Inverse the 4x4 matrix. Matrix is set to inv[M].
	//  Note:   In case of non-inversable matrix, sets the matrix
	//				to a Zero matrix (depending on the switch).
	//  Return: The determinant (low float of `det`) in the
	//				_M_IX86 implementation; the other path is a
	//				TODO that returns 0.0f.
	//  NOTE(review): the braces and the #else line of this
	//				function are not present in this listing, and
	//				two statements below are cut off mid-expression.
	// ----------------------------------------------------------
	float matrix::Inverse() {
#ifdef _M_IX86
		// The inverse is calculated using "Divide and Conquer" 
		// technique. The original matrix is divide into four
		// 2x2 sub-matrices. Since each register holds four matrix
		// element, the smaller matrices are represented as a
		// registers. Hence we get a better locality of the 
		// calculations.

		// the four sub-matrices
		F32vec4 A = _mm_movelh_ps(_L1, _L2),     
			B = _mm_movehl_ps(_L2, _L1),
			C = _mm_movelh_ps(_L3, _L4),
			D = _mm_movehl_ps(_L4, _L3);
		// partial inverse of the sub-matrices
		F32vec4 iA, iB, iC, iD,	DC, AB;
		// determinant of the sub-matrices
		F32vec1 dA, dB, dC, dD;
		F32vec1 det, d, d1, d2;
		F32vec4 rd;

		//  AB = A# * B
		AB = _mm_mul_ps(_mm_shuffle_ps(A,A,0x0F), B);
		// NOTE(review): the second operand of this _mm_mul_ps call is missing from the listing.
		AB -= (F32vec4)_mm_mul_ps(_mm_shuffle_ps(A,A,0xA5), 
		//  DC = D# * C
		DC = _mm_mul_ps(_mm_shuffle_ps(D,D,0x0F), C);
		// NOTE(review): the second operand of this _mm_mul_ps call is missing from the listing.
		DC -= (F32vec4)_mm_mul_ps(_mm_shuffle_ps(D,D,0xA5), 

		//  dA = |A|
		dA = _mm_mul_ps(_mm_shuffle_ps(A, A, 0x5F),A);
		dA = _mm_sub_ss(dA, _mm_movehl_ps(dA,dA));
		//  dB = |B|
		dB = _mm_mul_ps(_mm_shuffle_ps(B, B, 0x5F),B);
		dB = _mm_sub_ss(dB, _mm_movehl_ps(dB,dB));

		//  dC = |C|
		dC = _mm_mul_ps(_mm_shuffle_ps(C, C, 0x5F),C);
		dC = _mm_sub_ss(dC, _mm_movehl_ps(dC,dC));
		//  dD = |D|
		dD = _mm_mul_ps(_mm_shuffle_ps(D, D, 0x5F),D);
		dD = _mm_sub_ss(dD, _mm_movehl_ps(dD,dD));

		//  d = trace(AB*DC) = trace(A#*B*D#*C)
		d = _mm_mul_ps(_mm_shuffle_ps(DC,DC,0xD8),AB);

		//  iD = C*A#*B
		iD = _mm_mul_ps(
			_mm_shuffle_ps(C,C,0xA0), _mm_movelh_ps(AB,AB));
		iD += (F32vec4)_mm_mul_ps(
			_mm_shuffle_ps(C,C,0xF5), _mm_movehl_ps(AB,AB));
		//  iA = B*D#*C
		iA = _mm_mul_ps(
			_mm_shuffle_ps(B,B,0xA0), _mm_movelh_ps(DC,DC));
		iA += (F32vec4)_mm_mul_ps(
			_mm_shuffle_ps(B,B,0xF5), _mm_movehl_ps(DC,DC));

		//  d = trace(AB*DC) = trace(A#*B*D#*C) [continue]
		//  (horizontal add of the four products into the low element)
		d = _mm_add_ps(d, _mm_movehl_ps(d, d));
		d = _mm_add_ss(d, _mm_shuffle_ps(d, d, 1));
		d1 = dA*dD;
		d2 = dB*dC;

		//  iD = D*|A| - C*A#*B
		iD = D*_mm_shuffle_ps(dA,dA,0) - iD;

		//  iA = A*|D| - B*D#*C;
		iA = A*_mm_shuffle_ps(dD,dD,0) - iA;

		//  det = |A|*|D| + |B|*|C| - trace(A#*B*D#*C)
		det = d1+d2-d;
		rd = (__m128)(F32vec1(1.0f)/det);
		//  Zero the reciprocal when det == 0, so that every result
		//  block below is multiplied by zero for a singular matrix.
		rd = _mm_and_ps(_mm_cmpneq_ss(det,_mm_setzero_ps()), rd);

		//  iB = D * (A#B)# = D*B#*A
		iB = _mm_mul_ps(D, _mm_shuffle_ps(AB,AB,0x33));
		iB -= (F32vec4)_mm_mul_ps(
			_mm_shuffle_ps(D,D,0xB1), _mm_shuffle_ps(AB,AB,0x66));
		//  iC = A * (D#C)# = A*C#*D
		iC = _mm_mul_ps(A, _mm_shuffle_ps(DC,DC,0x33));
		iC -= (F32vec4)_mm_mul_ps(
			_mm_shuffle_ps(A,A,0xB1), _mm_shuffle_ps(DC,DC,0x66));

		//  Broadcast 1/det to all four lanes, then xor with Sign_PNNP
		//  (presumably a +,-,-,+ sign-bit pattern -- confirm).
		rd = _mm_shuffle_ps(rd,rd,0);
		rd ^= Sign_PNNP;

		//  iB = C*|B| - D*B#*A
		iB = C*_mm_shuffle_ps(dB,dB,0) - iB;

		//  iC = B*|C| - A*C#*D;
		iC = B*_mm_shuffle_ps(dC,dC,0) - iC;

		//  iX = iX / det
		iA *= rd;
		iB *= rd;
		iC *= rd;
		iD *= rd;

		//  Reassemble the four rows from the 2x2 result blocks.
		_L1 = _mm_shuffle_ps(iA,iB,0x77);
		_L2 = _mm_shuffle_ps(iA,iB,0x22);
		_L3 = _mm_shuffle_ps(iC,iD,0x77);
		_L4 = _mm_shuffle_ps(iC,iD,0x22);

		return *(float*)&det;
		// TODO
		return 0.0f;
#endif // _M_IX86
Пример #25
/* Function:  esl_sse_logf()
 * Synopsis:  <r[z] = log x[z]>
 * Incept:    SRE, Fri Dec 14 11:32:54 2007 [Janelia]
 * Purpose:   Given a vector <x> containing four floats, returns a
 *            vector <r> in which each element <r[z] = logf(x[z])>.
 *            Valid in the domain $x_z > 0$ for normalized IEEE754
 *            $x_z$.
 *            For <x> $< 0$, including -0, returns <NaN>. For <x> $==
 *            0$ or subnormal <x>, returns <-inf>. For <x = inf>,
 *            returns <inf>. For <x = NaN>, returns <NaN>.
 * Xref:      J2/71.
 * Note:      Derived from an SSE1 implementation by Julian
 *            Pommier. Converted to SSE2 and added handling
 *            of IEEE754 specials.
esl_sse_logf(__m128 x) 
  static float cephes_p[9] = {  7.0376836292E-2f, -1.1514610310E-1f,  1.1676998740E-1f,
				-1.2420140846E-1f, 1.4249322787E-1f, -1.6668057665E-1f,
				2.0000714765E-1f, -2.4999993993E-1f,  3.3333331174E-1f };
  __m128  onev = _mm_set1_ps(1.0f);          /* all elem = 1.0 */
  __m128  v0p5 = _mm_set1_ps(0.5f);          /* all elem = 0.5 */
  __m128i vneg = _mm_set1_epi32(0x80000000); /* all elem have IEEE sign bit up */
  __m128i vexp = _mm_set1_epi32(0x7f800000); /* all elem have IEEE exponent bits up */
  __m128i ei;
  __m128  e;
  __m128  invalid_mask, zero_mask, inf_mask;            /* masks used to handle special IEEE754 inputs */
  __m128  mask;
  __m128  origx;
  __m128  tmp;
  __m128  y;
  __m128  z;

  /* first, split x apart: x = frexpf(x, &e); */
  ei           = _mm_srli_epi32( _mm_castps_si128(x), 23);	                                        /* logical shift right 23: ei = biased exponent (bits 0..7); the sign bit lands in bit 8 */
  invalid_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vneg), vneg));  /* mask any elem with the sign bit set (negative, -0, -inf); these become NaN */
  zero_mask    = _mm_castsi128_ps ( _mm_cmpeq_epi32(ei, _mm_setzero_si128()));                          /* mask any elem with sign and exponent bits all zero (+0 or positive subnormal); these become -inf */
  inf_mask     = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vexp), vexp));  /* mask any elem inf or NaN; log(inf)=inf, log(NaN)=NaN      */
  origx        = x;			                                                                /* store original x, used for log(inf) = inf, log(NaN) = NaN */

  x  = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000))); /* clear the 8 exponent bits; sign and the 23 stored significand bits remain */
  x  = _mm_or_ps (x, v0p5);                                          /* OR in the bit pattern of 0.5 (exponent field only): for non-negative input x is now in [0.5,1) */

  ei = _mm_sub_epi32(ei, _mm_set1_epi32(126));                       /* -127 (ei now signed base-2 exponent); then +1             */
  e  = _mm_cvtepi32_ps(ei);

  /* now, calculate the log.
   * The next five statements are the branch-free form of:
   *    if (x < sqrt(1/2)) { e -= 1; x = x + x - 1; } else { x = x - 1; }
   */
  mask = _mm_cmplt_ps(x, _mm_set1_ps(0.707106781186547524f)); /* avoid conditional branches.           */
  tmp  = _mm_and_ps(x, mask);	                              /* tmp contains x values < 0.707, else 0 */
  x    = _mm_sub_ps(x, onev);
  e    = _mm_sub_ps(e, _mm_and_ps(onev, mask));
  x    = _mm_add_ps(x, tmp);
  z    = _mm_mul_ps(x,x);

  /* Horner evaluation: after these lines y = x * (p[8] + p[7] x + ... + p[0] x^8),
   * and the multiply by z = x^2 below makes it x^3 * P(x).
   */
  y =               _mm_set1_ps(cephes_p[0]);    y = _mm_mul_ps(y, x); 
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1]));   y = _mm_mul_ps(y, x);    
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4]));   y = _mm_mul_ps(y, x);    
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[6]));   y = _mm_mul_ps(y, x); 
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[7]));   y = _mm_mul_ps(y, x);  
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[8]));   y = _mm_mul_ps(y, x);
  y = _mm_mul_ps(y, z);

  /* Assemble the result as x + (y - z/2) + e*ln(2). ln(2) is applied in two
   * pieces, 0.693359375 and -2.12194440e-4, which sum to 0.693147181; the
   * small piece is folded into y first, the large piece is added last.
   */
  tmp = _mm_mul_ps(e, _mm_set1_ps(-2.12194440e-4f));
  y   = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, v0p5);
  y   = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, _mm_set1_ps(0.693359375f));
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);

  /* IEEE754 cleanup. esl_sse_select_ps() is defined elsewhere; presumably it
   * takes its second argument in the lanes where the mask is set - confirm.
   * The order matters: -inf first picks up origx, then the OR with the
   * all-ones invalid_mask turns every sign-bit-set lane into NaN.
   */
  x = esl_sse_select_ps(x, origx,                     inf_mask);  /* log(inf)=inf; log(NaN)      = NaN  */
  x = _mm_or_ps(x, invalid_mask);                                 /* log(x<0, including -0,-inf) = NaN  */
  x = esl_sse_select_ps(x, _mm_set1_ps(-eslINFINITY), zero_mask); /* x zero or subnormal         = -inf */
  return x;
Пример #26
// Bitwise AND of two 4-float SSE vectors; forwards directly to _mm_and_ps.
BOOST_FORCEINLINE __m128  __vectorcall operator & ( __m128  const left, __m128  const right ) {
    return _mm_and_ps   ( left, right );
Пример #27
static void SinCos(const float rad, float &sin, float &cos) // #include <emmintrin.h>, #include <xmmintrin.h>
	const __m128 _ps_fopi = _mm_set1_ps(4.0f / pi);

	const __m128 _ps_0p5 = _mm_set1_ps(0.5f);
	const __m128 _ps_1   = _mm_set1_ps(1.0f);

	const __m128 _ps_dp1 = _mm_set1_ps(-0.7851562f);
	const __m128 _ps_dp2 = _mm_set1_ps(-2.4187564849853515625e-4f);
	const __m128 _ps_dp3 = _mm_set1_ps(-3.77489497744594108e-8f);

	const __m128 _ps_sincof_p0 = _mm_set1_ps(2.443315711809948e-5f);
	const __m128 _ps_sincof_p1 = _mm_set1_ps(8.3321608736e-3f);
	const __m128 _ps_sincof_p2 = _mm_set1_ps(-1.6666654611e-1f);
	const __m128 _ps_coscof_p0 = _mm_set1_ps(2.443315711809948e-5f);
	const __m128 _ps_coscof_p1 = _mm_set1_ps(-1.388731625493765e-3f);
	const __m128 _ps_coscof_p2 = _mm_set1_ps(4.166664568298827e-2f);

	const __m128i _pi32_1  = _mm_set1_epi32(1);
	const __m128i _pi32_i1 = _mm_set1_epi32(~1);
	const __m128i _pi32_2  = _mm_set1_epi32(2);
	const __m128i _pi32_4  = _mm_set1_epi32(4);

	const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
	const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

	__m128  mm1,  mm2;
	__m128i mmi0, mmi2, mmi4;

	__m128 x, y, z;
	__m128 y1,  y2;

	__m128 a = _mm_set1_ps(rad);

	x = _mm_and_ps(a, _mask_sign_inv);
	y = _mm_mul_ps(x, _ps_fopi);

	mmi2 = _mm_cvtps_epi32(y);
	mmi2 = _mm_add_epi32(mmi2, _pi32_1);
	mmi2 = _mm_and_si128(mmi2, _pi32_i1);
	y    = _mm_cvtepi32_ps(mmi2);

	mmi4 = mmi2;

	mmi0 = _mm_and_si128(mmi2, _pi32_4);
	mmi0 = _mm_slli_epi32(mmi0, 29);
	__m128 swap_sign_bit_sin = _mm_castsi128_ps(mmi0);

	mmi2 = _mm_and_si128(mmi2, _pi32_2);
	mmi2 = _mm_cmpeq_epi32(mmi2, _mm_setzero_si128());
	__m128 poly_mask = _mm_castsi128_ps(mmi2);

	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp1));
	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp2));
	x = _mm_add_ps(x, _mm_mul_ps(y, _ps_dp3));

	mmi4 = _mm_sub_epi32(mmi4, _pi32_2);
	mmi4 = _mm_andnot_si128(mmi4, _pi32_4);
	mmi4 = _mm_slli_epi32(mmi4, 29);

	__m128 sign_bit_cos = _mm_castsi128_ps(mmi4);
	__m128 sign_bit_sin = _mm_xor_ps(_mm_and_ps(a, _mask_sign_raw), swap_sign_bit_sin);

	z  = _mm_mul_ps(x, x);

	y1 = _mm_mul_ps(_ps_coscof_p0, z);
	y1 = _mm_add_ps(y1, _ps_coscof_p1);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_add_ps(y1, _ps_coscof_p2);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_mul_ps(y1, z);
	y1 = _mm_sub_ps(y1, _mm_mul_ps(z, _ps_0p5));
	y1 = _mm_add_ps(y1, _ps_1);

	y2 = _mm_mul_ps(_ps_sincof_p0, z);
	y2 = _mm_add_ps(y2, _ps_sincof_p1);
	y2 = _mm_mul_ps(y2, z);
	y2 = _mm_add_ps(y2, _ps_sincof_p2);
	y2 = _mm_mul_ps(y2, z);
	y2 = _mm_mul_ps(y2, x);
	y2 = _mm_add_ps(y2, x);

	__m128 sin1y = _mm_andnot_ps(poly_mask, y1);
	__m128 sin2y = _mm_and_ps(poly_mask, y2);

	mm1 = _mm_add_ps(sin1y, sin2y);
	mm2 = _mm_add_ps(_mm_sub_ps(y1, sin1y), _mm_sub_ps(y2, sin2y));

	sin = _mm_cvtss_f32(_mm_xor_ps(mm1, sign_bit_sin));
	cos = _mm_cvtss_f32(_mm_xor_ps(mm2, sign_bit_cos));
Пример #28
/* Keeps the lanes of v whose magnitude is strictly greater than thresh and
 * replaces every other lane with +0.0f. */
static inline __m128 dct_hard_threshold_ps(__m128 v, __m128 thresh)
{
	/* andnot with the sign-bit pattern clears the sign: |v| per lane. */
	const __m128 sign_bit = _mm_set1_ps(-0.0f);
	const __m128 keep = _mm_cmpgt_ps(_mm_andnot_ps(sign_bit, v), thresh);

	/* keep is all-ones or all-zeros per lane, so AND selects v or +0.0f. */
	return _mm_and_ps(v, keep);
}

/*
 * 8-point forward DCT down four adjacent columns of a block stored with a row
 * stride of 8 floats, with hard thresholding of the resulting coefficients.
 *
 * x       input; rows are read at x + 8*k (k = 0..7), four floats per row.
 *         Must be 16-byte aligned (_mm_load_ps).
 * y       output; row k of the transform is written to y + 8*k, four floats
 *         per row. Must be 16-byte aligned (_mm_store_ps). Only these 32
 *         floats are written.
 * thresh  coefficients with |value| <= thresh are written as 0, except y[0]
 *         (row 0, first column), which is always stored unthresholded.
 *
 * Uses SSE1 intrinsics only.
 */
void fDCT2D8x4_and_threshold_keep00_32f(const float* x, float* y, float thresh)
{
	const __m128 mth = _mm_set1_ps(thresh);

	/* Stage 1: butterflies between row k and row 7-k.
	 * Scalar reference:
	 *   c1 = x[0]; c2 = x[7]; t0 = c1 + c2; t7 = c1 - c2;
	 *   c1 = x[1]; c2 = x[6]; t1 = c1 + c2; t6 = c1 - c2;
	 *   c1 = x[2]; c2 = x[5]; t2 = c1 + c2; t5 = c1 - c2;
	 *   c1 = x[3]; c2 = x[4]; t3 = c1 + c2; t4 = c1 - c2;
	 */
	__m128 c0 = _mm_load_ps(x);
	__m128 c1 = _mm_load_ps(x + 56);
	__m128 t0 = _mm_add_ps(c0, c1);
	__m128 t7 = _mm_sub_ps(c0, c1);

	c1 = _mm_load_ps(x + 48);
	c0 = _mm_load_ps(x + 8);
	__m128 t1 = _mm_add_ps(c0, c1);
	__m128 t6 = _mm_sub_ps(c0, c1);

	c1 = _mm_load_ps(x + 40);
	c0 = _mm_load_ps(x + 16);
	__m128 t2 = _mm_add_ps(c0, c1);
	__m128 t5 = _mm_sub_ps(c0, c1);

	c0 = _mm_load_ps(x + 24);
	c1 = _mm_load_ps(x + 32);
	__m128 t3 = _mm_add_ps(c0, c1);
	__m128 t4 = _mm_sub_ps(c0, c1);

	/* Stage 2 (even part).
	 * Scalar reference:
	 *   c0 = t0 + t3; c3 = t0 - t3;
	 *   c1 = t1 + t2; c2 = t1 - t2;
	 */
	c0 = _mm_add_ps(t0, t3);
	__m128 c3 = _mm_sub_ps(t0, t3);
	c1 = _mm_add_ps(t1, t2);
	__m128 c2 = _mm_sub_ps(t1, t2);

	/* Output scale applied to every coefficient. */
	const __m128 invsqrt2h = _mm_set_ps1(0.353554f);

	/* Rows 0 and 4. Scalar reference: y[0] = c0 + c1; y[4] = c0 - c1; */
	__m128 v = _mm_mul_ps(_mm_add_ps(c0, c1), invsqrt2h);
	/* keep 00 coef.: lane 0 is taken from the unthresholded value. */
	_mm_store_ps(y, _mm_move_ss(dct_hard_threshold_ps(v, mth), v));

	v = _mm_mul_ps(_mm_sub_ps(c0, c1), invsqrt2h);
	_mm_store_ps(y + 32, dct_hard_threshold_ps(v, mth));

	/* Rows 2 and 6.
	 * Scalar reference:
	 *   y[2] = c2 * r[6] + c3 * r[2];
	 *   y[6] = c3 * r[6] - c2 * r[2];
	 */
	__m128 w0 = _mm_set_ps1(0.541196f);
	__m128 w1 = _mm_set_ps1(1.306563f);
	v = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(w0, c2), _mm_mul_ps(w1, c3)), invsqrt2h);
	_mm_store_ps(y + 16, dct_hard_threshold_ps(v, mth));

	v = _mm_mul_ps(_mm_sub_ps(_mm_mul_ps(w0, c3), _mm_mul_ps(w1, c2)), invsqrt2h);
	_mm_store_ps(y + 48, dct_hard_threshold_ps(v, mth));

	/* Odd part.
	 * Scalar reference:
	 *   c3 = t4 * r[3] + t7 * r[5];
	 *   c0 = t7 * r[3] - t4 * r[5];
	 */
	w0 = _mm_set_ps1(1.175876f);
	w1 = _mm_set_ps1(0.785695f);
	c3 = _mm_add_ps(_mm_mul_ps(w0, t4), _mm_mul_ps(w1, t7));
	c0 = _mm_sub_ps(_mm_mul_ps(w0, t7), _mm_mul_ps(w1, t4));

	/* Scalar reference:
	 *   c2 = t5 * r[1] + t6 * r[7];
	 *   c1 = t6 * r[1] - t5 * r[7];
	 */
	w0 = _mm_set_ps1(1.387040f);
	w1 = _mm_set_ps1(0.275899f);
	c2 = _mm_add_ps(_mm_mul_ps(w0, t5), _mm_mul_ps(w1, t6));
	c1 = _mm_sub_ps(_mm_mul_ps(w0, t6), _mm_mul_ps(w1, t5));

	/* Rows 3 and 5. Scalar reference: y[5] = c3 - c1; y[3] = c0 - c2; */
	v = _mm_mul_ps(_mm_sub_ps(c0, c2), invsqrt2h);
	_mm_store_ps(y + 24, dct_hard_threshold_ps(v, mth));

	v = _mm_mul_ps(_mm_sub_ps(c3, c1), invsqrt2h);
	_mm_store_ps(y + 40, dct_hard_threshold_ps(v, mth));

	/* Scalar reference:
	 *   c0 = (c0 + c2) * invsqrt2;
	 *   c3 = (c3 + c1) * invsqrt2;
	 */
	const __m128 invsqrt2 = _mm_set_ps1(0.707107f);
	c0 = _mm_mul_ps(_mm_add_ps(c0, c2), invsqrt2);
	c3 = _mm_mul_ps(_mm_add_ps(c3, c1), invsqrt2);

	/* Rows 1 and 7. Scalar reference: y[1] = c0 + c3; y[7] = c0 - c3; */
	v = _mm_mul_ps(_mm_add_ps(c0, c3), invsqrt2h);
	_mm_store_ps(y + 8, dct_hard_threshold_ps(v, mth));

	v = _mm_mul_ps(_mm_sub_ps(c0, c3), invsqrt2h);
	_mm_store_ps(y + 56, dct_hard_threshold_ps(v, mth));

	/* The scalar reference applied the output scale in a final loop,
	 *   for(i = 0;i < 8;i++) y[i] *= invsqrt2h;
	 * here it is already folded into every store above.
	 */
}
Пример #29
/* evaluation of 4 sines at once, using only SSE1+MMX intrinsics so
it runs also on old Athlon XPs and the Pentium III of your
grandmother.

The code is the exact rewriting of the cephes sinf function.
Precision is excellent as long as x < 8192 (I did not bother to
take into account the special handling they have for greater values
-- it does not return garbage for arguments over 8192, though, but
the extra precision is missing).

Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
surprising but correct result.

Performance is also surprisingly good, 1.33 times faster than the
macos vsinf SSE2 function, and 1.5 times faster than the
__vrs4_sinf of amd's ACML (which is only available in 64 bits). Not
too bad for an SSE1 function (with no special tuning) !
However the latter libraries probably have a much better handling of NaN,
Inf, denormalized and other special arguments.

On my core 1 duo, the execution of this function takes approximately 95 cycles.

From what I have observed on the experiments with Intel AMath lib, switching to an
SSE2 version would improve the perf by only 10%.

Since it is based on SSE intrinsics, it has to be compiled at -O2 to
deliver full speed.

NOTE(review): this listing shows two "ifdef USE_SSE2" directives with no
matching else/endif lines, so the SSE2 (emm0/emm2) and MMX (mm0..mm3)
variants of the octant computation follow one another unconditionally and
swap_sign_bit / poly_mask are declared twice. Several comment terminators
are missing as well. The preprocessor structure has to be restored before
this can compile.
__m128 sin_ps(v4sfu *xPtr) { // any x
   __m128 x=*((__m128 *)xPtr);
   __m128 xmm1, xmm2 = _mm_setzero_ps(), xmm3, sign_bit, y;

#ifdef USE_SSE2
   __m128i emm0, emm2;
   __m64 mm0, mm1, mm2, mm3;
   sign_bit = x;
   /* take the absolute value */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_sign_mask);
   /* extract the sign bit (upper one) */
   sign_bit = _mm_and_ps(sign_bit, *(__m128*)_ps_sign_mask);

   /* scale by 4/Pi */
   y = _mm_mul_ps(x, *(__m128*)_ps_cephes_FOPI);

#ifdef USE_SSE2
   /* store the integer part of y in emm2 (truncating conversion) */
   emm2 = _mm_cvttps_epi32(y);
   /* j=(j+1) & (~1) (see the cephes sources) */
   emm2 = _mm_add_epi32(emm2, *(__m128i*)_pi32_1);
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_inv1);
   y = _mm_cvtepi32_ps(emm2);

   /* get the swap sign flag: bit 2 of j shifted into the float sign-bit position */
   emm0 = _mm_and_si128(emm2, *(__m128i*)_pi32_4);
   emm0 = _mm_slli_epi32(emm0, 29);
   /* get the polynom selection mask 
   there is one polynom for 0 <= x <= Pi/4
   and another one for Pi/4<x<=Pi/2

   Both branches will be computed.
   emm2 = _mm_and_si128(emm2, *(__m128i*)_pi32_2);
   emm2 = _mm_cmpeq_epi32(emm2, _mm_setzero_si128());

   __m128 swap_sign_bit = _mm_castsi128_ps(emm0);
   __m128 poly_mask = _mm_castsi128_ps(emm2);
   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);

   /* store the integer part of y in mm2:mm3 */
   xmm2 = _mm_movehl_ps(xmm2, y);
   mm2 = _mm_cvttps_pi32(y);
   mm3 = _mm_cvttps_pi32(xmm2);
   /* j=(j+1) & (~1) (see the cephes sources) */
   mm2 = _mm_add_pi32(mm2, *(__m64*)_pi32_1);
   mm3 = _mm_add_pi32(mm3, *(__m64*)_pi32_1);
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_inv1);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_inv1);
   y = _mm_cvtpi32x2_ps(mm2, mm3);
   /* get the swap sign flag */
   mm0 = _mm_and_si64(mm2, *(__m64*)_pi32_4);
   mm1 = _mm_and_si64(mm3, *(__m64*)_pi32_4);
   mm0 = _mm_slli_pi32(mm0, 29);
   mm1 = _mm_slli_pi32(mm1, 29);
   /* get the polynom selection mask */
   mm2 = _mm_and_si64(mm2, *(__m64*)_pi32_2);
   mm3 = _mm_and_si64(mm3, *(__m64*)_pi32_2);
   mm2 = _mm_cmpeq_pi32(mm2, _mm_setzero_si64());
   mm3 = _mm_cmpeq_pi32(mm3, _mm_setzero_si64());
   __m128 swap_sign_bit, poly_mask;
   COPY_MM_TO_XMM(mm0, mm1, swap_sign_bit);
   COPY_MM_TO_XMM(mm2, mm3, poly_mask);
   sign_bit = _mm_xor_ps(sign_bit, swap_sign_bit);
   _mm_empty(); /* good-bye mmx */

   /* The magic pass: "Extended precision modular arithmetic"
   x = ((x - y * DP1) - y * DP2) - y * DP3;
   The three constants are loaded already negated (see their names), so the
   subtraction is carried out with multiplies and adds. */
   xmm1 = *(__m128*)_ps_minus_cephes_DP1;
   xmm2 = *(__m128*)_ps_minus_cephes_DP2;
   xmm3 = *(__m128*)_ps_minus_cephes_DP3;
   xmm1 = _mm_mul_ps(y, xmm1);
   xmm2 = _mm_mul_ps(y, xmm2);
   xmm3 = _mm_mul_ps(y, xmm3);
   x = _mm_add_ps(x, xmm1);
   x = _mm_add_ps(x, xmm2);
   x = _mm_add_ps(x, xmm3);

   /* Evaluate the first polynomial, built from the _ps_coscof_* coefficients:
      y = 1 - z/2 + z*z*(p2 + z*(p1 + z*p0)) with z = x*x.
      It is the one kept below where poly_mask is clear. */
   y = *(__m128*)_ps_coscof_p0;
   __m128 z = _mm_mul_ps(x,x);

   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p1);
   y = _mm_mul_ps(y, z);
   y = _mm_add_ps(y, *(__m128*)_ps_coscof_p2);
   y = _mm_mul_ps(y, z);
   y = _mm_mul_ps(y, z);
   __m128 tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);
   y = _mm_add_ps(y, *(__m128*)_ps_1);

   /* Evaluate the second polynomial, built from the _ps_sincof_* coefficients:
      y2 = x + x*z*(p2 + z*(p1 + z*p0)).
      It is the one kept below where poly_mask is set. */

   __m128 y2 = *(__m128*)_ps_sincof_p0;
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p1);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_add_ps(y2, *(__m128*)_ps_sincof_p2);
   y2 = _mm_mul_ps(y2, z);
   y2 = _mm_mul_ps(y2, x);
   y2 = _mm_add_ps(y2, x);

   /* select the correct result from the two polynomials: the masked halves
      are disjoint per lane, so the add merges them */
   xmm3 = poly_mask;
   y2 = _mm_and_ps(xmm3, y2);
   y = _mm_andnot_ps(xmm3, y);
   y = _mm_add_ps(y,y2);
   /* update the sign */
   y = _mm_xor_ps(y, sign_bit);
   return y;
Пример #30
static void compute_step_tv2_inner_simd(unsigned w, unsigned h, unsigned nchannel, struct aux auxs[nchannel], float alpha, unsigned x, unsigned y, double *tv2) {
        __m128 g_xxs[3] = {0};
        __m128 g_xy_syms[3] = {0};
        __m128 g_yys[3] = {0};

        const __m128 mtwo = _mm_set_ps1(2.);
        const __m128 minf = _mm_set_ps1(INFINITY);
        const __m128 mzero = _mm_set_ps1(0.);

        __m128 malpha = _mm_set_ps1(alpha * 1./sqrtf(nchannel));

        for(unsigned c = 0; c < nchannel; c++) {
                struct aux *aux = &auxs[c];

                __m128 g_x = _mm_load_ps(p(aux->temp[0], x, y, w, h));
                __m128 g_y = _mm_load_ps(p(aux->temp[1], x, y, w, h));

                // backward x
                g_xxs[c] = g_x - _mm_loadu_ps(p(aux->temp[0], x-1, y, w, h));
                // backward x
                __m128 g_yx = g_y - _mm_loadu_ps(p(aux->temp[1], x-1, y, w, h));
                // backward y
                __m128 g_xy = g_x - _mm_load_ps(p(aux->temp[0], x, y-1, w, h));
                // backward y
                g_yys[c] = g_y - _mm_load_ps(p(aux->temp[1], x, y-1, w, h));
                // symmetrize
                g_xy_syms[c] = (g_xy + g_yx) / mtwo;

        // norm
        __m128 g2_norm = mzero;
        for(unsigned c = 0; c < nchannel; c++) {
                g2_norm += SQR(g_xxs[c]) + mtwo * SQR(g_xy_syms[c]) + SQR(g_yys[c]);
        g2_norm = _mm_sqrt_ps(g2_norm);

        __m128 alpha_norm = malpha * g2_norm;
        *tv2 += alpha_norm[0];
        *tv2 += alpha_norm[1];
        *tv2 += alpha_norm[2];
        *tv2 += alpha_norm[3];

        // set zeroes to infinity
        g2_norm = _mm_or_ps(g2_norm, _mm_and_ps(minf, _mm_cmpeq_ps(g2_norm, mzero)));

        for(unsigned c = 0; c < nchannel; c++) {
                __m128 g_xx = g_xxs[c];
                __m128 g_yy = g_yys[c];
                __m128 g_xy_sym = g_xy_syms[c];
                struct aux *aux = &auxs[c];

                // N.B. for same exact result as the c version,
                // we must calculate the objective gradient from right to left
                        float *pobj_ur = p(aux->obj_gradient, x+1, y-1, w, h);
                        __m128 obj_ur = _mm_loadu_ps(pobj_ur);
                        obj_ur += malpha * ((-g_xy_sym) / g2_norm);
                        _mm_storeu_ps(pobj_ur, obj_ur);

                        float *pobj_r = p(aux->obj_gradient, x+1, y, w, h);
                        __m128 obj_r = _mm_loadu_ps(pobj_r);
                        obj_r += malpha * ((g_xy_sym + g_xx) / g2_norm);
                        _mm_storeu_ps(pobj_r, obj_r);

                        float *pobj_u = p(aux->obj_gradient, x, y-1, w, h);
                        __m128 obj_u = _mm_load_ps(pobj_u);
                        obj_u += malpha * ((g_yy + g_xy_sym) / g2_norm);
                        _mm_store_ps(pobj_u, obj_u);

                        float *pobj = p(aux->obj_gradient, x, y, w, h);
                        __m128 obj = _mm_load_ps(pobj);
                        obj += malpha * (-(mtwo * g_xx + mtwo * g_xy_sym + mtwo * g_yy) / g2_norm);
                        _mm_store_ps(pobj, obj);

                        float *pobj_b = p(aux->obj_gradient, x, y+1, w, h);
                        __m128 obj_b = _mm_load_ps(pobj_b);
                        obj_b += malpha * ((g_yy + g_xy_sym) / g2_norm);
                        _mm_store_ps(pobj_b, obj_b);

                        float *pobj_l = p(aux->obj_gradient, x-1, y, w, h);
                        __m128 obj_l = _mm_loadu_ps(pobj_l);
                        obj_l += malpha * ((g_xy_sym + g_xx) / g2_norm);
                        _mm_storeu_ps(pobj_l, obj_l);

                        float *pobj_lb = p(aux->obj_gradient, x-1, y+1, w, h);
                        __m128 obj_lb = _mm_loadu_ps(pobj_lb);
                        obj_lb += malpha * ((-g_xy_sym) / g2_norm);
                        _mm_storeu_ps(pobj_lb, obj_lb);