Code Example #1
/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
__m128 log_ps(v4sfu *xPtr) {
   __m128 x=*((__m128 *)xPtr);
#ifdef USE_SSE2
   __m128i emm0;
#else
   __m64 mm0, mm1;
#endif
   __m128 one = *(__m128*)_ps_1;

   __m128 invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

   x = _mm_max_ps(x, *(__m128*)_ps_min_norm_pos);  /* cut off denormalized stuff */

#ifndef USE_SSE2
   /* part 1: x = frexpf(x, &e); */
   COPY_XMM_TO_MM(x, mm0, mm1);
   mm0 = _mm_srli_pi32(mm0, 23);
   mm1 = _mm_srli_pi32(mm1, 23);
#else
   emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
#endif
   /* keep only the fractional part */
   x = _mm_and_ps(x, *(__m128*)_ps_inv_mant_mask);
   x = _mm_or_ps(x, *(__m128*)_ps_0p5);

#ifndef USE_SSE2
   /* now e=mm0:mm1 contains the true base-2 exponent */
   mm0 = _mm_sub_pi32(mm0, *(__m64*)_pi32_0x7f);
   mm1 = _mm_sub_pi32(mm1, *(__m64*)_pi32_0x7f);
   __m128 e = _mm_cvtpi32x2_ps(mm0, mm1);
   _mm_empty(); /* bye bye mmx */
#else
   emm0 = _mm_sub_epi32(emm0, *(__m128i*)_pi32_0x7f);
   __m128 e = _mm_cvtepi32_ps(emm0);
#endif

   e = _mm_add_ps(e, one);

   /* part2: 
   if( x < SQRTHF ) {
   e -= 1;
   x = x + x - 1.0;
   } else { x = x - 1.0; }
   */
   __m128 mask = _mm_cmplt_ps(x, *(__m128*)_ps_cephes_SQRTHF);
   __m128 tmp = _mm_and_ps(x, mask);
   x = _mm_sub_ps(x, one);
   e = _mm_sub_ps(e, _mm_and_ps(one, mask));
   x = _mm_add_ps(x, tmp);


   __m128 z = _mm_mul_ps(x,x);

   __m128 y = *(__m128*)_ps_cephes_log_p0;
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p1);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p2);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p3);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p4);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p5);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p6);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p7);
   y = _mm_mul_ps(y, x);
   y = _mm_add_ps(y, *(__m128*)_ps_cephes_log_p8);
   y = _mm_mul_ps(y, x);

   y = _mm_mul_ps(y, z);


   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q1);
   y = _mm_add_ps(y, tmp);


   tmp = _mm_mul_ps(z, *(__m128*)_ps_0p5);
   y = _mm_sub_ps(y, tmp);

   tmp = _mm_mul_ps(e, *(__m128*)_ps_cephes_log_q2);
   x = _mm_add_ps(x, y);
   x = _mm_add_ps(x, tmp);
   x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
   return x;
}
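A minimal usage sketch for the routine above (an assumption on my part: v4sfu is the 16-byte, 4-float wrapper type used by this sse_mathfun-style header, so its address can be treated as float storage):

/* Hedged example: exercises log_ps on four lanes. */
void log_ps_demo(void) {
   v4sfu in;                                   /* assumed: holds 4 packed floats */
   float out[4];
   _mm_storeu_ps((float *)&in, _mm_setr_ps(0.5f, 1.0f, 2.718282f, 10.0f));
   _mm_storeu_ps(out, log_ps(&in));
   /* out[0..3] ~= {-0.6931, 0.0, 1.0, 2.3026}; a lane with x <= 0 would be NaN. */
}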
Code Example #2
File: sse.hpp Project: berak/opencv_smallfry
RETf OR( const __m128 x, const __m128 y ) { return _mm_or_ps(x,y); }
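The one-line wrapper above is typical: on its own, a bitwise OR of float registers is rarely useful, but combined with _mm_and_ps/_mm_andnot_ps and a comparison mask it forms the per-lane select idiom that most of the examples on this page rely on (a pre-SSE4.1 substitute for _mm_blendv_ps). A minimal sketch:

/* result gets a where mask is all-ones, b where mask is all-zeros. */
static inline __m128 select_ps(__m128 mask, __m128 a, __m128 b) {
    return _mm_or_ps(_mm_and_ps(mask, a), _mm_andnot_ps(mask, b));
}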
Code Example #3
File: main.cpp Project: minh0722/HPC2015
inline static void bar(float(&inout)[8])
{
	__m128 leftSideElements[6],
		rightSideElements[6],
		leftGERight[6],
		leftLTRight[6],
		leftElementsGE[6],  // swapped elements on the left part of the comparison
		leftElementsLT[6],  // not-swapped elements on the left part of the comparison
		rightElementsGE[6], // swapped elements on the right part of the comparison
		rightElementsLT[6]; // not-swapped elements on the right part of the comparison
	float resultLeftElements[6][4], resultRightElements[6][4];

	const size_t idx[][2] = {
			{ 0, 1 }, { 3, 2 }, { 4, 5 }, { 7, 6 },
			{ 0, 2 }, { 1, 3 }, { 6, 4 }, { 7, 5 },
			{ 0, 1 }, { 2, 3 }, { 5, 4 }, { 7, 6 },
			{ 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 },
			{ 0, 2 }, { 1, 3 }, { 4, 6 }, { 5, 7 },
			{ 0, 1 }, { 2, 3 }, { 4, 5 }, { 6, 7 }
	};

	// First row
	leftSideElements[0] = _mm_set_ps(inout[idx[3][0]], inout[idx[2][0]], inout[idx[1][0]], inout[idx[0][0]]);
	rightSideElements[0] = _mm_set_ps(inout[idx[3][1]], inout[idx[2][1]], inout[idx[1][1]], inout[idx[0][1]]);

	leftGERight[0] = _mm_cmpge_ps(leftSideElements[0], rightSideElements[0]); // Something like 0 0 -1 -1.
	leftLTRight[0] = _mm_cmplt_ps(leftSideElements[0], rightSideElements[0]); // Something like -1 -1 0 0.

	// Calculates the values of the elements on the left.
	leftElementsGE[0] = _mm_and_ps(rightSideElements[0], leftGERight[0]); // If the element on the left side is greater than or equal to the element on the right side, swap: the left slot takes the element from the right.
	leftElementsLT[0] = _mm_and_ps(leftSideElements[0], leftLTRight[0]);  // If the element on the left side is less than the element on the right side, don't swap: the left slot keeps its own element.

	// Calculates the values of the elements on the right.
	rightElementsGE[0] = _mm_and_ps(leftSideElements[0], leftGERight[0]);  // If the element on the left side is greater than or equal to the element on the right side, swap: the right slot takes the element from the left.
	rightElementsLT[0] = _mm_and_ps(rightSideElements[0], leftLTRight[0]); // If the element on the left side is less than the element on the right side, don't swap: the right slot keeps its own element.

	// Now combine the elements: @leftGERight and @leftLTRight are complementary masks, so a single OR merges the two halves
	// (@leftElementsGE holds something like [0, 0, element, element] and @leftElementsLT holds [element, element, 0, 0]).
	leftSideElements[0] = _mm_or_ps(leftElementsGE[0], leftElementsLT[0]);
	rightSideElements[0] = _mm_or_ps(rightElementsGE[0], rightElementsLT[0]);

	// Now write them out to scratch arrays so they can be put back into their original places in @inout.
	_mm_storeu_ps(resultLeftElements[0], leftSideElements[0]);
	_mm_storeu_ps(resultRightElements[0], rightSideElements[0]);

	// Put the (possibly swapped) elements back into place.
	inout[idx[0][0]] = resultLeftElements[0][0];
	inout[idx[0][1]] = resultRightElements[0][0];
	inout[idx[1][0]] = resultLeftElements[0][1];
	inout[idx[1][1]] = resultRightElements[0][1];
	inout[idx[2][0]] = resultLeftElements[0][2];
	inout[idx[2][1]] = resultRightElements[0][2];
	inout[idx[3][0]] = resultLeftElements[0][3];
	inout[idx[3][1]] = resultRightElements[0][3];

	// Second row
	leftSideElements[1] = _mm_set_ps(inout[idx[7][0]], inout[idx[6][0]], inout[idx[5][0]], inout[idx[4][0]]);
	rightSideElements[1] = _mm_set_ps(inout[idx[7][1]], inout[idx[6][1]], inout[idx[5][1]], inout[idx[4][1]]);

	leftGERight[1] = _mm_cmpge_ps(leftSideElements[1], rightSideElements[1]);
	leftLTRight[1] = _mm_cmplt_ps(leftSideElements[1], rightSideElements[1]);

	leftElementsGE[1] = _mm_and_ps(rightSideElements[1], leftGERight[1]);
	leftElementsLT[1] = _mm_and_ps(leftSideElements[1], leftLTRight[1]);

	rightElementsGE[1] = _mm_and_ps(leftSideElements[1], leftGERight[1]);
	rightElementsLT[1] = _mm_and_ps(rightSideElements[1], leftLTRight[1]);

	leftSideElements[1] = _mm_or_ps(leftElementsGE[1], leftElementsLT[1]);
	rightSideElements[1] = _mm_or_ps(rightElementsGE[1], rightElementsLT[1]);

	_mm_storeu_ps(resultLeftElements[1], leftSideElements[1]);
	_mm_storeu_ps(resultRightElements[1], rightSideElements[1]);

	inout[idx[4][0]] = resultLeftElements[1][0];
	inout[idx[4][1]] = resultRightElements[1][0];
	inout[idx[5][0]] = resultLeftElements[1][1];
	inout[idx[5][1]] = resultRightElements[1][1];
	inout[idx[6][0]] = resultLeftElements[1][2];
	inout[idx[6][1]] = resultRightElements[1][2];
	inout[idx[7][0]] = resultLeftElements[1][3];
	inout[idx[7][1]] = resultRightElements[1][3];

	// Third row
	leftSideElements[2] = _mm_set_ps(inout[idx[11][0]], inout[idx[10][0]], inout[idx[9][0]], inout[idx[8][0]]);
	rightSideElements[2] = _mm_set_ps(inout[idx[11][1]], inout[idx[10][1]], inout[idx[9][1]], inout[idx[8][1]]);

	leftGERight[2] = _mm_cmpge_ps(leftSideElements[2], rightSideElements[2]);
	leftLTRight[2] = _mm_cmplt_ps(leftSideElements[2], rightSideElements[2]);

	leftElementsGE[2] = _mm_and_ps(rightSideElements[2], leftGERight[2]);
	leftElementsLT[2] = _mm_and_ps(leftSideElements[2], leftLTRight[2]);

	rightElementsGE[2] = _mm_and_ps(leftSideElements[2], leftGERight[2]);
	rightElementsLT[2] = _mm_and_ps(rightSideElements[2], leftLTRight[2]);

	leftSideElements[2] = _mm_or_ps(leftElementsGE[2], leftElementsLT[2]);
	rightSideElements[2] = _mm_or_ps(rightElementsGE[2], rightElementsLT[2]);

	_mm_storeu_ps(resultLeftElements[2], leftSideElements[2]);
	_mm_storeu_ps(resultRightElements[2], rightSideElements[2]);

	inout[idx[8][0]] = resultLeftElements[2][0];
	inout[idx[8][1]] = resultRightElements[2][0];
	inout[idx[9][0]] = resultLeftElements[2][1];
	inout[idx[9][1]] = resultRightElements[2][1];
	inout[idx[10][0]] = resultLeftElements[2][2];
	inout[idx[10][1]] = resultRightElements[2][2];
	inout[idx[11][0]] = resultLeftElements[2][3];
	inout[idx[11][1]] = resultRightElements[2][3];

	// Fourth row
	leftSideElements[3] = _mm_set_ps(inout[idx[15][0]], inout[idx[14][0]], inout[idx[13][0]], inout[idx[12][0]]);
	rightSideElements[3] = _mm_set_ps(inout[idx[15][1]], inout[idx[14][1]], inout[idx[13][1]], inout[idx[12][1]]);

	leftGERight[3] = _mm_cmpge_ps(leftSideElements[3], rightSideElements[3]);
	leftLTRight[3] = _mm_cmplt_ps(leftSideElements[3], rightSideElements[3]);

	leftElementsGE[3] = _mm_and_ps(rightSideElements[3], leftGERight[3]);
	leftElementsLT[3] = _mm_and_ps(leftSideElements[3], leftLTRight[3]);

	rightElementsGE[3] = _mm_and_ps(leftSideElements[3], leftGERight[3]);
	rightElementsLT[3] = _mm_and_ps(rightSideElements[3], leftLTRight[3]);

	leftSideElements[3] = _mm_or_ps(leftElementsGE[3], leftElementsLT[3]);
	rightSideElements[3] = _mm_or_ps(rightElementsGE[3], rightElementsLT[3]);

	_mm_storeu_ps(resultLeftElements[3], leftSideElements[3]);
	_mm_storeu_ps(resultRightElements[3], rightSideElements[3]);

	inout[idx[12][0]] = resultLeftElements[3][0];
	inout[idx[12][1]] = resultRightElements[3][0];
	inout[idx[13][0]] = resultLeftElements[3][1];
	inout[idx[13][1]] = resultRightElements[3][1];
	inout[idx[14][0]] = resultLeftElements[3][2];
	inout[idx[14][1]] = resultRightElements[3][2];
	inout[idx[15][0]] = resultLeftElements[3][3];
	inout[idx[15][1]] = resultRightElements[3][3];

	// Fifth row
	leftSideElements[4] = _mm_set_ps(inout[idx[19][0]], inout[idx[18][0]], inout[idx[17][0]], inout[idx[16][0]]);
	rightSideElements[4] = _mm_set_ps(inout[idx[19][1]], inout[idx[18][1]], inout[idx[17][1]], inout[idx[16][1]]);

	leftGERight[4] = _mm_cmpge_ps(leftSideElements[4], rightSideElements[4]);
	leftLTRight[4] = _mm_cmplt_ps(leftSideElements[4], rightSideElements[4]);

	leftElementsGE[4] = _mm_and_ps(rightSideElements[4], leftGERight[4]);
	leftElementsLT[4] = _mm_and_ps(leftSideElements[4], leftLTRight[4]);

	rightElementsGE[4] = _mm_and_ps(leftSideElements[4], leftGERight[4]);
	rightElementsLT[4] = _mm_and_ps(rightSideElements[4], leftLTRight[4]);

	leftSideElements[4] = _mm_or_ps(leftElementsGE[4], leftElementsLT[4]);
	rightSideElements[4] = _mm_or_ps(rightElementsGE[4], rightElementsLT[4]);

	_mm_storeu_ps(resultLeftElements[4], leftSideElements[4]);
	_mm_storeu_ps(resultRightElements[4], rightSideElements[4]);

	inout[idx[16][0]] = resultLeftElements[4][0];
	inout[idx[16][1]] = resultRightElements[4][0];
	inout[idx[17][0]] = resultLeftElements[4][1];
	inout[idx[17][1]] = resultRightElements[4][1];
	inout[idx[18][0]] = resultLeftElements[4][2];
	inout[idx[18][1]] = resultRightElements[4][2];
	inout[idx[19][0]] = resultLeftElements[4][3];
	inout[idx[19][1]] = resultRightElements[4][3];

	// Sixth row
	leftSideElements[5] = _mm_set_ps(inout[idx[23][0]], inout[idx[22][0]], inout[idx[21][0]], inout[idx[20][0]]);
	rightSideElements[5] = _mm_set_ps(inout[idx[23][1]], inout[idx[22][1]], inout[idx[21][1]], inout[idx[20][1]]);

	leftGERight[5] = _mm_cmpge_ps(leftSideElements[5], rightSideElements[5]);
	leftLTRight[5] = _mm_cmplt_ps(leftSideElements[5], rightSideElements[5]);

	leftElementsGE[5] = _mm_and_ps(rightSideElements[5], leftGERight[5]);
	leftElementsLT[5] = _mm_and_ps(leftSideElements[5], leftLTRight[5]);

	rightElementsGE[5] = _mm_and_ps(leftSideElements[5], leftGERight[5]);
	rightElementsLT[5] = _mm_and_ps(rightSideElements[5], leftLTRight[5]);

	leftSideElements[5] = _mm_or_ps(leftElementsGE[5], leftElementsLT[5]);
	rightSideElements[5] = _mm_or_ps(rightElementsGE[5], rightElementsLT[5]);

	_mm_storeu_ps(resultLeftElements[5], leftSideElements[5]);
	_mm_storeu_ps(resultRightElements[5], rightSideElements[5]);

	inout[idx[20][0]] = resultLeftElements[5][0];
	inout[idx[20][1]] = resultRightElements[5][0];
	inout[idx[21][0]] = resultLeftElements[5][1];
	inout[idx[21][1]] = resultRightElements[5][1];
	inout[idx[22][0]] = resultLeftElements[5][2];
	inout[idx[22][1]] = resultRightElements[5][2];
	inout[idx[23][0]] = resultLeftElements[5][3];
	inout[idx[23][1]] = resultRightElements[5][3];
}
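For context (not stated in the original project): the idx table above appears to encode the six compare-exchange rows of a sorting network for 8 floats, and each row leaves the smaller value in idx[k][0] and the larger in idx[k][1]. A hedged sketch of the same per-row step written with min/max, which produces the same lanes for non-NaN input:

/* One compare-exchange row: left lanes end up with the pairwise minima,
   right lanes with the pairwise maxima (matches the cmpge/and/or version). */
static inline void compare_exchange_row(__m128 *left, __m128 *right) {
	const __m128 lo = _mm_min_ps(*left, *right);
	const __m128 hi = _mm_max_ps(*left, *right);
	*left  = lo;
	*right = hi;
}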
Code Example #4
File: colorout.c Project: cherrot/darktable
void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->softproof_enabled == DT_SOFTPROOF_GAMUTCHECK);

  if(!isnan(d->cmatrix[0]))
  {
    //fprintf(stderr,"Using cmatrix codepath\n");
    // convert to rgb using matrix
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {

      float *in  = (float*)ivoid + (size_t)ch*roi_in->width *j;
      float *out = (float*)ovoid + (size_t)ch*roi_out->width*j;
      const __m128 m0 = _mm_set_ps(0.0f,d->cmatrix[6],d->cmatrix[3],d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f,d->cmatrix[7],d->cmatrix[4],d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f,d->cmatrix[8],d->cmatrix[5],d->cmatrix[2]);

      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
      {
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        const __m128 t = _mm_add_ps(_mm_mul_ps(m0,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(0,0,0,0))),_mm_add_ps(_mm_mul_ps(m1,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(1,1,1,1))),_mm_mul_ps(m2,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(2,2,2,2)))));

        _mm_stream_ps(out,t);
      }
    }
    _mm_sfence();
    // apply profile
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {

      float *in  = (float*)ivoid + (size_t)ch*roi_in->width *j;
      float *out = (float*)ovoid + (size_t)ch*roi_out->width*j;

      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
      {
        for(int i=0; i<3; i++)
          if (d->lut[i][0] >= 0.0f)
          {
            out[i] = (out[i] < 1.0f) ? lerp_lut(d->lut[i], out[i]) : dt_iop_eval_exp(d->unbounded_coeffs[i], out[i]);
          }
      }
    }
  }
  else
  {
    //fprintf(stderr,"Using xform codepath\n");
    const __m128 outofgamutpixel = _mm_set_ps(0.0f, 1.0f, 1.0f, 0.0f);
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(ivoid, ovoid, roi_out)
#endif
    for (int k=0; k<roi_out->height; k++)
    {
      const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
      float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

      if(!gamutcheck)
      {
        cmsDoTransform(d->xform, in, out, roi_out->width);
      } else {
        void *rgb = dt_alloc_align(16, 4*sizeof(float)*roi_out->width);
        cmsDoTransform(d->xform, in, rgb, roi_out->width);
        float *rgbptr = (float *)rgb;
        for (int j=0; j<roi_out->width; j++,rgbptr+=4,out+=4)
        {
          const __m128 pixel = _mm_load_ps(rgbptr);
          const __m128 ingamut = _mm_cmpge_ps(pixel, _mm_setzero_ps());
          const __m128 result = _mm_or_ps(_mm_andnot_ps(ingamut, outofgamutpixel),
                                          _mm_and_ps(ingamut, pixel));
          _mm_stream_ps(out, result);
        }
        dt_free_align(rgb);
      }
    }
    _mm_sfence();
  }

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
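The shuffle/multiply/add chain in the cmatrix path above is a 3x3 matrix-vector product with the matrix stored column-wise in m0, m1, m2. Pulled out as a helper it would look roughly like this (a sketch, not part of darktable):

/* v is (x, y, z, _); c0, c1, c2 hold the matrix columns.
   Result lane i = c0[i]*x + c1[i]*y + c2[i]*z. */
static inline __m128 mat3_mul_vec(__m128 c0, __m128 c1, __m128 c2, __m128 v)
{
  const __m128 x = _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 0, 0, 0));
  const __m128 y = _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 1, 1, 1));
  const __m128 z = _mm_shuffle_ps(v, v, _MM_SHUFFLE(2, 2, 2, 2));
  return _mm_add_ps(_mm_mul_ps(c0, x), _mm_add_ps(_mm_mul_ps(c1, y), _mm_mul_ps(c2, z)));
}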
Code Example #5
File: colorout.c Project: LViatour/darktable
void process_sse2(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid,
                  void *const ovoid, const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out)
{
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->mode == DT_PROFILE_GAMUTCHECK);

  if(!isnan(d->cmatrix[0]))
  {
// fprintf(stderr,"Using cmatrix codepath\n");
// convert to rgb using matrix
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {

      float *in = (float *)ivoid + (size_t)ch * roi_in->width * j;
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;
      const __m128 m0 = _mm_set_ps(0.0f, d->cmatrix[6], d->cmatrix[3], d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f, d->cmatrix[7], d->cmatrix[4], d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f, d->cmatrix[8], d->cmatrix[5], d->cmatrix[2]);

      for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
      {
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        const __m128 t
            = _mm_add_ps(_mm_mul_ps(m0, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(0, 0, 0, 0))),
                         _mm_add_ps(_mm_mul_ps(m1, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(1, 1, 1, 1))),
                                    _mm_mul_ps(m2, _mm_shuffle_ps(xyz, xyz, _MM_SHUFFLE(2, 2, 2, 2)))));

        _mm_stream_ps(out, t);
      }
    }
    _mm_sfence();

    process_fastpath_apply_tonecurves(self, piece, ivoid, ovoid, roi_in, roi_out);
  }
  else
  {
    // fprintf(stderr,"Using xform codepath\n");
    const __m128 outofgamutpixel = _mm_set_ps(0.0f, 1.0f, 1.0f, 0.0f);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
#endif
    for(int k = 0; k < roi_out->height; k++)
    {
      const float *in = ((float *)ivoid) + (size_t)ch * k * roi_out->width;
      float *out = ((float *)ovoid) + (size_t)ch * k * roi_out->width;

      cmsDoTransform(d->xform, in, out, roi_out->width);

      if(gamutcheck)
      {
        for(int j = 0; j < roi_out->width; j++, out += 4)
        {
          const __m128 pixel = _mm_load_ps(out);
          __m128 ingamut = _mm_cmplt_ps(pixel, _mm_set_ps(-FLT_MAX, 0.0f, 0.0f, 0.0f));

          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));
          ingamut = _mm_or_ps(_mm_unpacklo_ps(ingamut, ingamut), _mm_unpackhi_ps(ingamut, ingamut));

          const __m128 result
              = _mm_or_ps(_mm_and_ps(ingamut, outofgamutpixel), _mm_andnot_ps(ingamut, pixel));
          _mm_stream_ps(out, result);
        }
      }
    }
    _mm_sfence();
  }

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
Code Example #6
static __m128 mm_pow_ps(__m128 a, __m128 b) {
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  __m128 log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated, this code uses an order
    //             five polynomial approximation. The coefficients have been
    //             estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //    This is done by masking out the exponent, shifting it into the top
    //    bits of the mantissa, putting eight into the biased exponent (to
    //    compensate for the fact that the exponent has been shifted into the
    //    top of the fractional part), and finally getting rid of the implicit
    //    leading one from the mantissa by subtracting it out.
    static const ALIGN16_BEG int float_exponent_mask[4] ALIGN16_END = {
        0x7F800000, 0x7F800000, 0x7F800000, 0x7F800000};
    static const ALIGN16_BEG int eight_biased_exponent[4] ALIGN16_END = {
        0x43800000, 0x43800000, 0x43800000, 0x43800000};
    static const ALIGN16_BEG int implicit_leading_one[4] ALIGN16_END = {
        0x43BF8000, 0x43BF8000, 0x43BF8000, 0x43BF8000};
    static const int shift_exponent_into_top_mantissa = 8;
    const __m128 two_n = _mm_and_ps(a, *((__m128*)float_exponent_mask));
    const __m128 n_1 = _mm_castsi128_ps(_mm_srli_epi32(
        _mm_castps_si128(two_n), shift_exponent_into_top_mantissa));
    const __m128 n_0 = _mm_or_ps(n_1, *((__m128*)eight_biased_exponent));
    const __m128 n = _mm_sub_ps(n_0, *((__m128*)implicit_leading_one));

    // Compute y.
    static const ALIGN16_BEG int mantissa_mask[4] ALIGN16_END = {
        0x007FFFFF, 0x007FFFFF, 0x007FFFFF, 0x007FFFFF};
    static const ALIGN16_BEG int zero_biased_exponent_is_one[4] ALIGN16_END = {
        0x3F800000, 0x3F800000, 0x3F800000, 0x3F800000};
    const __m128 mantissa = _mm_and_ps(a, *((__m128*)mantissa_mask));
    const __m128 y =
        _mm_or_ps(mantissa, *((__m128*)zero_biased_exponent_is_one));

    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    static const ALIGN16_BEG float ALIGN16_END C5[4] = {
        -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f, -3.4436006e-2f};
    static const ALIGN16_BEG float ALIGN16_END
        C4[4] = {3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f, 3.1821337e-1f};
    static const ALIGN16_BEG float ALIGN16_END
        C3[4] = {-1.2315303f, -1.2315303f, -1.2315303f, -1.2315303f};
    static const ALIGN16_BEG float ALIGN16_END
        C2[4] = {2.5988452f, 2.5988452f, 2.5988452f, 2.5988452f};
    static const ALIGN16_BEG float ALIGN16_END
        C1[4] = {-3.3241990f, -3.3241990f, -3.3241990f, -3.3241990f};
    static const ALIGN16_BEG float ALIGN16_END
        C0[4] = {3.1157899f, 3.1157899f, 3.1157899f, 3.1157899f};
    const __m128 pol5_y_0 = _mm_mul_ps(y, *((__m128*)C5));
    const __m128 pol5_y_1 = _mm_add_ps(pol5_y_0, *((__m128*)C4));
    const __m128 pol5_y_2 = _mm_mul_ps(pol5_y_1, y);
    const __m128 pol5_y_3 = _mm_add_ps(pol5_y_2, *((__m128*)C3));
    const __m128 pol5_y_4 = _mm_mul_ps(pol5_y_3, y);
    const __m128 pol5_y_5 = _mm_add_ps(pol5_y_4, *((__m128*)C2));
    const __m128 pol5_y_6 = _mm_mul_ps(pol5_y_5, y);
    const __m128 pol5_y_7 = _mm_add_ps(pol5_y_6, *((__m128*)C1));
    const __m128 pol5_y_8 = _mm_mul_ps(pol5_y_7, y);
    const __m128 pol5_y = _mm_add_ps(pol5_y_8, *((__m128*)C0));
    const __m128 y_minus_one =
        _mm_sub_ps(y, *((__m128*)zero_biased_exponent_is_one));
    const __m128 log2_y = _mm_mul_ps(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = _mm_add_ps(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = _mm_mul_ps(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated, this code uses an order two
    //         polynomial approximation. The coefficients have been estimated
    //         with the Remez algorithm and the resulting polynomial has a
    //         maximum relative error of 0.17%.

    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    static const ALIGN16_BEG float max_input[4] ALIGN16_END = {129.f, 129.f,
                                                               129.f, 129.f};
    static const ALIGN16_BEG float min_input[4] ALIGN16_END = {
        -126.99999f, -126.99999f, -126.99999f, -126.99999f};
    const __m128 x_min = _mm_min_ps(b_log2_a, *((__m128*)max_input));
    const __m128 x_max = _mm_max_ps(x_min, *((__m128*)min_input));
    // Compute n.
    static const ALIGN16_BEG float half[4] ALIGN16_END = {0.5f, 0.5f,
                                                          0.5f, 0.5f};
    const __m128 x_minus_half = _mm_sub_ps(x_max, *((__m128*)half));
    const __m128i x_minus_half_floor = _mm_cvtps_epi32(x_minus_half);
    // Compute 2^n.
    static const ALIGN16_BEG int float_exponent_bias[4] ALIGN16_END = {
        127, 127, 127, 127};
    static const int float_exponent_shift = 23;
    const __m128i two_n_exponent =
        _mm_add_epi32(x_minus_half_floor, *((__m128i*)float_exponent_bias));
    const __m128 two_n =
        _mm_castsi128_ps(_mm_slli_epi32(two_n_exponent, float_exponent_shift));
    // Compute y.
    const __m128 y = _mm_sub_ps(x_max, _mm_cvtepi32_ps(x_minus_half_floor));
    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    static const ALIGN16_BEG float C2[4] ALIGN16_END = {
        3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f, 3.3718944e-1f};
    static const ALIGN16_BEG float C1[4] ALIGN16_END = {
        6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f, 6.5763628e-1f};
    static const ALIGN16_BEG float C0[4] ALIGN16_END = {1.0017247f, 1.0017247f,
                                                        1.0017247f, 1.0017247f};
    const __m128 exp2_y_0 = _mm_mul_ps(y, *((__m128*)C2));
    const __m128 exp2_y_1 = _mm_add_ps(exp2_y_0, *((__m128*)C1));
    const __m128 exp2_y_2 = _mm_mul_ps(exp2_y_1, y);
    const __m128 exp2_y = _mm_add_ps(exp2_y_2, *((__m128*)C0));

    // Combine parts.
    a_exp_b = _mm_mul_ps(exp2_y, two_n);
  }
  return a_exp_b;
}
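A small hedged usage sketch for mm_pow_ps (valid only for strictly positive bases, since the log2 stage ignores sign and zero):

/* Four independent pow() evaluations at once. */
void mm_pow_ps_demo(void) {
  const __m128 base = _mm_setr_ps(2.0f, 3.0f, 10.0f, 0.5f);
  const __m128 expo = _mm_setr_ps(0.5f, 2.0f, -1.0f, 3.0f);
  float out[4];
  _mm_storeu_ps(out, mm_pow_ps(base, expo));
  /* out ~= {1.414f, 9.0f, 0.1f, 0.125f}, within roughly the stated polynomial error. */
}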
Code Example #7
File: ssemath.cpp Project: blckshrk/IFT6042
/* natural logarithm computed for 4 simultaneous floats;
   returns NaN for x <= 0
*/
__m128 log_ps(__m128 x) {
    typedef __m128 v4sf;
    typedef __m128i v4si;

    v4si emm0;
    v4sf one = constants::ps_1.ps;

    v4sf invalid_mask = _mm_cmple_ps(x, _mm_setzero_ps());

    x = _mm_max_ps(x, constants::min_norm_pos.ps);  // cut off denormalized stuff

    emm0 = _mm_srli_epi32(_mm_castps_si128(x), 23);
    // keep only the fractional part
    x = _mm_and_ps(x, constants::inv_mant_mask.ps);
    x = _mm_or_ps(x,  constants::ps_0p5.ps);

    emm0 = _mm_sub_epi32(emm0, constants::pi32_0x7f.pi);
    v4sf e = _mm_cvtepi32_ps(emm0);

    e = _mm_add_ps(e, one);

    /* part2:
       if( x < SQRTHF ) {
         e -= 1;
         x = x + x - 1.0;
       } else { x = x - 1.0; }
    */
    v4sf mask = _mm_cmplt_ps(x, constants::cephes_SQRTHF.ps);
    v4sf tmp = _mm_and_ps(x, mask);
    x = _mm_sub_ps(x, one);
    e = _mm_sub_ps(e, _mm_and_ps(one, mask));
    x = _mm_add_ps(x, tmp);

    v4sf z = _mm_mul_ps(x,x);

    v4sf y = constants::cephes_log_p0.ps;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p1.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p2.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p3.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p4.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p5.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p6.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p7.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_log_p8.ps);
    y = _mm_mul_ps(y, x);

    y = _mm_mul_ps(y, z);

    tmp = _mm_mul_ps(e, constants::cephes_log_q1.ps);
    y = _mm_add_ps(y, tmp);

    tmp = _mm_mul_ps(z, constants::ps_0p5.ps);
    y = _mm_sub_ps(y, tmp);

    tmp = _mm_mul_ps(e, constants::cephes_log_q2.ps);
    x = _mm_add_ps(x, y);
    x = _mm_add_ps(x, tmp);
    x = _mm_or_ps(x, invalid_mask); // negative arg will be NAN
    return x;
}
Code Example #8
static void ScaleErrorSignalSSE2(AecCore* aec, float ef[2][PART_LEN1]) {
  const __m128 k1e_10f = _mm_set1_ps(1e-10f);
  const __m128 kMu = aec->extended_filter_enabled ? _mm_set1_ps(kExtendedMu)
                                                  : _mm_set1_ps(aec->normal_mu);
  const __m128 kThresh = aec->extended_filter_enabled
                             ? _mm_set1_ps(kExtendedErrorThreshold)
                             : _mm_set1_ps(aec->normal_error_threshold);

  int i;
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    const __m128 xPow = _mm_loadu_ps(&aec->xPow[i]);
    const __m128 ef_re_base = _mm_loadu_ps(&ef[0][i]);
    const __m128 ef_im_base = _mm_loadu_ps(&ef[1][i]);

    const __m128 xPowPlus = _mm_add_ps(xPow, k1e_10f);
    __m128 ef_re = _mm_div_ps(ef_re_base, xPowPlus);
    __m128 ef_im = _mm_div_ps(ef_im_base, xPowPlus);
    const __m128 ef_re2 = _mm_mul_ps(ef_re, ef_re);
    const __m128 ef_im2 = _mm_mul_ps(ef_im, ef_im);
    const __m128 ef_sum2 = _mm_add_ps(ef_re2, ef_im2);
    const __m128 absEf = _mm_sqrt_ps(ef_sum2);
    const __m128 bigger = _mm_cmpgt_ps(absEf, kThresh);
    __m128 absEfPlus = _mm_add_ps(absEf, k1e_10f);
    const __m128 absEfInv = _mm_div_ps(kThresh, absEfPlus);
    __m128 ef_re_if = _mm_mul_ps(ef_re, absEfInv);
    __m128 ef_im_if = _mm_mul_ps(ef_im, absEfInv);
    ef_re_if = _mm_and_ps(bigger, ef_re_if);
    ef_im_if = _mm_and_ps(bigger, ef_im_if);
    ef_re = _mm_andnot_ps(bigger, ef_re);
    ef_im = _mm_andnot_ps(bigger, ef_im);
    ef_re = _mm_or_ps(ef_re, ef_re_if);
    ef_im = _mm_or_ps(ef_im, ef_im_if);
    ef_re = _mm_mul_ps(ef_re, kMu);
    ef_im = _mm_mul_ps(ef_im, kMu);

    _mm_storeu_ps(&ef[0][i], ef_re);
    _mm_storeu_ps(&ef[1][i], ef_im);
  }
  // scalar code for the remaining items.
  {
    const float mu =
        aec->extended_filter_enabled ? kExtendedMu : aec->normal_mu;
    const float error_threshold = aec->extended_filter_enabled
                                      ? kExtendedErrorThreshold
                                      : aec->normal_error_threshold;
    for (; i < (PART_LEN1); i++) {
      float abs_ef;
      ef[0][i] /= (aec->xPow[i] + 1e-10f);
      ef[1][i] /= (aec->xPow[i] + 1e-10f);
      abs_ef = sqrtf(ef[0][i] * ef[0][i] + ef[1][i] * ef[1][i]);

      if (abs_ef > error_threshold) {
        abs_ef = error_threshold / (abs_ef + 1e-10f);
        ef[0][i] *= abs_ef;
        ef[1][i] *= abs_ef;
      }

      // Stepsize factor
      ef[0][i] *= mu;
      ef[1][i] *= mu;
    }
  }
}
Code Example #9
File: SimdSse41Detection.cpp Project: ashh87/Simd
 SIMD_INLINE __m128 ValidSqrt(__m128 value)
 {
     __m128 mask = _mm_cmpgt_ps(value, _mm_set1_ps(0.0f));
     return _mm_sqrt_ps(_mm_or_ps(_mm_and_ps(mask, value), _mm_andnot_ps(mask, _mm_set1_ps(1.0f))));
 }
Code Example #10
//----------------------------------------------------------------
// Transforms the AABB vertices to screen space once every frame
// Also performs a coarse depth pre-test
//----------------------------------------------------------------
PreTestResult TransformedAABBoxAVX::TransformAndPreTestAABBox(__m128 xformedPos[], const __m128 cumulativeMatrix[4], const float *pDepthSummary)
{
	// w ends up being garbage, but it doesn't matter - we ignore it anyway.
	__m128 vCenter = _mm_loadu_ps(&mBBCenter.x);
	__m128 vHalf   = _mm_loadu_ps(&mBBHalf.x);

	__m128 vMin    = _mm_sub_ps(vCenter, vHalf);
	__m128 vMax    = _mm_add_ps(vCenter, vHalf);

	// transforms
	__m128 xRow[2], yRow[2], zRow[2];
	xRow[0] = _mm_shuffle_ps(vMin, vMin, 0x00) * cumulativeMatrix[0];
	xRow[1] = _mm_shuffle_ps(vMax, vMax, 0x00) * cumulativeMatrix[0];
	yRow[0] = _mm_shuffle_ps(vMin, vMin, 0x55) * cumulativeMatrix[1];
	yRow[1] = _mm_shuffle_ps(vMax, vMax, 0x55) * cumulativeMatrix[1];
	zRow[0] = _mm_shuffle_ps(vMin, vMin, 0xaa) * cumulativeMatrix[2];
	zRow[1] = _mm_shuffle_ps(vMax, vMax, 0xaa) * cumulativeMatrix[2];

	__m128 zAllIn = _mm_castsi128_ps(_mm_set1_epi32(~0));
	__m128 screenMin = _mm_set1_ps(FLT_MAX);
	__m128 screenMax = _mm_set1_ps(-FLT_MAX);

	for(UINT i = 0; i < AABB_VERTICES; i++)
	{
		// Transform the vertex
		__m128 vert = cumulativeMatrix[3];
		vert += xRow[sBBxInd[i]];
		vert += yRow[sBByInd[i]];
		vert += zRow[sBBzInd[i]];

		// We have inverted z; z is in front of near plane iff z <= w.
		__m128 vertZ = _mm_shuffle_ps(vert, vert, 0xaa); // vert.zzzz
		__m128 vertW = _mm_shuffle_ps(vert, vert, 0xff); // vert.wwww
		__m128 zIn = _mm_cmple_ps(vertZ, vertW);
		zAllIn = _mm_and_ps(zAllIn, zIn);

		// project
		xformedPos[i] = _mm_div_ps(vert, vertW);
		
	    // update bounds
	    screenMin = _mm_min_ps(screenMin, xformedPos[i]);
	    screenMax = _mm_max_ps(screenMax, xformedPos[i]);
	}

	// if any of the verts are z-clipped, we (conservatively) say the box is in
	if(_mm_movemask_ps(zAllIn) != 0xf)
		return ePT_VISIBLE;

	// Clip against screen bounds
	screenMin = _mm_max_ps(screenMin, _mm_setr_ps(0.0f, 0.0f, 0.0f, -FLT_MAX));
	screenMax = _mm_min_ps(screenMax, _mm_setr_ps((float) (SCREENW - 1), (float) (SCREENH - 1), 1.0f, FLT_MAX));

	// Quick rejection test
	if(_mm_movemask_ps(_mm_cmplt_ps(screenMax, screenMin)))
		return ePT_INVISIBLE;

	// Prepare integer bounds
	__m128 minMaxXY = _mm_shuffle_ps(screenMin, screenMax, 0x44); // minX,minY,maxX,maxY
	__m128i minMaxXYi = _mm_cvtps_epi32(minMaxXY);
	__m128i minMaxXYis = _mm_srai_epi32(minMaxXYi, 3);

	__m128 maxZ = _mm_shuffle_ps(screenMax, screenMax, 0xaa);

	// Traverse all 8x8 blocks covered by 2d screen-space BBox;
	// if we know for sure that this box is behind the geometry we know is there,
	// we can stop.
	int rX0 = minMaxXYis.m128i_i32[0];
	int rY0 = minMaxXYis.m128i_i32[1];
	int rX1 = minMaxXYis.m128i_i32[2];
	int rY1 = minMaxXYis.m128i_i32[3];

	__m128 anyCloser = _mm_setzero_ps();
	for(int by = rY0; by <= rY1; by++)
	{
		const float *srcRow = pDepthSummary + by * (SCREENW/BLOCK_SIZE);

		// If for any 8x8 block, maxZ is not less than (=behind) summarized
		// min Z, box might be visible.
		for(int bx = rX0; bx <= rX1; bx++)
		{
			anyCloser = _mm_or_ps(anyCloser, _mm_cmpnlt_ss(maxZ, _mm_load_ss(&srcRow[bx])));
		}

		if(_mm_movemask_ps(anyCloser))
		{
			return ePT_UNSURE; // okay, box might be in
		}
	}

	// If we get here, we know for sure that the box is fully behind the stuff in the
	// depth buffer.
	return ePT_INVISIBLE;
}
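One environment note (an assumption, since the helpers are not shown here): the * and += operators on __m128 values in this example presumably come from project-provided operator overloads or compiler vector extensions, and .m128i_i32 is MSVC-specific lane access. The transform rows could equally be written with explicit intrinsics:

/* Equivalent of "xRow[0] = _mm_shuffle_ps(vMin, vMin, 0x00) * cumulativeMatrix[0];"
   without relying on an overloaded operator*. */
xRow[0] = _mm_mul_ps(_mm_shuffle_ps(vMin, vMin, 0x00), cumulativeMatrix[0]);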
Code Example #11
File: overexposed.c Project: PolarFox/darktable
void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out)
{
  dt_develop_t *dev = self->dev;

  const int ch = piece->colors;

  // FIXME: turn off the module instead?
  if(!dev->overexposed.enabled || !dev->gui_attached)
  {
    memcpy(ovoid, ivoid, (size_t)roi_out->width*roi_out->height*sizeof(float)*ch);
    return;
  }

  const __m128 upper = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f);
  const __m128 lower = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f);

  const int colorscheme = dev->overexposed.colorscheme;
  const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]);
  const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]);

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(ovoid) schedule(static)
#endif
  for(int k=0; k<roi_out->height; k++)
  {
    const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
    float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

    for (int j=0; j<roi_out->width; j++,in+=4,out+=4)
    {
      const __m128 pixel = _mm_load_ps(in);

      __m128 isoe = _mm_cmpge_ps(pixel, upper);
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));

      __m128 isue = _mm_cmple_ps(pixel, lower);
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));

      __m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel),
                                _mm_and_ps(isoe, upper_color));

      result = _mm_or_ps(_mm_andnot_ps(isue, result),
                         _mm_and_ps(isue, lower_color));

      _mm_stream_ps(out, result);
    }
  }
  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
Code Example #12
static void compute_step_tv_inner_simd(unsigned w, unsigned h, unsigned nchannel, struct aux auxs[nchannel], unsigned x, unsigned y, double *tv) {
        const __m128 minf = _mm_set_ps1(INFINITY);
        const __m128 mzero = _mm_set_ps1(0.);

        __m128 g_xs[3] = {0};
        __m128 g_ys[3] = {0};
        for(unsigned c = 0; c < nchannel; c++) {
                struct aux *aux = &auxs[c];
                __m128 here = _mm_load_ps(p(aux->fdata, x, y, w, h));
                // forward gradient x
                g_xs[c] = _mm_loadu_ps(p(aux->fdata, x+1, y, w, h)) - here;
                // forward gradient y
                g_ys[c] = _mm_loadu_ps(p(aux->fdata, x, y+1, w, h)) - here;
        }
        // norm
        __m128 g_norm = mzero;
        for(unsigned c = 0; c < nchannel; c++) {
                g_norm += SQR(g_xs[c]);
                g_norm += SQR(g_ys[c]);
        }
        g_norm = _mm_sqrt_ps(g_norm);

        float alpha = 1./sqrtf(nchannel);
        *tv += alpha * g_norm[0];
        *tv += alpha * g_norm[1];
        *tv += alpha * g_norm[2];
        *tv += alpha * g_norm[3];

        __m128 malpha = _mm_set_ps1(alpha);

        // set zeroes to infinity
        g_norm = _mm_or_ps(g_norm, _mm_and_ps(minf, _mm_cmpeq_ps(g_norm, mzero)));

        // compute derivatives
        for(unsigned c = 0; c < nchannel; c++) {
                __m128 g_x = g_xs[c];
                __m128 g_y = g_ys[c];
                struct aux *aux = &auxs[c];

                // N.B. for numerical stability and same exact result as the c version,
                // we must calculate the objective gradient at x+1 before x
                {
                        float *pobj_r = p(aux->obj_gradient, x+1, y, w, h);
                        __m128 obj_r = _mm_loadu_ps(pobj_r);
                        obj_r += malpha * g_x / g_norm;
                        _mm_storeu_ps(pobj_r, obj_r);
                }

                {
                        float *pobj = p(aux->obj_gradient, x, y, w, h);
                        __m128 obj = _mm_load_ps(pobj);
                        obj += malpha * -(g_x + g_y) / g_norm;
                        _mm_store_ps(pobj, obj);
                }

                {
                        float *pobj_b = p(aux->obj_gradient, x, y+1, w, h);
                        __m128 obj_b = _mm_load_ps(pobj_b);
                        obj_b += malpha * g_y / g_norm;
                        _mm_store_ps(pobj_b, obj_b);
                }
        }
        // store
        for(unsigned c = 0; c < nchannel; c++) {
                struct aux *aux = &auxs[c];
                _mm_store_ps(p(aux->temp[0], x, y, w, h), g_xs[c]);
                _mm_store_ps(p(aux->temp[1], x, y, w, h), g_ys[c]);
        }
}
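One assumption worth spelling out for the routine above (and the next example): the infix +, -, *, / and the [] lane indexing on __m128 values are GCC/Clang vector extensions, and SQR is presumably a squaring macro. On a compiler without those extensions, the norm accumulation would use explicit intrinsics:

/* Equivalent of "g_norm += SQR(g_xs[c]); g_norm += SQR(g_ys[c]);"
   written with plain SSE intrinsics (assuming SQR(v) expands to (v)*(v)). */
g_norm = _mm_add_ps(g_norm, _mm_mul_ps(g_xs[c], g_xs[c]));
g_norm = _mm_add_ps(g_norm, _mm_mul_ps(g_ys[c], g_ys[c]));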
Code Example #13
static void compute_step_tv2_inner_simd(unsigned w, unsigned h, unsigned nchannel, struct aux auxs[nchannel], float alpha, unsigned x, unsigned y, double *tv2) {
        __m128 g_xxs[3] = {0};
        __m128 g_xy_syms[3] = {0};
        __m128 g_yys[3] = {0};

        const __m128 mtwo = _mm_set_ps1(2.);
        const __m128 minf = _mm_set_ps1(INFINITY);
        const __m128 mzero = _mm_set_ps1(0.);

        __m128 malpha = _mm_set_ps1(alpha * 1./sqrtf(nchannel));

        for(unsigned c = 0; c < nchannel; c++) {
                struct aux *aux = &auxs[c];

                __m128 g_x = _mm_load_ps(p(aux->temp[0], x, y, w, h));
                __m128 g_y = _mm_load_ps(p(aux->temp[1], x, y, w, h));

                // backward x derivative of g_x
                g_xxs[c] = g_x - _mm_loadu_ps(p(aux->temp[0], x-1, y, w, h));
                // backward x derivative of g_y
                __m128 g_yx = g_y - _mm_loadu_ps(p(aux->temp[1], x-1, y, w, h));
                // backward y derivative of g_x
                __m128 g_xy = g_x - _mm_load_ps(p(aux->temp[0], x, y-1, w, h));
                // backward y derivative of g_y
                g_yys[c] = g_y - _mm_load_ps(p(aux->temp[1], x, y-1, w, h));
                // symmetrize
                g_xy_syms[c] = (g_xy + g_yx) / mtwo;
        }

        // norm
        __m128 g2_norm = mzero;
        for(unsigned c = 0; c < nchannel; c++) {
                g2_norm += SQR(g_xxs[c]) + mtwo * SQR(g_xy_syms[c]) + SQR(g_yys[c]);
        }
        g2_norm = _mm_sqrt_ps(g2_norm);

        __m128 alpha_norm = malpha * g2_norm;
        *tv2 += alpha_norm[0];
        *tv2 += alpha_norm[1];
        *tv2 += alpha_norm[2];
        *tv2 += alpha_norm[3];

        // set zeroes to infinity
        g2_norm = _mm_or_ps(g2_norm, _mm_and_ps(minf, _mm_cmpeq_ps(g2_norm, mzero)));

        for(unsigned c = 0; c < nchannel; c++) {
                __m128 g_xx = g_xxs[c];
                __m128 g_yy = g_yys[c];
                __m128 g_xy_sym = g_xy_syms[c];
                struct aux *aux = &auxs[c];

                // N.B. for same exact result as the c version,
                // we must calculate the objective gradient from right to left
                {
                        float *pobj_ur = p(aux->obj_gradient, x+1, y-1, w, h);
                        __m128 obj_ur = _mm_loadu_ps(pobj_ur);
                        obj_ur += malpha * ((-g_xy_sym) / g2_norm);
                        _mm_storeu_ps(pobj_ur, obj_ur);
                }

                {
                        float *pobj_r = p(aux->obj_gradient, x+1, y, w, h);
                        __m128 obj_r = _mm_loadu_ps(pobj_r);
                        obj_r += malpha * ((g_xy_sym + g_xx) / g2_norm);
                        _mm_storeu_ps(pobj_r, obj_r);
                }

                {
                        float *pobj_u = p(aux->obj_gradient, x, y-1, w, h);
                        __m128 obj_u = _mm_load_ps(pobj_u);
                        obj_u += malpha * ((g_yy + g_xy_sym) / g2_norm);
                        _mm_store_ps(pobj_u, obj_u);
                }

                {
                        float *pobj = p(aux->obj_gradient, x, y, w, h);
                        __m128 obj = _mm_load_ps(pobj);
                        obj += malpha * (-(mtwo * g_xx + mtwo * g_xy_sym + mtwo * g_yy) / g2_norm);
                        _mm_store_ps(pobj, obj);
                }

                {
                        float *pobj_b = p(aux->obj_gradient, x, y+1, w, h);
                        __m128 obj_b = _mm_load_ps(pobj_b);
                        obj_b += malpha * ((g_yy + g_xy_sym) / g2_norm);
                        _mm_store_ps(pobj_b, obj_b);
                }

                {
                        float *pobj_l = p(aux->obj_gradient, x-1, y, w, h);
                        __m128 obj_l = _mm_loadu_ps(pobj_l);
                        obj_l += malpha * ((g_xy_sym + g_xx) / g2_norm);
                        _mm_storeu_ps(pobj_l, obj_l);
                }

                {
                        float *pobj_lb = p(aux->obj_gradient, x-1, y+1, w, h);
                        __m128 obj_lb = _mm_loadu_ps(pobj_lb);
                        obj_lb += malpha * ((-g_xy_sym) / g2_norm);
                        _mm_storeu_ps(pobj_lb, obj_lb);
                }
        }
}
Code Example #14
mlib_status
F_NAME(
    mlib_f32 *dst,
    const mlib_f32 *src,
    mlib_s32 dlb,
    mlib_s32 slb,
    mlib_s32 wid,
    mlib_s32 hgt)
{
    mlib_u8 *buff, *buff1;
    mlib_u8 *sl, *sp0, *sp1, *sp2, *sp3, *dl;
    __m128 *dp0, *dp1;
    __m128 aa, bb, c0, c1, c2, cc, d0, d1, d2, dd, r0, r1, t0, t1;
    __m128 e_mask, mask;
    mlib_s32 i, j, wid16, tail;

    wid = (wid - 2) * SSIZE;
    wid16 = (wid + 15) & ~15;
    buff = __mlib_malloc(2 * wid16);
    buff1 = buff + wid16;

    sl = (mlib_u8 *)src;
    /* dst ptrs skip top j and left col */
    dl = (mlib_u8 *)dst + dlb + SSIZE;

    tail = wid & 15;

    ((mlib_d64 *)&e_mask)[0] =
        ((mlib_d64 *)((__m128 *) mlib_mask128i_arr + tail))[0];
    ((mlib_d64 *)&e_mask)[1] =
        ((mlib_d64 *)((__m128 *) mlib_mask128i_arr + tail))[1];

    sp0 = buff;
    sp1 = buff1;
    sp2 = sl;
    sp3 = sp2 + slb;
    sl += 2 * slb;


    for (i = 0; i < wid; i += 16) {
        c0 = _mm_loadu_ps((mlib_f32 *)sp2);
        c1 = _mm_loadu_ps((mlib_f32 *)(sp2 + SSIZE));
        c2 = _mm_loadu_ps((mlib_f32 *)(sp2 + 2 * SSIZE));
        d0 = _mm_loadu_ps((mlib_f32 *)sp3);
        d1 = _mm_loadu_ps((mlib_f32 *)(sp3 + SSIZE));
        d2 = _mm_loadu_ps((mlib_f32 *)(sp3 + 2 * SSIZE));

        cc = C_COMP(c0, c1);
        dd = C_COMP(d0, d1);
        cc = C_COMP(cc, c2);
        dd = C_COMP(dd, d2);


        _mm_storeu_ps((mlib_f32 *)sp0, cc);
        _mm_storeu_ps((mlib_f32 *)sp1, dd);

        sp0 += 16;
        sp1 += 16;
        sp2 += 16;
        sp3 += 16;
    }

    for (j = 0; j <= (hgt - 2 - 2); j += 2) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff;
        sp1 = buff1;
        sp2 = sl;
        sp3 = sp2 + slb;

        /*
         *    line0:     aa
         *    line1:     bb
         *    line2:  c0 c1 c2
         *    line3:  d0 d1 d2
         */

        for (i = 0; i <= wid - 16; i += 16) {

            aa = _mm_loadu_ps((mlib_f32 *)sp0);
            bb = _mm_loadu_ps((mlib_f32 *)sp1);
            c0 = _mm_loadu_ps((mlib_f32 *)sp2);
            c1 = _mm_loadu_ps((mlib_f32 *)(sp2 + SSIZE));
            c2 = _mm_loadu_ps((mlib_f32 *)(sp2 + 2 * SSIZE));
            d0 = _mm_loadu_ps((mlib_f32 *)sp3);
            d1 = _mm_loadu_ps((mlib_f32 *)(sp3 + SSIZE));
            d2 = _mm_loadu_ps((mlib_f32 *)(sp3 + 2 * SSIZE));

            cc = C_COMP(c0, c1);
            dd = C_COMP(d0, d1);
            cc = C_COMP(cc, c2);
            dd = C_COMP(dd, d2);

            bb = C_COMP(bb, cc);
            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, dd);


            _mm_storeu_ps((mlib_f32 *)sp0, cc);
            _mm_storeu_ps((mlib_f32 *)sp1, dd);

            _mm_storeu_ps((mlib_f32 *)dp0, r0);
            dp0++;
            _mm_storeu_ps((mlib_f32 *)dp1, r1);
            dp1++;

            sp0 += 16;
            sp1 += 16;
            sp2 += 16;
            sp3 += 16;
        }

        if (tail) {
            aa = _mm_loadu_ps((mlib_f32 *)sp0);
            bb = _mm_loadu_ps((mlib_f32 *)sp1);
            c0 = _mm_loadu_ps((mlib_f32 *)sp2);
            c1 = _mm_loadu_ps((mlib_f32 *)(sp2 + SSIZE));
            c2 = _mm_loadu_ps((mlib_f32 *)(sp2 + 2 * SSIZE));
            d0 = _mm_loadu_ps((mlib_f32 *)sp3);
            d1 = _mm_loadu_ps((mlib_f32 *)(sp3 + SSIZE));
            d2 = _mm_loadu_ps((mlib_f32 *)(sp3 + 2 * SSIZE));

            cc = C_COMP(c0, c1);
            dd = C_COMP(d0, d1);
            cc = C_COMP(cc, c2);
            dd = C_COMP(dd, d2);

            bb = C_COMP(bb, cc);
            r0 = C_COMP(aa, bb);
            r1 = C_COMP(bb, dd);

            _mm_storeu_ps((mlib_f32 *)sp0, cc);
            _mm_storeu_ps((mlib_f32 *)sp1, dd);

            t0 = _mm_loadu_ps((mlib_f32 *)dp0);
            t1 = _mm_loadu_ps((mlib_f32 *)dp1);
            t0 =
                _mm_or_ps(_mm_and_ps(e_mask, r0),
                          _mm_andnot_ps(e_mask, t0));
            t1 =
                _mm_or_ps(_mm_and_ps(e_mask, r1),
                          _mm_andnot_ps(e_mask, t1));
            _mm_storeu_ps((mlib_f32 *)dp0, t0);
            _mm_storeu_ps((mlib_f32 *)dp1, t1);
        }

        sl += 2 * slb;
        dl += 2 * dlb;
    }

    /* last line */

    if (j == (hgt - 3)) {
        dp0 = (void *)dl;
        dp1 = (void *)(dl + dlb);
        sp0 = buff;
        sp1 = buff1;
        sp2 = sl;

        for (i = 0; i <= wid - 16; i += 16) {
            aa = _mm_loadu_ps((mlib_f32 *)sp0);
            bb = _mm_loadu_ps((mlib_f32 *)sp1);
            c0 = _mm_loadu_ps((mlib_f32 *)sp2);
            c1 = _mm_loadu_ps((mlib_f32 *)(sp2 + SSIZE));
            c2 = _mm_loadu_ps((mlib_f32 *)(sp2 + 2 * SSIZE));

            cc = C_COMP(c0, c1);
            cc = C_COMP(cc, c2);

            r0 = C_COMP(aa, bb);
            r0 = C_COMP(r0, cc);

            _mm_storeu_ps((mlib_f32 *)dp0, r0);
            dp0++;

            sp0 += 16;
            sp1 += 16;
            sp2 += 16;
        }

        if (tail) {
            aa = _mm_loadu_ps((mlib_f32 *)sp0);
            bb = _mm_loadu_ps((mlib_f32 *)sp1);
            c0 = _mm_loadu_ps((mlib_f32 *)sp2);
            c1 = _mm_loadu_ps((mlib_f32 *)(sp2 + SSIZE));
            c2 = _mm_loadu_ps((mlib_f32 *)(sp2 + 2 * SSIZE));

            c1 = C_COMP(c0, c1);
            cc = C_COMP(c1, c2);

            r0 = C_COMP(aa, bb);
            r0 = C_COMP(r0, cc);

            t0 = _mm_loadu_ps((mlib_f32 *)dp0);
            t0 =
                _mm_or_ps(_mm_and_ps(e_mask, r0),
                          _mm_andnot_ps(e_mask, t0));
            _mm_storeu_ps((mlib_f32 *)dp0, t0);
        }
    }

    __mlib_free(buff);

    return (MLIB_SUCCESS);
}
Code Example #15
BOOST_FORCEINLINE __m128  __vectorcall operator | ( __m128  const left, __m128  const right ) {
    return _mm_or_ps    ( left, right );
}
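A hedged usage note: with the overload above in scope, mask combinations read like scalar expressions; only operator| is assumed here, everything else stays as plain intrinsics (x, a and b stand for arbitrary __m128 values).

// Same select idiom as elsewhere on this page, with the OR written infix.
__m128 mask   = _mm_cmpge_ps(x, _mm_setzero_ps());
__m128 result = _mm_and_ps(mask, a) | _mm_andnot_ps(mask, b);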
Code Example #16
void
transform8_srgb_avx(ThreadInfo* t)
{
	RS_IMAGE16 *input = t->input;
	GdkPixbuf *output = t->output;
	RS_MATRIX3 *matrix = t->matrix;
	gint x,y;
	gint width;

	float mat_ps[4*4*3] __attribute__ ((aligned (16)));
	for (x = 0; x < 4; x++ ) {
		mat_ps[x] = matrix->coeff[0][0];
		mat_ps[x+4] = matrix->coeff[0][1];
		mat_ps[x+8] = matrix->coeff[0][2];
		mat_ps[12+x] = matrix->coeff[1][0];
		mat_ps[12+x+4] = matrix->coeff[1][1];
		mat_ps[12+x+8] = matrix->coeff[1][2];
		mat_ps[24+x] = matrix->coeff[2][0];
		mat_ps[24+x+4] = matrix->coeff[2][1];
		mat_ps[24+x+8] = matrix->coeff[2][2];
	}
	
	int start_x = t->start_x;
	/* Always have aligned input and output addresses */
	if (start_x & 3)
		start_x = ((start_x) / 4) * 4;
	
	int complete_w = t->end_x - start_x;
	/* If width is not multiple of 4, check if we can extend it a bit */
	if (complete_w & 3)
	{
		if ((t->end_x+4) < input->w)
			complete_w = (((complete_w + 3) / 4) * 4);
	}
	
	for(y=t->start_y ; y<t->end_y ; y++)
	{
		gushort *i = GET_PIXEL(input, start_x, y);
		guchar *o = GET_PIXBUF_PIXEL(output, start_x, y);
		gboolean aligned_write = !((guintptr)(o)&0xf);

		width = complete_w >> 2;

		while(width--)
		{
			/* Load and convert to float */
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_load_si128((__m128i*)i); // Load two pixels
			__m128i in2 = _mm_load_si128((__m128i*)i+1); // Load two pixels
			_mm_prefetch(i + 64, _MM_HINT_NTA);
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128i p2 =_mm_unpackhi_epi16(in, zero);
			__m128i p3 =_mm_unpacklo_epi16(in2, zero);
			__m128i p4 =_mm_unpackhi_epi16(in2, zero);
			__m128 p1f  = _mm_cvtepi32_ps(p1);
			__m128 p2f  = _mm_cvtepi32_ps(p2);
			__m128 p3f  = _mm_cvtepi32_ps(p3);
			__m128 p4f  = _mm_cvtepi32_ps(p4);
			
			/* Convert to planar */
			__m128 g1g0r1r0 = _mm_unpacklo_ps(p1f, p2f);
			__m128 b1b0 = _mm_unpackhi_ps(p1f, p2f);
			__m128 g3g2r3r2 = _mm_unpacklo_ps(p3f, p4f);
			__m128 b3b2 = _mm_unpackhi_ps(p3f, p4f);
			__m128 r = _mm_movelh_ps(g1g0r1r0, g3g2r3r2);
			__m128 g = _mm_movehl_ps(g3g2r3r2, g1g0r1r0);
			__m128 b = _mm_movelh_ps(b1b0, b3b2);

			/* Apply matrix to convert to sRGB */
			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			/* Normalize to 0->1 and clamp */
			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r2)));
			g = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, g2)));
			b = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, b2)));

			/* Apply Gamma */
			/* Calculate values to be used if larger than junction point */
			__m128 mul_over = _mm_load_ps(_srb_mul_over);
			__m128 sub_over = _mm_load_ps(_srb_sub_over);
			__m128 pow_over = _mm_load_ps(_srb_pow_over);
			__m128 r_gam = _mm_sub_ps(_mm_mul_ps( mul_over, _mm_fastpow_ps(r, pow_over)), sub_over);
			__m128 g_gam = _mm_sub_ps(_mm_mul_ps( mul_over, _mm_fastpow_ps(g, pow_over)), sub_over);
			__m128 b_gam = _mm_sub_ps(_mm_mul_ps( mul_over, _mm_fastpow_ps(b, pow_over)), sub_over);

			/* Create mask for values smaller than junction point */
			__m128 junction = _mm_load_ps(_junction_ps);
			__m128 mask_r = _mm_cmplt_ps(r, junction);
			__m128 mask_g = _mm_cmplt_ps(g, junction);
			__m128 mask_b = _mm_cmplt_ps(b, junction);

			/* Calculate value to be used if under junction */
			__m128 mul_under = _mm_load_ps(_srb_mul_under);
			__m128 r_mul = _mm_and_ps(mask_r, _mm_mul_ps(mul_under, r));
			__m128 g_mul = _mm_and_ps(mask_g, _mm_mul_ps(mul_under, g));
			__m128 b_mul = _mm_and_ps(mask_b, _mm_mul_ps(mul_under, b));

			/* Select the value to be used based on the junction mask and scale to 8 bit */
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_or_ps(r_mul, _mm_andnot_ps(mask_r, r_gam)));
			g = _mm_mul_ps(upscale, _mm_or_ps(g_mul, _mm_andnot_ps(mask_g, g_gam)));
			b = _mm_mul_ps(upscale, _mm_or_ps(b_mul, _mm_andnot_ps(mask_b, b_gam)));
			
			/* Convert to 8 bit unsigned  and interleave*/
			__m128i r_i = _mm_cvtps_epi32(r);
			__m128i g_i = _mm_cvtps_epi32(g);
			__m128i b_i = _mm_cvtps_epi32(b);
			
			r_i = _mm_packs_epi32(r_i, r_i);
			g_i = _mm_packs_epi32(g_i, g_i);
			b_i = _mm_packs_epi32(b_i, b_i);

			/* Set alpha value to 255 and store */
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			__m128i rg_i = _mm_unpacklo_epi16(r_i, g_i);
			__m128i bb_i = _mm_unpacklo_epi16(b_i, b_i);
			p1 = _mm_unpacklo_epi32(rg_i, bb_i);
			p2 = _mm_unpackhi_epi32(rg_i, bb_i);
	
			p1 = _mm_or_si128(alpha_mask, _mm_packus_epi16(p1, p2));

			if (aligned_write)
				_mm_store_si128((__m128i*)o, p1);
			else
				_mm_storeu_si128((__m128i*)o, p1);

			i += 16;
			o += 16;
		}

		/* Process remaining pixels */
		width = complete_w & 3;

		while(width--)
		{
			__m128i zero = _mm_setzero_si128();
			__m128i in = _mm_loadl_epi64((__m128i*)i); // Load one pixel
			__m128i p1 =_mm_unpacklo_epi16(in, zero);
			__m128 p1f  = _mm_cvtepi32_ps(p1);

			/* Splat r,g,b */
			__m128 r =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(0,0,0,0));
			__m128 g =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(1,1,1,1));
			__m128 b =  _mm_shuffle_ps(p1f, p1f, _MM_SHUFFLE(2,2,2,2));

			__m128 r2 = sse_matrix3_mul(mat_ps, r, g, b);
			__m128 g2 = sse_matrix3_mul(&mat_ps[12], r, g, b);
			__m128 b2 = sse_matrix3_mul(&mat_ps[24], r, g, b);

			r = _mm_unpacklo_ps(r2, g2);	// RR GG RR GG
			r = _mm_movelh_ps(r, b2);		// RR GG BB BB

			__m128 normalize = _mm_load_ps(_normalize);
			__m128 max_val = _mm_load_ps(_ones_ps);
			__m128 min_val = _mm_setzero_ps();
			r = _mm_min_ps(max_val, _mm_max_ps(min_val, _mm_mul_ps(normalize, r)));
			__m128 mul_over = _mm_load_ps(_srb_mul_over);
			__m128 sub_over = _mm_load_ps(_srb_sub_over);
			__m128 pow_over = _mm_load_ps(_srb_pow_over);
			__m128 r_gam = _mm_sub_ps(_mm_mul_ps( mul_over, _mm_fastpow_ps(r, pow_over)), sub_over);
			__m128 junction = _mm_load_ps(_junction_ps);
			__m128 mask_r = _mm_cmplt_ps(r, junction);
			__m128 mul_under = _mm_load_ps(_srb_mul_under);
			__m128 r_mul = _mm_and_ps(mask_r, _mm_mul_ps(mul_under, r));
			__m128 upscale = _mm_load_ps(_8bit);
			r = _mm_mul_ps(upscale, _mm_or_ps(r_mul, _mm_andnot_ps(mask_r, r_gam)));
			
			/* Convert to 8 bit unsigned */
			zero = _mm_setzero_si128();
			__m128i r_i = _mm_cvtps_epi32(r);
			/* To 16 bit signed */
			r_i = _mm_packs_epi32(r_i, zero);
			/* To 8 bit unsigned - set alpha channel*/
			__m128i alpha_mask = _mm_load_si128((__m128i*)_alpha_mask);
			r_i = _mm_or_si128(alpha_mask, _mm_packus_epi16(r_i, zero));
			*(int*)o = _mm_cvtsi128_si32(r_i);
			i+=4;
			o+=4;
		}
	}
}
Code Example #17
File: test_sse1.cpp Project: AVert/emscripten
int main()
{
	float *arr = get_arr(); // [4, 3, 2, 1]
	float *uarr = get_uarr(); // [5, 4, 3, 2]
	float *arr2 = get_arr2(); // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0); // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0); // arr must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("",_mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif
	// SSE1 Load instructions:	
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times. 
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signal a FP exception only if one of the input operands is a SNaN, whereas the comi versions signal a FP
	// exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f , -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f , 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channel unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max of int16s.
	/*M64*/aeq64(   _m_pmaxsw(m1, m2), 0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2), 0xFEFFBA9876F0377FULL); // 8-way max of uint8s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2), 0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min of int16s.
	/*M64*/aeq64(   _m_pminsw(m1, m2), 0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2), 0xDC800110543210ULL); // 8-way min of uint8s.
	/*M64*/aeq64(  _m_pminub(m1, m2), 0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert a int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(       _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f);
	aeq(_mm_unpackhi_ps(a, b), 1.f , 8.f, 2.f, 6.f);
	aeq(_mm_unpacklo_ps(a, b), 3.f , 4.f, 4.f, 2.f);

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}
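The _MM_TRANSPOSE4_PS test near the end treats the macro as a black box. One common way to write an equivalent 4x4 float transpose by hand uses the unpack/move intrinsics exercised earlier in this file; this is a sketch of the technique, not necessarily the exact macro expansion shipped in xmmintrin.h:

#include <xmmintrin.h>

/* Transpose four rows in place (lane comments are in memory order, low to high). */
static inline void transpose4x4_ps(__m128 *r0, __m128 *r1, __m128 *r2, __m128 *r3)
{
	__m128 t0 = _mm_unpacklo_ps(*r0, *r1); /* r00 r10 r01 r11 */
	__m128 t1 = _mm_unpacklo_ps(*r2, *r3); /* r20 r30 r21 r31 */
	__m128 t2 = _mm_unpackhi_ps(*r0, *r1); /* r02 r12 r03 r13 */
	__m128 t3 = _mm_unpackhi_ps(*r2, *r3); /* r22 r32 r23 r33 */
	*r0 = _mm_movelh_ps(t0, t1);           /* column 0 */
	*r1 = _mm_movehl_ps(t1, t0);           /* column 1 */
	*r2 = _mm_movelh_ps(t2, t3);           /* column 2 */
	*r3 = _mm_movehl_ps(t3, t2);           /* column 3 */
}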
コード例 #18
0
ファイル: fwdback.c プロジェクト: TuftsBCB/SMURFBuild
static int
forward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *opt_sc)
{
  register __m128 mpv, dpv, ipv;   /* previous row values                                       */
  register __m128 sv;		   /* temp storage of 1 curr row value in progress              */
  register __m128 dcv;		   /* delayed storage of D(i,q+1)                               */
  register __m128 xEv;		   /* E state: keeps max for Mk->E as we go                     */
  register __m128 xBv;		   /* B state: splatted vector of B[i-1] for B->Mk calculations */
  __m128   zerov;		   /* splatted 0.0's in a vector                                */
  float    xN, xE, xB, xC, xJ;	   /* special states' scores                                    */
  int i;			   /* counter over sequence positions 1..L                      */
  int q;			   /* counter over quads 0..nq-1                                */
  int j;			   /* counter over DD iterations (4 is full serialization)      */
  int Q       = p7O_NQF(om->M);	   /* segment length: # of vectors                              */
  __m128 *dpc = ox->dpf[0];        /* current row, for use in {MDI}MO(dpp,q) access macro       */
  __m128 *dpp;                     /* previous row, for use in {MDI}MO(dpp,q) access macro      */
  __m128 *rp;			   /* will point at om->rfv[x] for residue x[i]                 */
  __m128 *tp;			   /* will point into (and step thru) om->tfv                   */

  /* Initialization. */
  ox->M  = om->M;
  ox->L  = L;
  ox->has_own_scales = TRUE; 	/* all forward matrices control their own scalefactors */
  zerov  = _mm_setzero_ps();
  for (q = 0; q < Q; q++)
    MMO(dpc,q) = IMO(dpc,q) = DMO(dpc,q) = zerov;
  xE    = ox->xmx[p7X_E] = 0.;
  xN    = ox->xmx[p7X_N] = 1.;
  xJ    = ox->xmx[p7X_J] = 0.;
  xB    = ox->xmx[p7X_B] = om->xf[p7O_N][p7O_MOVE];
  xC    = ox->xmx[p7X_C] = 0.;

  ox->xmx[p7X_SCALE] = 1.0;
  ox->totscale       = 0.0;

#if p7_DEBUGGING
  if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, 0, 9, 5, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=0, width=9, precision=5 */
#endif

  for (i = 1; i <= L; i++)
    {
      dpp   = dpc;                      
      dpc   = ox->dpf[do_full * i];     /* avoid conditional, use do_full as kronecker delta */
      rp    = om->rfv[dsq[i]];
      tp    = om->tfv;
      dcv   = _mm_setzero_ps();
      xEv   = _mm_setzero_ps();
      xBv   = _mm_set1_ps(xB);

      /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12.  Shift zeros on. */
      mpv   = esl_sse_rightshift_ps(MMO(dpp,Q-1), zerov);
      dpv   = esl_sse_rightshift_ps(DMO(dpp,Q-1), zerov);
      ipv   = esl_sse_rightshift_ps(IMO(dpp,Q-1), zerov);
      
      for (q = 0; q < Q; q++)
	{
	  /* Calculate new MMO(i,q); don't store it yet, hold it in sv. */
	  sv   =                _mm_mul_ps(xBv, *tp);  tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(mpv, *tp)); tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++;
	  sv   = _mm_add_ps(sv, _mm_mul_ps(dpv, *tp)); tp++;
	  sv   = _mm_mul_ps(sv, *rp);                  rp++;
	  xEv  = _mm_add_ps(xEv, sv);
	  
	  /* Load {MDI}(i-1,q) into mpv, dpv, ipv;
	   * {MDI}MX(q) is then the current, not the prev row
	   */
	  mpv = MMO(dpp,q);
	  dpv = DMO(dpp,q);
	  ipv = IMO(dpp,q);

	  /* Do the delayed stores of {MD}(i,q) now that memory is usable */
	  MMO(dpc,q) = sv;
	  DMO(dpc,q) = dcv;

	  /* Calculate the next D(i,q+1) partially: M->D only;
           * delay storage, holding it in dcv
	   */
	  dcv   = _mm_mul_ps(sv, *tp); tp++;

	  /* Calculate and store I(i,q); assumes odds ratio for emission is 1.0 */
	  sv         =                _mm_mul_ps(mpv, *tp);  tp++;
	  IMO(dpc,q) = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++;
	}	  

      /* Now the DD paths. We would rather not serialize them but 
       * in an accurate Forward calculation, we have few options.
       */
      /* dcv has carried through from end of q loop above; store it 
       * in first pass, we add M->D and D->D path into DMX
       */
      /* We're almost certainly obligated to do at least one complete 
       * DD path to be sure: 
       */
      dcv        = esl_sse_rightshift_ps(dcv, zerov);
      DMO(dpc,0) = zerov;
      tp         = om->tfv + 7*Q;	/* set tp to start of the DD's */
      for (q = 0; q < Q; q++) 
	{
	  DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q));	
	  dcv        = _mm_mul_ps(DMO(dpc,q), *tp); tp++; /* extend DMO(q), so we include M->D and D->D paths */
	}

      /* Now, on small models, it seems best (empirically) to just go
       * ahead and serialize. on large models, we can do a bit better,
       * by testing for when dcv (DD path) accrued to DMO(q) is below
       * machine epsilon for all q, in which case we know DMO(q) are all
       * at their final values. The tradeoff point is (empirically) somewhere around M=100,
       * at least on my desktop. We don't worry about the conditional here;
       * it's outside any inner loops.
       */
      if (om->M < 100)
	{			/* Fully serialized version */
	  for (j = 1; j < 4; j++)
	    {
	      dcv = esl_sse_rightshift_ps(dcv, zerov);
	      tp  = om->tfv + 7*Q;	/* set tp to start of the DD's */
	      for (q = 0; q < Q; q++) 
		{ /* note, extend dcv, not DMO(q); only adding DD paths now */
		  DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q));	
		  dcv        = _mm_mul_ps(dcv, *tp);   tp++; 
		}	    
	    }
	} 
      else
	{			/* Slightly parallelized version, but which incurs some overhead */
	  for (j = 1; j < 4; j++)
	    {
	      register __m128 cv;	/* keeps track of whether any DD's change DMO(q) */

	      dcv = esl_sse_rightshift_ps(dcv, zerov);
	      tp  = om->tfv + 7*Q;	/* set tp to start of the DD's */
	      cv  = zerov;
	      for (q = 0; q < Q; q++) 
		{ /* using cmpgt below tests if DD changed any DMO(q) *without* conditional branch */
		  sv         = _mm_add_ps(dcv, DMO(dpc,q));	
		  cv         = _mm_or_ps(cv, _mm_cmpgt_ps(sv, DMO(dpc,q))); 
		  DMO(dpc,q) = sv;	                                    /* store new DMO(q) */
		  dcv        = _mm_mul_ps(dcv, *tp);   tp++;            /* note, extend dcv, not DMO(q) */
		}	    
	      if (! _mm_movemask_ps(cv)) break; /* DD's didn't change any DMO(q)? Then done, break out. */
	    }
	}

      /* Add D's to xEv */
      for (q = 0; q < Q; q++) xEv = _mm_add_ps(DMO(dpc,q), xEv);

      /* Finally the "special" states, which start from Mk->E (->C, ->J->B) */
      /* The following incantation is a horizontal sum of xEv's elements  */
      /* These must follow DD calculations, because D's contribute to E in Forward
       * (as opposed to Viterbi)
       */
      xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(0, 3, 2, 1)));
      xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(1, 0, 3, 2)));
      _mm_store_ss(&xE, xEv);

      xN =  xN * om->xf[p7O_N][p7O_LOOP];
      xC = (xC * om->xf[p7O_C][p7O_LOOP]) +  (xE * om->xf[p7O_E][p7O_MOVE]);
      xJ = (xJ * om->xf[p7O_J][p7O_LOOP]) +  (xE * om->xf[p7O_E][p7O_LOOP]);
      xB = (xJ * om->xf[p7O_J][p7O_MOVE]) +  (xN * om->xf[p7O_N][p7O_MOVE]);
      /* and now xB will carry over into next i, and xC carries over after i=L */

      /* Sparse rescaling. xE above threshold? trigger a rescaling event.            */
      if (xE > 1.0e4)	/* that's a little less than e^10, ~10% of our dynamic range */
	{
	  xN  = xN / xE;
	  xC  = xC / xE;
	  xJ  = xJ / xE;
	  xB  = xB / xE;
	  xEv = _mm_set1_ps(1.0 / xE);
	  for (q = 0; q < Q; q++)
	    {
	      MMO(dpc,q) = _mm_mul_ps(MMO(dpc,q), xEv);
	      DMO(dpc,q) = _mm_mul_ps(DMO(dpc,q), xEv);
	      IMO(dpc,q) = _mm_mul_ps(IMO(dpc,q), xEv);
	    }
	  ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = xE;
	  ox->totscale += log(xE);
	  xE = 1.0;		
	}
      else ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = 1.0;

      /* Storage of the specials.  We could've stored these already
       * but using xE, etc. variables makes it easy to convert this
       * code to O(M) memory versions just by deleting storage steps.
       */
      ox->xmx[i*p7X_NXCELLS+p7X_E] = xE;
      ox->xmx[i*p7X_NXCELLS+p7X_N] = xN;
      ox->xmx[i*p7X_NXCELLS+p7X_J] = xJ;
      ox->xmx[i*p7X_NXCELLS+p7X_B] = xB;
      ox->xmx[i*p7X_NXCELLS+p7X_C] = xC;

#if p7_DEBUGGING
      if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, i, 9, 5, xE, xN, xJ, xB, xC);	/* logify=TRUE, <rowi>=i, width=9, precision=5 */
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T, and flip total score back to log space (nats) */
  /* On overflow, xC is inf or nan (nan arises because inf*0 = nan). */
  /* On an underflow (which shouldn't happen), we counterintuitively return infinity:
   * the effect of this is to force the caller to rescore us with full range.
   */
  if       (isnan(xC))        ESL_EXCEPTION(eslERANGE, "forward score is NaN");
  else if  (L>0 && xC == 0.0) ESL_EXCEPTION(eslERANGE, "forward score underflow (is 0.0)");     /* if L==0, xC *should* be 0.0; J5/118 */
  else if  (isinf(xC) == 1)   ESL_EXCEPTION(eslERANGE, "forward score overflow (is infinity)");

  if (opt_sc != NULL) *opt_sc = ox->totscale + log(xC * om->xf[p7O_C][p7O_MOVE]);
  return eslOK;
}
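The "incantation" used above to reduce xEv to a scalar is a standard shuffle-and-add horizontal sum. Pulled out into a helper for clarity (the helper name is ours; Easel ships its own utilities in esl_sse.[ch]):

#include <xmmintrin.h>

/* Return the sum of the four float lanes of v. */
static inline float hsum_ps(__m128 v)
{
  v = _mm_add_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(0, 3, 2, 1))); /* rotate lanes by one and add   */
  v = _mm_add_ps(v, _mm_shuffle_ps(v, v, _MM_SHUFFLE(1, 0, 3, 2))); /* rotate lanes by two and add   */
  float sum;                                                        /* every lane now holds the total */
  _mm_store_ss(&sum, v);
  return sum;
}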
コード例 #19
0
	IntersectionData intersectRaySpheres(const Ray& ray, const vector<int>& spheresIndices,
			const Spheres& spheres)
	{
		const int maxSpheresToCheck = 4;
		IntersectionData result;
		result.intersection = false;
		result.tIntersection = numeric_limits<float>::max();

		int remainder = spheresIndices.size() % maxSpheresToCheck;
		bool canUseSIMD = (remainder < spheresIndices.size());

		int nonSIMDStartPos = 0;

		if(canUseSIMD)
		{
			const int spheresToSIMDCheck = spheresIndices.size() - remainder;
			nonSIMDStartPos = spheresToSIMDCheck;
			//Vec4Float a = _mm_set1_ps(1.f); when rayDir is normalized a is 1
			Vec4Float b = _mm_set1_ps(0.f);
			Vec4Float c = b;
			Vec4Float D = c;

			Vec4Float centerCoords[3], radiuses;

			for(int i = 0; i < spheresToSIMDCheck; i += 4)
			{
				// Reset the accumulated quadratic coefficients for this group of four spheres
				b = _mm_set1_ps(0.f);
				c = b;

				// Gather the four radii once per group (not once per coordinate)
				radiuses = _mm_set_ps(
						spheres.radiuses[spheresIndices[i]], spheres.radiuses[spheresIndices[i + 1]],
						spheres.radiuses[spheresIndices[i + 2]], spheres.radiuses[spheresIndices[i + 3]]
				);

				for(int j = 0; j < 3; ++j)
				{
					centerCoords[j] = _mm_set_ps(
							spheres.centerCoords[j][spheresIndices[i]], spheres.centerCoords[j][spheresIndices[i + 1]],
							spheres.centerCoords[j][spheresIndices[i + 2]], spheres.centerCoords[j][spheresIndices[i + 3]]
					);

					b += 2.f * ray.direction.coords[j] * (ray.origin.coords[j] - centerCoords[j]);
					c += (ray.origin.coords[j] - centerCoords[j]) * (ray.origin.coords[j] - centerCoords[j]);
				}
				// With a normalized ray direction a == 1, so c = |origin - center|^2 - r^2
				// and the discriminant is b^2 - 4c
				c = c - radiuses * radiuses;
				D = b * b - 4.f * c;

				Vec4Float mask = _mm_cmpge_ps(D, _mm_set_ps1(0.f));
				Vec4Float squareRootD = _mm_sqrt_ps(D);
				D = _mm_and_ps(squareRootD, mask);

				Vec4Float t1, t2;
				t1 = _mm_or_ps((-b - squareRootD) * 0.5f, _mm_andnot_ps(mask, D));
				t2 = _mm_or_ps((-b + squareRootD) * 0.5f, _mm_andnot_ps(mask, D));

				float tRes = result.tIntersection;
				for(int j = 0; j < 4; ++j)
				{
					if(t1[j] >= 0 && t1[j] < tRes)
					{
						tRes = t1[j];
					}
					if(t2[j] >= 0 && t2[j] < tRes)
					{
						tRes = t2[j];
					}
				}

				if(tRes < result.tIntersection)
				{
					result.intersection = true;
					result.tIntersection = tRes;
				}
			}

			for(int i = nonSIMDStartPos; i < spheresIndices.size(); ++i)
			{
				IntersectionData data;
				int idx = spheresIndices[i];
				Sphere sphere;
				sphere.center.x = spheres.centerCoords[0][idx];
				sphere.center.y = spheres.centerCoords[1][idx];
				sphere.center.z = spheres.centerCoords[2][idx];
				sphere.radius = spheres.radiuses[idx];
				data = intersectSingleSphere(ray, sphere);

				if(data.intersection && data.tIntersection < result.tIntersection)
				{
					result = data;
				}
			}

			return result;
	}
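For reference, this is the scalar quadratic-root test that the SIMD block above evaluates for four spheres at a time, under the same assumption of a normalized ray direction (so a == 1). It is only a sketch reusing the field names seen in this file; the project's actual intersectSingleSphere() may differ, and it assumes the same headers and using-declarations as the surrounding code:

	// Sketch only: scalar reference for the vectorized sphere test above.
	IntersectionData intersectSphereScalarSketch(const Ray& ray, const Sphere& sphere)
	{
		IntersectionData result;
		result.intersection = false;
		result.tIntersection = numeric_limits<float>::max();

		const float center[3] = { sphere.center.x, sphere.center.y, sphere.center.z };
		float b = 0.f, c = -sphere.radius * sphere.radius;
		for(int j = 0; j < 3; ++j)
		{
			const float oc = ray.origin.coords[j] - center[j];
			b += 2.f * ray.direction.coords[j] * oc;   // b = 2 * dir . (origin - center)
			c += oc * oc;                              // c = |origin - center|^2 - r^2
		}
		const float D = b * b - 4.f * c;               // discriminant with a == 1
		if(D >= 0.f)
		{
			const float sqrtD = sqrtf(D);
			const float t1 = (-b - sqrtD) * 0.5f;
			const float t2 = (-b + sqrtD) * 0.5f;
			const float t = (t1 >= 0.f) ? t1 : t2;     // nearest non-negative root
			if(t >= 0.f)
			{
				result.intersection = true;
				result.tIntersection = t;
			}
		}
		return result;
	}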
コード例 #20
0
ファイル: AABB.cpp プロジェクト: chengzg/MathGeoLib
bool AABB::IntersectLineAABB_SSE(const float4 &rayPos, const float4 &rayDir, float tNear, float tFar) const
{
	assume(rayDir.IsNormalized4());
	assume(tNear <= tFar && "AABB::IntersectLineAABB: User gave a degenerate line as input for the intersection test!");
	/* For reference, this is the C++ form of the vectorized SSE code below.

	float4 recipDir = rayDir.RecipFast4();
	float4 t1 = (aabbMinPoint - rayPos).Mul(recipDir);
	float4 t2 = (aabbMaxPoint - rayPos).Mul(recipDir);
	float4 near = t1.Min(t2);
	float4 far = t1.Max(t2);
	float4 rayDirAbs = rayDir.Abs();

	if (rayDirAbs.x > 1e-4f) // ray is not parallel to the plane in question
	{
		tNear = Max(near.x, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.x, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.x < aabbMinPoint.x || rayPos.x > aabbMaxPoint.x) // early-out if the ray can't possibly enter the box.
		return false;

	if (rayDirAbs.y > 1e-4f) // ray is not parallel to the plane in question
	{
		tNear = Max(near.y, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.y, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.y < aabbMinPoint.y || rayPos.y > aabbMaxPoint.y) // early-out if the ray can't possibly enter the box.
		return false;

	if (rayDirAbs.z > 1e-4f) // ray is not parallel to the plane in question
	{
		tNear = Max(near.z, tNear); // tNear tracks distance to intersect (enter) the AABB.
		tFar = Min(far.z, tFar); // tFar tracks the distance to exit the AABB.
	}
	else if (rayPos.z < aabbMinPoint.z || rayPos.z > aabbMaxPoint.z) // early-out if the ray can't possibly enter the box.
		return false;

	return tNear < tFar;
	*/

	__m128 recipDir = _mm_rcp_ps(rayDir.v);
	// Note: The above performs an approximate reciprocal (11 bits of precision).
	// For a full precision reciprocal, perform a div:
//	__m128 recipDir = _mm_div_ps(_mm_set1_ps(1.f), rayDir.v);

	__m128 t1 = _mm_mul_ps(_mm_sub_ps(MinPoint_SSE(), rayPos.v), recipDir);
	__m128 t2 = _mm_mul_ps(_mm_sub_ps(MaxPoint_SSE(), rayPos.v), recipDir);

	__m128 nearD = _mm_min_ps(t1, t2); // [0 n3 n2 n1]
	__m128 farD = _mm_max_ps(t1, t2);  // [0 f3 f2 f1]

	// Check if the ray direction is parallel to any of the cardinal axes, and if so,
	// mask those [near, far] ranges away from the hit test computations.
	__m128 rayDirAbs = abs_ps(rayDir.v);

	const __m128 epsilon = _mm_set1_ps(1e-4f);
	// zeroDirections[i] will be nonzero for each axis i the ray is parallel to.
	__m128 zeroDirections = _mm_cmple_ps(rayDirAbs, epsilon);

	const __m128 floatInf = _mm_set1_ps(FLOAT_INF);
	const __m128 floatNegInf = _mm_set1_ps(-FLOAT_INF);

	// If the ray is parallel to one of the axes, replace the slab range for that axis
	// with [-inf, inf] range instead. (which is a no-op in the comparisons below)
	nearD = cmov_ps(nearD, floatNegInf, zeroDirections);
	farD = cmov_ps(farD , floatInf, zeroDirections);

	// Next, we need to compute horizontally max(nearD[0], nearD[1], nearD[2]) and min(farD[0], farD[1], farD[2])
	// to see if there is an overlap in the hit ranges.
	__m128 v1 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(0, 0, 0, 0)); // [f1 f1 n1 n1]
	__m128 v2 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(1, 1, 1, 1)); // [f2 f2 n2 n2]
	__m128 v3 = _mm_shuffle_ps(nearD, farD, _MM_SHUFFLE(2, 2, 2, 2)); // [f3 f3 n3 n3]
	nearD = _mm_max_ps(v1, _mm_max_ps(v2, v3));
	farD = _mm_min_ps(v1, _mm_min_ps(v2, v3));
	farD = _mm_shuffle_ps(farD, farD, _MM_SHUFFLE(3, 3, 3, 3)); // Unpack the result from high offset in the register.
	nearD = _mm_max_ps(nearD, _mm_set_ss(tNear));
	farD = _mm_min_ps(farD, _mm_set_ss(tFar));

	// Finally, test if the ranges overlap.
	__m128 rangeIntersects = _mm_cmple_ss(nearD, farD);

	// To store out the interval of intersection, uncomment the following:
	// These are disabled, since without these, the whole function runs without a single memory store,
	// which has been profiled to be very fast! Uncommenting these causes an order-of-magnitude slowdown.
	// For now, using the SSE version only where the tNear and tFar ranges are not interesting.
//	_mm_store_ss(&tNear, nearD);
//	_mm_store_ss(&tFar, farD);

	// To avoid false positives, need to have an additional rejection test for each cardinal axis the ray direction
	// is parallel to.
	__m128 out2 = _mm_cmplt_ps(rayPos.v, MinPoint_SSE());
	__m128 out3 = _mm_cmpgt_ps(rayPos.v, MaxPoint_SSE());
	out2 = _mm_or_ps(out2, out3);
	zeroDirections = _mm_and_ps(zeroDirections, out2);

	__m128 yOut = _mm_shuffle_ps(zeroDirections, zeroDirections, _MM_SHUFFLE(1,1,1,1));
	__m128 zOut = _mm_shuffle_ps(zeroDirections, zeroDirections, _MM_SHUFFLE(2,2,2,2));

	zeroDirections = _mm_or_ps(_mm_or_ps(zeroDirections, yOut), zOut);
	// Intersection occurs if the slab ranges had positive overlap and if the test was not rejected by the ray being
	// parallel to some cardinal axis.
	__m128 intersects = _mm_andnot_ps(zeroDirections, rangeIntersects);
	__m128 epsilonMasked = _mm_and_ps(epsilon, intersects);
	return _mm_comieq_ss(epsilon, epsilonMasked) != 0;
}
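The cmov_ps() and abs_ps() helpers used above are defined elsewhere in MathGeoLib and not shown in this snippet. They typically follow the usual mask-select and sign-bit-clearing patterns; plausible sketches are given below, with our own names so they are not mistaken for the library's actual definitions:

#include <emmintrin.h>

// Sketch only: per lane, mask all-ones -> take b, mask all-zeros -> keep a.
static inline __m128 cmov_ps_sketch(__m128 a, __m128 b, __m128 mask)
{
	return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
}

// Sketch only: clear the IEEE754 sign bit of each lane.
static inline __m128 abs_ps_sketch(__m128 x)
{
	const __m128 signMask = _mm_castsi128_ps(_mm_set1_epi32(0x80000000));
	return _mm_andnot_ps(signMask, x);
}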
コード例 #21
0
ファイル: esl_sse.c プロジェクト: ElofssonLab/TOPCONS2
/* Function:  esl_sse_logf()
 * Synopsis:  <r[z] = log x[z]>
 * Incept:    SRE, Fri Dec 14 11:32:54 2007 [Janelia]
 *
 * Purpose:   Given a vector <x> containing four floats, returns a
 *            vector <r> in which each element <r[z] = logf(x[z])>.
 *            
 *            Valid in the domain $x_z > 0$ for normalized IEEE754
 *            $x_z$.
 *
 *            For <x> $< 0$, including -0, returns <NaN>. For <x> $== 0$
 *            or subnormal <x>, returns <-inf>. For <x = inf>, returns
 *            <inf>. For <x = NaN>, returns <NaN>.
 *
 * Xref:      J2/71.
 * 
 * Note:      Derived from an SSE1 implementation by Julian
 *            Pommier. Converted to SSE2 and added handling
 *            of IEEE754 specials.
 */
__m128 
esl_sse_logf(__m128 x) 
{
  static float cephes_p[9] = {  7.0376836292E-2f, -1.1514610310E-1f,  1.1676998740E-1f,
				-1.2420140846E-1f, 1.4249322787E-1f, -1.6668057665E-1f,
				2.0000714765E-1f, -2.4999993993E-1f,  3.3333331174E-1f };
  __m128  onev = _mm_set1_ps(1.0f);          /* all elem = 1.0 */
  __m128  v0p5 = _mm_set1_ps(0.5f);          /* all elem = 0.5 */
  __m128i vneg = _mm_set1_epi32(0x80000000); /* all elem have IEEE sign bit up */
  __m128i vexp = _mm_set1_epi32(0x7f800000); /* all elem have IEEE exponent bits up */
  __m128i ei;
  __m128  e;
  __m128  invalid_mask, zero_mask, inf_mask;            /* masks used to handle special IEEE754 inputs */
  __m128  mask;
  __m128  origx;
  __m128  tmp;
  __m128  y;
  __m128  z;

  /* first, split x apart: x = frexpf(x, &e); */
  ei           = _mm_srli_epi32( _mm_castps_si128(x), 23);	                                        /* shift right 23: IEEE754 floats: ei = biased exponents     */
  invalid_mask = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vneg), vneg));  /* mask any elem that's negative; these become NaN           */
  zero_mask    = _mm_castsi128_ps ( _mm_cmpeq_epi32(ei, _mm_setzero_si128()));                          /* mask any elem zero or subnormal; these become -inf        */
  inf_mask     = _mm_castsi128_ps ( _mm_cmpeq_epi32( _mm_and_si128(_mm_castps_si128(x), vexp), vexp));  /* mask any elem inf or NaN; log(inf)=inf, log(NaN)=NaN      */
  origx        = x;			                                                                /* store original x, used for log(inf) = inf, log(NaN) = NaN */

  x  = _mm_and_ps(x, _mm_castsi128_ps(_mm_set1_epi32(~0x7f800000))); /* x now the stored 23 bits of the 24-bit significand        */
  x  = _mm_or_ps (x, v0p5);                                          /* sets hidden bit b[0]                                      */

  ei = _mm_sub_epi32(ei, _mm_set1_epi32(126));                       /* -127 (ei now signed base-2 exponent); then +1             */
  e  = _mm_cvtepi32_ps(ei);

  /* now, calculate the log */
  mask = _mm_cmplt_ps(x, _mm_set1_ps(0.707106781186547524f)); /* avoid conditional branches.           */
  tmp  = _mm_and_ps(x, mask);	                              /* tmp contains x values < 0.707, else 0 */
  x    = _mm_sub_ps(x, onev);
  e    = _mm_sub_ps(e, _mm_and_ps(onev, mask));
  x    = _mm_add_ps(x, tmp);
  z    = _mm_mul_ps(x,x);

  y =               _mm_set1_ps(cephes_p[0]);    y = _mm_mul_ps(y, x); 
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[1]));   y = _mm_mul_ps(y, x);    
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[2]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[3]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[4]));   y = _mm_mul_ps(y, x);    
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[5]));   y = _mm_mul_ps(y, x);   
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[6]));   y = _mm_mul_ps(y, x); 
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[7]));   y = _mm_mul_ps(y, x);  
  y = _mm_add_ps(y, _mm_set1_ps(cephes_p[8]));   y = _mm_mul_ps(y, x);
  y = _mm_mul_ps(y, z);

  tmp = _mm_mul_ps(e, _mm_set1_ps(-2.12194440e-4f));
  y   = _mm_add_ps(y, tmp);

  tmp = _mm_mul_ps(z, v0p5);
  y   = _mm_sub_ps(y, tmp);

  tmp = _mm_mul_ps(e, _mm_set1_ps(0.693359375f));
  x = _mm_add_ps(x, y);
  x = _mm_add_ps(x, tmp);

  /* IEEE754 cleanup: */
  x = esl_sse_select_ps(x, origx,                     inf_mask);  /* log(inf)=inf; log(NaN)      = NaN  */
  x = _mm_or_ps(x, invalid_mask);                                 /* log(x<0, including -0,-inf) = NaN  */
  x = esl_sse_select_ps(x, _mm_set1_ps(-eslINFINITY), zero_mask); /* x zero or subnormal         = -inf */
  return x;
}
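A minimal usage sketch for esl_sse_logf(), computing logf of four floats at once; the driver code is ours and assumes the function is declared in esl_sse.h:

#include <stdio.h>
#include <xmmintrin.h>
#include "esl_sse.h"   /* assumed to declare esl_sse_logf() */

int main(void)
{
  float in[4]  = { 0.5f, 1.0f, 2.718281828f, 1000.0f };
  float out[4];
  __m128 x = _mm_loadu_ps(in);
  _mm_storeu_ps(out, esl_sse_logf(x));
  for (int z = 0; z < 4; z++)
    printf("logf(%g) = %g\n", in[z], out[z]);   /* roughly -0.693, 0, 1, 6.908 */
  return 0;
}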
コード例 #22
0
static void OverdriveAndSuppressSSE2(AecCore* aec,
                                     float hNl[PART_LEN1],
                                     const float hNlFb,
                                     float efw[2][PART_LEN1]) {
  int i;
  const __m128 vec_hNlFb = _mm_set1_ps(hNlFb);
  const __m128 vec_one = _mm_set1_ps(1.0f);
  const __m128 vec_minus_one = _mm_set1_ps(-1.0f);
  const __m128 vec_overDriveSm = _mm_set1_ps(aec->overDriveSm);
  // vectorized code (four at once)
  for (i = 0; i + 3 < PART_LEN1; i += 4) {
    // Weight subbands
    __m128 vec_hNl = _mm_loadu_ps(&hNl[i]);
    const __m128 vec_weightCurve = _mm_loadu_ps(&WebRtcAec_weightCurve[i]);
    const __m128 bigger = _mm_cmpgt_ps(vec_hNl, vec_hNlFb);
    const __m128 vec_weightCurve_hNlFb = _mm_mul_ps(vec_weightCurve, vec_hNlFb);
    const __m128 vec_one_weightCurve = _mm_sub_ps(vec_one, vec_weightCurve);
    const __m128 vec_one_weightCurve_hNl =
        _mm_mul_ps(vec_one_weightCurve, vec_hNl);
    const __m128 vec_if0 = _mm_andnot_ps(bigger, vec_hNl);
    const __m128 vec_if1 = _mm_and_ps(
        bigger, _mm_add_ps(vec_weightCurve_hNlFb, vec_one_weightCurve_hNl));
    vec_hNl = _mm_or_ps(vec_if0, vec_if1);

    {
      const __m128 vec_overDriveCurve =
          _mm_loadu_ps(&WebRtcAec_overDriveCurve[i]);
      const __m128 vec_overDriveSm_overDriveCurve =
          _mm_mul_ps(vec_overDriveSm, vec_overDriveCurve);
      vec_hNl = mm_pow_ps(vec_hNl, vec_overDriveSm_overDriveCurve);
      _mm_storeu_ps(&hNl[i], vec_hNl);
    }

    // Suppress error signal
    {
      __m128 vec_efw_re = _mm_loadu_ps(&efw[0][i]);
      __m128 vec_efw_im = _mm_loadu_ps(&efw[1][i]);
      vec_efw_re = _mm_mul_ps(vec_efw_re, vec_hNl);
      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_hNl);

      // Ooura fft returns incorrect sign on imaginary component. It matters
      // here because we are making an additive change with comfort noise.
      vec_efw_im = _mm_mul_ps(vec_efw_im, vec_minus_one);
      _mm_storeu_ps(&efw[0][i], vec_efw_re);
      _mm_storeu_ps(&efw[1][i], vec_efw_im);
    }
  }
  // scalar code for the remaining items.
  for (; i < PART_LEN1; i++) {
    // Weight subbands
    if (hNl[i] > hNlFb) {
      hNl[i] = WebRtcAec_weightCurve[i] * hNlFb +
               (1 - WebRtcAec_weightCurve[i]) * hNl[i];
    }
    hNl[i] = powf(hNl[i], aec->overDriveSm * WebRtcAec_overDriveCurve[i]);

    // Suppress error signal
    efw[0][i] *= hNl[i];
    efw[1][i] *= hNl[i];

    // Ooura fft returns incorrect sign on imaginary component. It matters
    // here because we are making an additive change with comfort noise.
    efw[1][i] *= -1;
  }
}
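mm_pow_ps() is a WebRtc helper not included in this snippet; per lane it computes x^y (the shipped version uses an SSE approximation rather than libm calls). A naive scalar-fallback sketch with the same contract, under our own name:

#include <math.h>
#include <emmintrin.h>

// Scalar reference with the same contract as mm_pow_ps: r[i] = powf(a[i], b[i]).
static __m128 pow_ps_scalar_ref(__m128 a, __m128 b) {
  float af[4], bf[4], rf[4];
  _mm_storeu_ps(af, a);
  _mm_storeu_ps(bf, b);
  for (int i = 0; i < 4; ++i)
    rf[i] = powf(af[i], bf[i]);
  return _mm_loadu_ps(rf);
}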
コード例 #23
0
ファイル: nx_sse_float.hpp プロジェクト: mywoodstock/nxsimd
 inline vector4f operator|(const vector4f& lhs, const vector4f& rhs)
 {
     return _mm_or_ps(lhs, rhs);
 }
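Bitwise OR on float registers is mainly useful for combining comparison masks (or for splicing bit patterns, as in the select idiom shown earlier on this page). A small usage sketch against the raw intrinsic that the wrapper above forwards to:

#include <xmmintrin.h>

/* Returns a nonzero bitmask if any lane of v lies outside [lo, hi]. */
static inline int any_out_of_range(__m128 v, __m128 lo, __m128 hi)
{
    __m128 below = _mm_cmplt_ps(v, lo);
    __m128 above = _mm_cmpgt_ps(v, hi);
    return _mm_movemask_ps(_mm_or_ps(below, above));
}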