C++ (Cpp) _mm_mul_ps примеры использования

Пример #1

0

Показать файл

Файл: sse-mulps-1.c Проект: pjump/gcc

test (__m128 s1, __m128 s2)
{
    return _mm_mul_ps (s1, s2);
}

Пример #2

0

Показать файл

Файл: nb_kernel_ElecNone_VdwLJSh_GeomP1P1_sse4_1_single.c Проект: daniellandau/gromacs

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    nvdwtype         = fr->ntype;
    vdwparam         = fr->nbfp;
    vdwtype          = mdatoms->typeA;

    rcutoff_scalar   = fr->rvdw;
    rcutoff          = _mm_set1_ps(rcutoff_scalar);
    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);

    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
    rvdw             = _mm_set1_ps(fr->rvdw);

    /* Avoid stupid compiler warnings */
    jnrA = jnrB = jnrC = jnrD = 0;
    j_coord_offsetA = 0;
    j_coord_offsetB = 0;
    j_coord_offsetC = 0;
    j_coord_offsetD = 0;

    outeriter        = 0;
    inneriter        = 0;

    for(iidx=0;iidx<4*DIM;iidx++)

Пример #3

0

Показать файл

Файл: seismic.c Проект: agremm/Seismic

// function that implements the kernel of the seismic modeling algorithm
void seismic_exec(float **VEL, float **PPF, float **APF, float **NPF, float* seismicPulseVector, int spPosX, int spPosY, int xDim, int yDim, int timeSteps)
{
  int i,j;  // spatial loops counters
  int t;    // time loop counter
  #ifdef _VERBOSE
    int progressTimer = -1;
  #endif

  // make sure packing _all_ the data into sets of 4 element is ok
  assert( xDim % 4 == 0 );

  #ifdef _VERBOSE
    printf("processing...\n");
    printf("point of explosion = %d, %d\n", spPosX, spPosY);
  #endif

  // there are 16 XMM registers in 64 bit mode, so there is no need to spill to stack
  __m128 s_ppf, s_vel, s_actual, s_above1, s_left1, s_under1, s_right1, s_two, s_sixteen, s_sixty;
  __m128 s_above2, s_under2, s_left2, s_right2;

  float two[4] = {2.0f, 2.0f, 2.0f, 2.0f };
  float sixteen[4] = {16.0f,16.0f,16.0f,16.0f};
  float sixty[4] = {60.f,60.f,60.f,60.f};

  // preload XMM registers with constant values.
  s_two = _mm_load_ps( two );
  s_sixteen = _mm_load_ps( sixteen );
  s_sixty = _mm_load_ps( sixty );

  // time loop
  for (t = 0; t < timeSteps; t++)
  {
    #ifdef _VVERBOSE
      printf("----------------------------------------------\ntimestep: %d\n\n", t );
    #endif

    // add pulse
    APF[spPosY][spPosX] += seismicPulseVector[t];

    for(i=2; i<(yDim-2); i++)
    {
      for(j=2 + ALIGNMENT_OFFSET; j<(xDim-2); j+=4)
      {
        s_ppf = _mm_load_ps( &(PPF[i][j]) );
        s_vel = _mm_load_ps( &(VEL[i][j]) );
        s_actual = _mm_load_ps( &(APF[i][j]) );

        s_left1 = _mm_load_ps( &(APF[i-1][j]) );
        s_left2 = _mm_load_ps( &(APF[i-2][j]) );
        s_right2 = _mm_load_ps( &(APF[i+2][j]) );
        s_right1 = _mm_load_ps( &(APF[i+1][j]) );
        s_above1 = _mm_loadu_ps( &(APF[i][j-1]) );
        s_under1 = _mm_loadu_ps( &(APF[i][j+1]) );
        s_above2 = _mm_loadl_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(1, 0, 0, 0)),
                                &(APF[i][j-2]));

        s_under2 = _mm_loadh_pi( _mm_shuffle_ps(s_actual, s_actual, _MM_SHUFFLE(0, 0, 3, 2)),
                                &(APF[i][j+4]));

        // sum elements with an offset of one
        s_under1 = _mm_add_ps( s_under1, _mm_add_ps( s_above1, _mm_add_ps( s_left1, s_right1)));
        // sum elements with an offset of two
        s_above2 = _mm_add_ps( s_left2, _mm_add_ps( s_right2, _mm_add_ps( s_under2, s_above2)));
        // multiply with 16
        s_under1 = _mm_mul_ps( s_sixteen, s_under1 );
        //
        s_under1 = _mm_sub_ps( _mm_sub_ps( s_under1,  s_above2), _mm_mul_ps( s_sixty, s_actual ) );
        s_under1 = _mm_add_ps( _mm_mul_ps( s_vel, s_under1), _mm_sub_ps(_mm_mul_ps( s_two, s_actual ), s_ppf) );

        // save the result
        _mm_store_ps( &(NPF[i][j]), s_under1);

        #ifdef _VVERBOSE
          printf("[%d][%d]\n", i, j);
        #endif
      }
      #ifdef _VVERBOSE
        printf("\n");
      #endif
    }

    #ifdef _VERBOSE
//     shows one # at each 10% of the total processing time
      if (t/(timeSteps/10) > progressTimer )
      {
        printf("#");
        progressTimer++;
        fflush(stdout);
      }
    #endif

    // switch pointers instead of copying data
    PPF = APF;
    APF = NPF;
    NPF = PPF;
  }

  #ifdef _VERBOSE
    printf("\nend process!\n");
  #endif
}

Пример #4

0

Показать файл

Файл: nb_kernel_ElecRF_VdwLJ_GeomW3P1_avx_128_fma_single.c Проект: dkarkoulis/gromacs

    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_ps(fr->epsfac);
    charge           = mdatoms->chargeA;
    krf              = _mm_set1_ps(fr->ic->k_rf);
    krf2             = _mm_set1_ps(fr->ic->k_rf*2.0);
    crf              = _mm_set1_ps(fr->ic->c_rf);
    nvdwtype         = fr->ntype;
    vdwparam         = fr->nbfp;
    vdwtype          = mdatoms->typeA;

    /* Setup water-specific parameters */
    inr              = nlist->iinr[0];
    iq0              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+0]));
    iq1              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+1]));
    iq2              = _mm_mul_ps(facel,_mm_set1_ps(charge[inr+2]));
    vdwioffset0      = 2*nvdwtype*vdwtype[inr+0];

    /* Avoid stupid compiler warnings */
    jnrA = jnrB = jnrC = jnrD = 0;
    j_coord_offsetA = 0;
    j_coord_offsetB = 0;
    j_coord_offsetC = 0;
    j_coord_offsetD = 0;

    outeriter        = 0;
    inneriter        = 0;

    for(iidx=0;iidx<4*DIM;iidx++)

Пример #5

0

Показать файл

Файл: temperature.c Проект: angryziber/darktable

void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const int filters = dt_image_flipped_filter(&piece->pipe->image);
  dt_iop_temperature_data_t *d = (dt_iop_temperature_data_t *)piece->data;
  if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && filters && piece->pipe->image.bpp != 4)
  {
    const float coeffsi[3] = {d->coeffs[0]/65535.0f, d->coeffs[1]/65535.0f, d->coeffs[2]/65535.0f};
#ifdef _OPENMP
    #pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
    for(int j=0; j<roi_out->height; j++)
    {
      int i=0;
      const uint16_t *in = ((uint16_t *)ivoid) + j*roi_out->width;
      float *out = ((float*)ovoid) + j*roi_out->width;

      // process unaligned pixels
      for ( ; i < ((4-(j*roi_out->width & 3)) & 3) ; i++,out++,in++)
        *out = *in * coeffsi[FC(j+roi_out->y, i+roi_out->x, filters)];

      const __m128 coeffs = _mm_set_ps(coeffsi[FC(j+roi_out->y, roi_out->x+i+3, filters)],
                                       coeffsi[FC(j+roi_out->y, roi_out->x+i+2, filters)],
                                       coeffsi[FC(j+roi_out->y, roi_out->x+i+1, filters)],
                                       coeffsi[FC(j+roi_out->y, roi_out->x+i  , filters)]);

      // process aligned pixels with SSE
      for( ; i < roi_out->width - 3 ; i+=4,out+=4,in+=4)
      {
        _mm_stream_ps(out,_mm_mul_ps(coeffs,_mm_set_ps(in[3],in[2],in[1],in[0])));
      }

      // process the rest
      for( ; i<roi_out->width; i++,out++,in++)
        *out = *in * coeffsi[FC(j+roi_out->y, i+roi_out->x, filters)];
    }
    _mm_sfence();
  }
  else if(!dt_dev_pixelpipe_uses_downsampled_input(piece->pipe) && filters && piece->pipe->image.bpp == 4)
  {
#ifdef _OPENMP
    #pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
    for(int j=0; j<roi_out->height; j++)
    {
      const float *in = ((float *)ivoid) + j*roi_out->width;
      float *out = ((float*)ovoid) + j*roi_out->width;
      for(int i=0; i<roi_out->width; i++,out++,in++)
        *out = *in * d->coeffs[FC(j+roi_out->x, i+roi_out->y, filters)];
    }
  }
  else
  {
    const int ch = piece->colors;
#ifdef _OPENMP
    #pragma omp parallel for default(none) shared(roi_out, ivoid, ovoid, d) schedule(static)
#endif
    for(int k=0; k<roi_out->height; k++)
    {
      const float *in = ((float*)ivoid) + ch*k*roi_out->width;
      float *out = ((float*)ovoid) + ch*k*roi_out->width;
      for (int j=0; j<roi_out->width; j++,in+=ch,out+=ch)
        for(int c=0; c<3; c++) out[c] = in[c]*d->coeffs[c];
    }
  }
  for(int k=0; k<3; k++)
    piece->pipe->processed_maximum[k] = d->coeffs[k] * piece->pipe->processed_maximum[k];
}

Пример #6

0

Показать файл

Файл: BM3D_Final.cpp Проект: taopanpan/VapourSynth-BM3D

void BM3D_Final_Process::CollaborativeFilter(int plane,
    FLType *ResNum, FLType *ResDen,
    const FLType *src, const FLType *ref,
    const PosPairCode &code) const
{
    PCType GroupSize = static_cast<PCType>(code.size());
    // When para.GroupSize > 0, limit GroupSize up to para.GroupSize
    if (d.para.GroupSize > 0 && GroupSize > d.para.GroupSize)
    {
        GroupSize = d.para.GroupSize;
    }

    // Construct source group and reference group guided by matched pos code
    block_group srcGroup(src, src_stride[plane], code, GroupSize, d.para.BlockSize, d.para.BlockSize);
    block_group refGroup(ref, ref_stride[plane], code, GroupSize, d.para.BlockSize, d.para.BlockSize);

    // Initialize L2-norm of Wiener coefficients
    FLType L2Wiener = 0;

    // Apply forward 3D transform to the source group and the reference group
    d.f[plane].fp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data());
    d.f[plane].fp[GroupSize - 1].execute_r2r(refGroup.data(), refGroup.data());

    // Apply empirical Wiener filtering to the source group guided by the reference group
    const FLType sigmaSquare = d.f[plane].wienerSigmaSqr[GroupSize - 1];

    auto srcp = srcGroup.data();
    auto refp = refGroup.data();
    const auto upper = srcp + srcGroup.size();

#if defined(__SSE2__)
    static const ptrdiff_t simd_step = 4;
    const ptrdiff_t simd_residue = srcGroup.size() % simd_step;
    const ptrdiff_t simd_width = srcGroup.size() - simd_residue;

    const __m128 sgm_sqr = _mm_set_ps1(sigmaSquare);
    __m128 l2wiener_sum = _mm_setzero_ps();

    for (const auto upper1 = srcp + simd_width; srcp < upper1; srcp += simd_step, refp += simd_step)
    {
        const __m128 s1 = _mm_load_ps(srcp);
        const __m128 r1 = _mm_load_ps(refp);
        const __m128 r1sqr = _mm_mul_ps(r1, r1);

        const __m128 wiener = _mm_mul_ps(r1sqr, _mm_rcp_ps(_mm_add_ps(r1sqr, sgm_sqr)));

        const __m128 d1 = _mm_mul_ps(s1, wiener);
        _mm_store_ps(srcp, d1);
        l2wiener_sum = _mm_add_ps(l2wiener_sum, _mm_mul_ps(wiener, wiener));
    }

    alignas(16) FLType l2wiener_sum_f32[4];
    _mm_store_ps(l2wiener_sum_f32, l2wiener_sum);
    L2Wiener += l2wiener_sum_f32[0] + l2wiener_sum_f32[1] + l2wiener_sum_f32[2] + l2wiener_sum_f32[3];
#endif

    for (; srcp < upper; ++srcp, ++refp)
    {
        const FLType refSquare = *refp * *refp;
        const FLType wienerCoef = refSquare / (refSquare + sigmaSquare);
        *srcp *= wienerCoef;
        L2Wiener += wienerCoef * wienerCoef;
    }

    // Apply backward 3D transform to the filtered group
    d.f[plane].bp[GroupSize - 1].execute_r2r(srcGroup.data(), srcGroup.data());

    // Calculate weight for the filtered group
    // Also include the normalization factor to compensate for the amplification introduced in 3D transform
    FLType denWeight = L2Wiener <= 0 ? 1 : FLType(1) / L2Wiener;
    FLType numWeight = static_cast<FLType>(denWeight / d.f[plane].finalAMP[GroupSize - 1]);

    // Store the weighted filtered group to the numerator part of the final estimation
    // Store the weight to the denominator part of the final estimation
    srcGroup.AddTo(ResNum, dst_stride[plane], numWeight);
    srcGroup.CountTo(ResDen, dst_stride[plane], denWeight);
}

Пример #7

0

Показать файл

Файл: DXLightEngine.cpp Проект: PlutoniumHeart/VS2012FFalcon

// This function switch on the nearest lights to an object of a certain radius
void	CDXLight::UpdateDynamicLights(DWORD ID, D3DVECTOR *pos, float Radius)
{
	_MM_ALIGN16	XMMVector	Pos, Acc, Square;
	DWORD					idx,ldx, LightsInList=0;
	float					Dist, MaxDist=0.0f;

	// setup the position for XMM use
	*(D3DVECTOR*)&Pos.d3d=*pos;

	// if 1st call in this rendering
	if(!LightsLoaded){
		// Enable the lights + 1 ( light #0 is the Sun )
		idx=0;
//		while(idx<MAX_DYNAMIC_LIGHTS && LightList[idx].CameraDistance<=DYNAMIC_LIGHT_INSIDE_RANGE){
		while(idx<DynamicLights){
			m_pD3DD->SetLight(idx+1, &LightList[idx++].Light);
		}
		LightsLoaded=true;
	}
	
	//reset object own light list
	for(idx=0; idx<MAX_SAMETIME_LIGHTS; idx++){
		SwitchedList[idx].Distance=DYNAMIC_LIGHT_INSIDE_RANGE+1.0f;
		SwitchedList[idx].Index=0;
	}

	//for each light in list setup a distance list
	idx=0;
	//while(idx<MAX_DYNAMIC_LIGHTS && LightList[idx].CameraDistance<=DYNAMIC_LIGHT_INSIDE_RANGE){
	while(idx<DynamicLights){
		
		// Light defaults to OFF
		LightsToOn[idx]=false;

		// if this is a light illuminating ONLY THE OWNER, and we r not the owners, skip
		if(LightList[idx].Flags.OwnLight && LightList[idx].LightID!=ID){
			idx++;
			continue;
		}

		// if this is a light illuminating NOT THE OWNER, and we r the owners, skip
		if(LightList[idx].Flags.NotSelfLight && LightList[idx].LightID==ID){
			idx++;
			continue;
		}

		// get Vectors distance on X/Y/Z Axis and Square it for incoming use
		Acc.Xmm=_mm_sub_ps(Pos.Xmm, LightList[idx].Pos.Xmm);
		Square.Xmm=_mm_mul_ps(Acc.Xmm, Acc.Xmm);
		// Get the Distance
		Dist=sqrtf(Square.d3d.x+Square.d3d.y+Square.d3d.z);

		// If Object out of the Light range
		if((Dist-Radius)>LightList[idx].Light.dvRange)
		{idx++; continue; }
		
		// Calculation for SPOT LIGHTs cone
		if(LightList[idx].Light.dltType==D3DLIGHT_SPOT){

			// Get the Distance btw Light & Object
			//float	dx=Pos.d3d.x-LightList[idx].Light.dvPosition.x;
			//float	dy=Pos.d3d.y-LightList[idx].Light.dvPosition.y;
			//float	dz=Pos.d3d.z-LightList[idx].Light.dvPosition.z;
			// Calculate Horizontal and Vertical Angle btw Light & Object
			float	ax=atan2(Acc.d3d.x, Acc.d3d.y);
			float	ay=atan2(Acc.d3d.z,sqrtf(Square.d3d.x+Square.d3d.y));
			// transform the angles in same Sign Domain of the light angles X/Y
			if(fabs(ax-LightList[idx].alphaX)>PI) ax+=ax>LightList[idx].alphaX ? -2*PI : 2*PI;
			if(fabs(ay-LightList[idx].alphaY)>PI) ay+=ax>LightList[idx].alphaY ? -2*PI : 2*PI;
			float lPhy=LightList[idx].phi;
			float laX=LightList[idx].alphaX;
			float laY=LightList[idx].alphaY;
			// Calculate the Angular Delta given by Object Radius
			float	dPhi=atanf(Radius/Dist);

#ifdef	LIGHT_ENGINE_DEBUG
			REPORT_VALUE("Max :",(int)(LightList[idx].alphaX*180/PI));
			REPORT_VALUE("Min :",(int)(LightList[idx].alphaY*180/PI));
#ifdef DEBUG_LOD_ID
			sprintf(TheLODNames[gDebugLodID], "%3.1f %3.1f", ax*180/PI, dPhi*180/PI);
#endif
#endif
			if(ax-dPhi>(laX+lPhy) || ax+dPhi<(laX-lPhy) || ay-dPhi>(laY+lPhy) || ay+dPhi<(laY-lPhy))
			{ idx++; continue; }


		}
		Dist-=Radius;

		// if List still has a slot, add immediatly the light
		if(LightsInList<MAX_SAMETIME_LIGHTS){
			// setup new distance and index in the list
			SwitchedList[LightsInList].Distance=Dist;
			SwitchedList[LightsInList++].Index=idx;
			// flag the light as going to be switched on
			LightsToOn[idx]=true;
			// and update the longest in list
			if(Dist>MaxDist) MaxDist=Dist;
		} else {
			// else only if more near than any light in list
			if(Dist<MaxDist){
				// setup a temporary for the new Max Distance 
				float	TempDist=0.0f;
				// check if lower distance than any in the object lites list
				for(ldx=0; ldx<MAX_SAMETIME_LIGHTS; ldx++){
					// if distance is less
					if(SwitchedList[ldx].Distance==MaxDist){
						// disable the previous light
						LightsToOn[SwitchedList[ldx].Index]=false;
						// setup new distance and index in the list
						SwitchedList[ldx].Distance=Dist;
						SwitchedList[ldx].Index=idx;
						// enable the new light
						LightsToOn[idx]=true;
					}
					// check if new longest distance
					if(SwitchedList[ldx].Distance>TempDist) TempDist=SwitchedList[ldx].Distance;
				}
				// update new Max Distance
				MaxDist=TempDist;
			}
		}
		idx++;
	}

	EnableMappedLights();
}

Пример #8

0

Показать файл

Файл: nb_kernel_ElecEwSh_VdwLJEwSh_GeomP1P1_sse4_1_single.cpp Проект: friforever/gromacs

    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    facel            = _mm_set1_ps(fr->ic->epsfac);
    charge           = mdatoms->chargeA;
    nvdwtype         = fr->ntype;
    vdwparam         = fr->nbfp;
    vdwtype          = mdatoms->typeA;
    vdwgridparam     = fr->ljpme_c6grid;
    sh_lj_ewald	     = _mm_set1_ps(fr->ic->sh_lj_ewald);
    ewclj            = _mm_set1_ps(fr->ic->ewaldcoeff_lj);
    ewclj2	     = _mm_mul_ps(minus_one,_mm_mul_ps(ewclj,ewclj));

    sh_ewald         = _mm_set1_ps(fr->ic->sh_ewald);
    ewtab            = fr->ic->tabq_coul_FDV0;
    ewtabscale       = _mm_set1_ps(fr->ic->tabq_scale);
    ewtabhalfspace   = _mm_set1_ps(0.5/fr->ic->tabq_scale);

    /* When we use explicit cutoffs the value must be identical for elec and VdW, so use elec as an arbitrary choice */
    rcutoff_scalar   = fr->ic->rcoulomb;
    rcutoff          = _mm_set1_ps(rcutoff_scalar);
    rcutoff2         = _mm_mul_ps(rcutoff,rcutoff);

    sh_vdw_invrcut6  = _mm_set1_ps(fr->ic->sh_invrc6);
    rvdw             = _mm_set1_ps(fr->ic->rvdw);

    /* Avoid stupid compiler warnings */

Пример #9

0

Показать файл

Файл: PresetFrameIO.cpp Проект: projectM-visualizer/projectm

void PresetOutputs::PerPixelMath_sse(const PipelineContext &context)
{
	for (int x = 0; x < gx; x++)
	{
		for (int y = 0; y < gy; y += 4)
		{
			// fZoom2 = std::pow(this->zoom_mesh[x][y], std::pow(this->zoomexp_mesh[x][y],
			// 		rad_mesh[x][y] * 2.0f - 1.0f));
			__m128 rad_mesh_scaled =
				_mm_sub_ps(
					_mm_mul_ps(
						_mm_load_ps(&this->rad_mesh[x][y]),
						_mm_set_ps1(2.0f)),
					_mm_set_ps1(1.0f));
            __m128 zoom_mesh2 = _mm_load_ps(&this->zoom_mesh[x][y]);
            __m128 zoomexp_mesh2 = _mm_load_ps(&this->zoomexp_mesh[x][y]);
            __m128 fZoom2 = _mm_pow(zoom_mesh2, _mm_pow(zoomexp_mesh2, rad_mesh_scaled));
			// fZoom2Inv = 1.0f / fZoom2;
			__m128 fZoomInv = _mm_rcp_ps(fZoom2);

			// this->x_mesh[x][y] = this->orig_x[x][y] * 0.5f * fZoom2Inv + 0.5f;
            __m128 x_mesh2 =
				_mm_add_ps(
					_mm_mul_ps(
						_mm_load_ps(&this->orig_x[x][y]),
						_mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))),		// CONSIDER: common sub-expression
					_mm_set_ps1(0.5f));
			// this->x_mesh[x][y] = (this->x_mesh[x][y] - this->cx_mesh[x][y]) / this->sx_mesh[x][y] + this->cx_mesh[x][y];
            __m128 cx_mesh2 = _mm_load_ps(&this->cx_mesh[x][y]);
            __m128 sx_mesh2 = _mm_load_ps(&this->sx_mesh[x][y]);
			_mm_store_ps(&this->x_mesh[x][y],
				_mm_add_ps(
					_mm_div_ps(
                        _mm_sub_ps(x_mesh2,cx_mesh2),
                        sx_mesh2),
                    cx_mesh2
				));

			// this->y_mesh[x][y] = this->orig_y[x][y] * 0.5f * fZoom2Inv + 0.5f;
            __m128 y_mesh2 =
				_mm_add_ps(
					_mm_mul_ps(
						_mm_load_ps(&this->orig_y[x][y]),
						_mm_mul_ps(fZoomInv,_mm_set_ps1(0.5f))),
					_mm_set_ps1(0.5f));
			// this->y_mesh[x][y] = (this->y_mesh[x][y] - this->cy_mesh[x][y]) / this->sy_mesh[x][y] + this->cy_mesh[x][y];
            __m128 cy_mesh2 = _mm_load_ps(&this->cy_mesh[x][y]);
            __m128 sy_mesh2 = _mm_load_ps(&this->sy_mesh[x][y]);
			_mm_store_ps(&this->y_mesh[x][y],
				_mm_add_ps(
					_mm_div_ps(
                        _mm_sub_ps(y_mesh2,cy_mesh2),
                        sy_mesh2),
                    cy_mesh2
				));
		}
	}

	const float fWarpTime = context.time * this->fWarpAnimSpeed;
	const float fWarpScaleInv = 1.0f / this->fWarpScale;
	const float f[4] =
	{
		11.68f + 4.0f * cosf(fWarpTime * 1.413f + 10),
		 8.77f + 3.0f * cosf(fWarpTime * 1.113f + 7),
		10.54f + 3.0f * cosf(fWarpTime * 1.233f + 3),
		11.49f + 4.0f * cosf(fWarpTime * 0.933f + 5)
	};

	for (int x = 0; x < gx; x++)
	{
		for (int y = 0; y < gy; y+=4)
		{
			//float orig_x = this->orig_x[x][y];
			//float orig_y = this->orig_y[x][y];
			//float warp_mesh = this->warp_mesh[x][y] * 0.0035f;
            const __m128 orig_x2 = _mm_load_ps(&this->orig_x[x][y]);
            const __m128 orig_y2 = _mm_load_ps(&this->orig_y[x][y]);
            const __m128 warp_mesh2 = _mm_mul_ps(_mm_load_ps(&this->warp_mesh[x][y]), _mm_set_ps1(0.0035f));

			// this->x_mesh[x][y] +=
			// 	(warp_mesh * sinf(fWarpTime * 0.333f + fWarpScaleInv * (orig_x * f[0] - orig_y * f[3]))) +
			// 	(warp_mesh * cosf(fWarpTime * 0.753f - fWarpScaleInv * (orig_x * f[1] - orig_y * f[2])));
			_mm_store_ps(&this->x_mesh[x][y],
				_mm_add_ps(_mm_load_ps(&this->x_mesh[x][y]),
					_mm_add_ps(
                        _mm_mul_ps(warp_mesh2, _mm_sinf(
							_mm_add_ps(
								_mm_set_ps1(fWarpTime*0.333f),
								_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
									_mm_sub_ps(
                                        _mm_mul_ps(orig_x2, _mm_set_ps1(f[0])),
                                        _mm_mul_ps(orig_y2, _mm_set_ps1(f[3]))
									))))),
                        _mm_mul_ps(warp_mesh2, _mm_cosf(
							_mm_sub_ps(
								_mm_set_ps1(fWarpTime*0.753f),
								_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
									_mm_sub_ps(
                                        _mm_mul_ps(orig_x2, _mm_set_ps1(f[1])),
                                        _mm_mul_ps(orig_y2, _mm_set_ps1(f[2]))
									))))))));

			// this->y_mesh[x][y] +=
			// 	(warp_mesh * cosf(fWarpTime * 0.375f - fWarpScaleInv * (orig_x * f[2] + orig_y * f[1]))) +
			// 	(warp_mesh * sinf(fWarpTime * 0.825f + fWarpScaleInv * (orig_x * f[0] + orig_y * f[3])));
			_mm_store_ps(&this->y_mesh[x][y],
				_mm_add_ps(_mm_load_ps(&this->y_mesh[x][y]),
					_mm_add_ps(
                        _mm_mul_ps(warp_mesh2, _mm_cosf(
							_mm_sub_ps(
								_mm_set_ps1(fWarpTime*0.375f),
								_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
									_mm_add_ps(
                                        _mm_mul_ps(orig_x2, _mm_set_ps1(f[2])),
                                        _mm_mul_ps(orig_y2, _mm_set_ps1(f[1]))
									))))),
                        _mm_mul_ps(warp_mesh2, _mm_sinf(
							_mm_add_ps(
								_mm_set_ps1(fWarpTime*0.825f),
								_mm_mul_ps(_mm_set_ps1(fWarpScaleInv),
									_mm_add_ps(
                                        _mm_mul_ps(orig_x2, _mm_set_ps1(f[0])),
                                        _mm_mul_ps(orig_y2, _mm_set_ps1(f[3]))
									))))))));
		}
	}
	for (int x = 0; x < gx; x++)
	{
		for (int y = 0; y < gy; y+=4)
		{
			// const float u2 = this->x_mesh[x][y] - this->cx_mesh[x][y];
			// const float v2 = this->y_mesh[x][y] - this->cy_mesh[x][y];
			const __m128 u2 = _mm_sub_ps(_mm_load_ps(&this->x_mesh[x][y]),_mm_load_ps(&this->cx_mesh[x][y]));
			const __m128 v2 = _mm_sub_ps(_mm_load_ps(&this->y_mesh[x][y]),_mm_load_ps(&this->cy_mesh[x][y]));

			// const float rot = this->rot_mesh[x][y];
			// const float cos_rot = cosf(rot);
			// const float sin_rot = sinf(rot);
			__m128 sin_rot, cos_rot;
			_mm_sincosf(_mm_load_ps(&this->rot_mesh[x][y]), sin_rot, cos_rot);

			// this->x_mesh[x][y] = u2 * cos_rot - v2 * sin_rot + this->cx_mesh[x][y] - this->dx_mesh[x][y];
			_mm_store_ps(&this->x_mesh[x][y],
				_mm_add_ps(
					_mm_sub_ps(_mm_mul_ps(u2, cos_rot), _mm_mul_ps(v2,sin_rot)),
					_mm_sub_ps(_mm_load_ps(&this->cx_mesh[x][y]), _mm_load_ps(&this->dx_mesh[x][y]))
					));
			// this->y_mesh[x][y] = u2 * sin_rot + v2 * cos_rot + this->cy_mesh[x][y] - this->dy_mesh[x][y];
			_mm_store_ps(&this->y_mesh[x][y],
				_mm_add_ps(
					_mm_add_ps(_mm_mul_ps(u2, sin_rot), _mm_mul_ps(v2,cos_rot)),
					_mm_sub_ps(_mm_load_ps(&this->cy_mesh[x][y]), _mm_load_ps(&this->dy_mesh[x][y]))
					));
		}
	}
}

Пример #10

0

Показать файл

Файл: Multitap.cpp Проект: bdejong/smartelectronix

//hosts that have power-of-2 blocksizes
//ann aligned memory will allways end up doing
//sse processing...
void Multitap::process(float *inputs, float *outputs, unsigned long nSamples, bool replace)
{
	//no use in using SSE for less samples then 16!
	//host calling VST plugins with less than 16-sample buffers should be shot, tortured and shot again
	if(nSamples <= 16 || !sse)
	{
		processFPU(inputs,outputs,nSamples,replace);
		return;
	}

	//let's see if the current index is a multiple of 4
	//if it isn't, we need to process untill it *IS*
	unsigned long startSize = (4 - (indexfpu & 3)) & 3;
	unsigned long blockSize = 0;

	//stupid, but who cares ;-)
	while(startSize + blockSize + 4 <= nSamples)
		blockSize += 4;

	//we'll have to process a maximum of 4 samples at the end...
	unsigned long endSize = nSamples - (startSize + blockSize);

	if(startSize)
		processFPU(inputs,outputs,startSize,replace);

	inputs += startSize;
	outputs += startSize;

	if(blockSize)
	{
		nSamples = blockSize;

		unsigned long index = indexfpu >> 2;

#ifndef _WIN64
		_mm_empty(); // No MMX on x64
#endif

		_mm_prefetch(((char *)&delay[0]),0);
		_mm_prefetch(((char *)&amp[0]),0);

		//are the buffers 16-byte aligned??
		if ((((int)inputs & 15) == 0) && (((int)outputs & 15) == 0))
		{
			nSamples >>= 2;
			while(nSamples--)
			{
				float *x = inputs + 4;
				float *y = outputs + 4;

				_mm_prefetch((char *) x,0);
				_mm_prefetch((char *) y,0);

				buffer[index] = _mm_load_ps(inputs);

				__m128 out_sse = _mm_setzero_ps();

				for(long z=0;z<32;z+=4)
				{
					long tmp1 = (index - delay[z+0]) & mask;
					long tmp2 = (index - delay[z+1]) & mask;
					long tmp3 = (index - delay[z+2]) & mask;
					long tmp4 = (index - delay[z+3]) & mask;

					//out += amp[z] * buffer[tmp1]
					out_sse = _mm_add_ps(_mm_mul_ps(amp[z],buffer[tmp1]),out_sse);
					_mm_prefetch(((char *)&buffer[tmp1]) + 16,0);

					out_sse = _mm_add_ps(_mm_mul_ps(amp[z+1],buffer[tmp2]),out_sse);
					_mm_prefetch(((char *)&buffer[tmp2]) + 16,0);

					out_sse = _mm_add_ps(_mm_mul_ps(amp[z+2],buffer[tmp3]),out_sse);
					_mm_prefetch(((char *)&buffer[tmp3]) + 16,0);

					out_sse = _mm_add_ps(_mm_mul_ps(amp[z+3],buffer[tmp4]),out_sse);
					_mm_prefetch(((char *)&buffer[tmp4]) + 16,0);
				}

				if(replace)
					_mm_store_ps(outputs,out_sse);
				else
					_mm_store_ps(outputs,_mm_add_ps(out_sse,_mm_load_ps(outputs)));

				index++;
				index &= mask;

				inputs = x;
				outputs = y;
			}
		}
		else //non-aligned buffers!
		{

Пример #11

0

Показать файл

Файл: sse_mult_su3_na.c Проект: alvarodlg/lotsofcoresbook2code

void
intrin_sse_mult_su3_na(su3_matrixf* aa, su3_matrixf* bb, su3_matrixf* cc)
{

	 /* XMM Variables */
	 __m128 xmm2, xmm3, xmm0, xmm1, xmm6, xmm7, xmm4, xmm5;

	xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&((bb)->e[0][0]) );
	xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((bb)->e[0][1]) );
	xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&((bb)->e[0][2]) );
	xmm0 = _mm_loadh_pi(xmm0, (__m64 *)&((bb)->e[1][0]) );
	xmm1 = _mm_loadh_pi(xmm1, (__m64 *)&((bb)->e[1][1]) );
	xmm2 = _mm_loadh_pi(xmm2, (__m64 *)&((bb)->e[1][2]) );
	xmm3 = _mm_load_ss((float *)&((aa)->e[0][0].real) );
	xmm6 = _mm_load_ss((float *)&((aa)->e[0][1].real) );
	xmm4 = _mm_load_ss((float *)&((aa)->e[1][0].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].real) );
	xmm5 = _mm_load_ss((float *)&((aa)->e[2][0].real) );
	xmm3 = _mm_shuffle_ps( xmm3, xmm3, 0x00 );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm4 = _mm_shuffle_ps( xmm4, xmm4, 0x00 );
	xmm3 = _mm_mul_ps( xmm3, xmm0 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm5 = _mm_shuffle_ps( xmm5, xmm5, 0x00 );
	xmm4 = _mm_mul_ps( xmm4, xmm0 );
	xmm3 = _mm_add_ps( xmm3, xmm6 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm5 = _mm_mul_ps( xmm5, xmm0 );
	xmm4 = _mm_add_ps( xmm4, xmm7 );
	xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[0][2].real) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm5 = _mm_add_ps( xmm5, xmm6 );
	xmm3 = _mm_add_ps( xmm3, xmm7 );
	xmm6 = _mm_load_ss((float *)&((aa)->e[1][1].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][2].real) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm4 = _mm_add_ps( xmm4, xmm6 );
	xmm5 = _mm_add_ps( xmm5, xmm7 );
	xmm6 = _mm_load_ss((float *)&((aa)->e[0][0].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].imag) );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0xb1 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0xb1 );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xb1 );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	 	 xmm0 = _mm_xor_ps( xmm0, _sse_sgn24.xmm );
	 	 xmm1 = _mm_xor_ps( xmm1, _sse_sgn24.xmm );
	 	 xmm2 = _mm_xor_ps( xmm2, _sse_sgn24.xmm );
	xmm6 = _mm_mul_ps( xmm6, xmm0 );
	xmm7 = _mm_mul_ps( xmm7, xmm1 );
	xmm3 = _mm_add_ps( xmm3, xmm6 );
	xmm4 = _mm_add_ps( xmm4, xmm7 );
	xmm6 = _mm_load_ss((float *)&((aa)->e[2][2].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].imag) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm2 );
	xmm7 = _mm_mul_ps( xmm7, xmm0 );
	xmm5 = _mm_add_ps( xmm5, xmm6 );
	xmm4 = _mm_add_ps( xmm4, xmm7 );
	xmm6 = _mm_load_ss((float *)&((aa)->e[0][1].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].imag) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm7 = _mm_mul_ps( xmm7, xmm0 );
	xmm3 = _mm_add_ps( xmm3, xmm6 );
	xmm5 = _mm_add_ps( xmm5, xmm7 );
	xmm0 = _mm_load_ss((float *)&((aa)->e[0][2].imag) );
	xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].imag) );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0x00 );
	xmm6 = _mm_shuffle_ps( xmm6, xmm6, 0x00 );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x00 );
	xmm0 = _mm_mul_ps( xmm0, xmm2 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm3 = _mm_add_ps( xmm3, xmm0 );
	xmm5 = _mm_add_ps( xmm5, xmm6 );
	xmm4 = _mm_add_ps( xmm4, xmm7 );
	 	 xmm3 = _mm_xor_ps( xmm3, _sse_sgn24.xmm );
	 	 xmm4 = _mm_xor_ps( xmm4, _sse_sgn24.xmm );
	 	 xmm5 = _mm_xor_ps( xmm5, _sse_sgn24.xmm );
	_mm_storel_pi((__m64 *)&((cc)->e[0][0]), xmm3 );
	_mm_storel_pi((__m64 *)&((cc)->e[1][0]), xmm4 );
	_mm_storel_pi((__m64 *)&((cc)->e[2][0]), xmm5 );
	_mm_storeh_pi((__m64 *)&((cc)->e[0][1]), xmm3 );
	_mm_storeh_pi((__m64 *)&((cc)->e[1][1]), xmm4 );
	_mm_storeh_pi((__m64 *)&((cc)->e[2][1]), xmm5 );
	xmm0 = _mm_loadl_pi(xmm0, (__m64 *)&((bb)->e[2][0]) );
	xmm1 = _mm_loadl_pi(xmm1, (__m64 *)&((bb)->e[2][1]) );
	xmm2 = _mm_loadl_pi(xmm2, (__m64 *)&((bb)->e[2][2]) );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0x44 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x44 );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0x44 );
	xmm3 = _mm_load_ss((float *)&((aa)->e[0][0].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].real) );
	xmm3 = _mm_shuffle_ps( xmm3, xmm7, 0x00 );
	xmm4 = _mm_load_ss((float *)&((aa)->e[0][1].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].real) );
	xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
	xmm3 = _mm_mul_ps( xmm3, xmm0 );
	xmm4 = _mm_mul_ps( xmm4, xmm1 );
	xmm3 = _mm_add_ps( xmm3, xmm4 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].real) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].real) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm2 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].real) );
	xmm6 = _mm_load_ss((float *)&((aa)->e[2][1].real) );
	xmm6 = _mm_shuffle_ps( xmm6, xmm7, 0x00 );
	xmm6 = _mm_mul_ps( xmm6, xmm1 );
	xmm0 = _mm_shuffle_ps( xmm0, xmm0, 0xB1 );
	 	 xmm0 = _mm_xor_ps( xmm0, _sse_sgn24.xmm );
	xmm1 = _mm_shuffle_ps( xmm1, xmm1, 0x11 );
	 	 xmm1 = _mm_xor_ps( xmm1, _sse_sgn24.xmm );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB1 );
	 	 xmm2 = _mm_xor_ps( xmm2, _sse_sgn24.xmm );
	xmm4 = _mm_load_ss((float *)&((aa)->e[0][0].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][0].imag) );
	xmm4 = _mm_shuffle_ps( xmm4, xmm7, 0x00 );
	xmm4 = _mm_mul_ps( xmm4, xmm0 );
	xmm3 = _mm_add_ps( xmm3, xmm4 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][1].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][1].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm1 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	xmm5 = _mm_load_ss((float *)&((aa)->e[0][2].imag) );
	xmm7 = _mm_load_ss((float *)&((aa)->e[1][2].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm2 );
	xmm3 = _mm_add_ps( xmm3, xmm5 );
	 	 xmm3 = _mm_xor_ps( xmm3, _sse_sgn24.xmm );
	_mm_storel_pi((__m64 *)&((cc)->e[0][2]), xmm3 );
	_mm_storeh_pi((__m64 *)&((cc)->e[1][2]), xmm3 );
	xmm1 = _mm_shuffle_ps( xmm1, xmm0, 0x44 );
	xmm7 = _mm_load_ss((float *)&((aa)->e[2][0].imag) );
	xmm5 = _mm_load_ss((float *)&((aa)->e[2][1].imag) );
	xmm5 = _mm_shuffle_ps( xmm5, xmm7, 0x00 );
	xmm5 = _mm_mul_ps( xmm5, xmm1 );
	xmm6 = _mm_add_ps( xmm6, xmm5 );
	xmm2 = _mm_shuffle_ps( xmm2, xmm2, 0xB4 );
	 	 xmm2 = _mm_xor_ps( xmm2, _sse_sgn3.xmm );
	xmm7 = _mm_loadl_pi(xmm7, (__m64 *)&((aa)->e[2][2]) );
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0x05 );
	xmm7 = _mm_mul_ps( xmm7, xmm2 );
	xmm6 = _mm_add_ps( xmm6, xmm7 );
	xmm7 = xmm6 ; 
	xmm7 = _mm_shuffle_ps( xmm7, xmm7, 0xEE );
	xmm6 = _mm_add_ps( xmm6, xmm7 );
	 	 xmm6 = _mm_xor_ps( xmm6, _sse_sgn24.xmm );
	_mm_storel_pi((__m64 *)&((cc)->e[2][2]), xmm6 );
}

Пример #12

0

Показать файл

Файл: sgemm-openmp.c Проект: Taiyi/UC-Berkeley-CS61C

/** Assumes input matrix has dimension n divisible by STRIDE. */
inline void squarepad_sgemm (const int n, float *A, float *B, float *C) {
    omp_set_num_threads(NUM_THREADS);
    #pragma omp parallel
    {
	__m128 mmA1, mmA2, mmA3, mmA4, mmB1, mmB2, mmB3, mmB4, mmC;
	__m128 mmProd1, mmProd2, mmProd3, mmProd4;
	__m128 sum0, sum1, sum2, sum3, sum4, sum5, sum6, sum7;
	float tempSum0[4], tempSum1[4], tempSum2[4], tempSum3[4], tempSum4[4];
	float tempSum5[4], tempSum6[4], tempSum7[4];
	int J_STRIDE, I_STRIDE;
    
	if (n < 512) {
	    J_STRIDE = BLOCK_BIG;
	    I_STRIDE = BLOCK_BIG;
	} else if (n < 768) {
	    I_STRIDE = BLOCK_MED;
	    J_STRIDE = BLOCK_BIG;
	} else if (n < 801) {
	    I_STRIDE = BLOCK_MED;
	    J_STRIDE = BLOCK_MED;
	} else if (n < 830) {
	    I_STRIDE = BLOCK_MED;
	    J_STRIDE = BLOCK_SML;
	} else {
	    I_STRIDE = BLOCK_MED;
	    J_STRIDE = BLOCK_ONE;
	}
        #pragma omp for schedule(dynamic) nowait
	for (int j = 0; j < n; j += J_STRIDE)
	for (int i = 0; i < n; i += I_STRIDE)
	for (int j2 = j; j2 < (j + J_STRIDE); j2++)
        for (int i2 = i; i2 < (i + I_STRIDE); i2 += 8) {
	    sum0 = _mm_set1_ps(0); sum1 = _mm_set1_ps(0);
	    sum2 = _mm_set1_ps(0); sum3 = _mm_set1_ps(0);
	    sum4 = _mm_set1_ps(0); sum5 = _mm_set1_ps(0);
	    sum6 = _mm_set1_ps(0); sum7 = _mm_set1_ps(0);
	    for (int k = 0; k < n; k += K_STRIDE) {
		mmB1 = _mm_load_ps(B + j2*n + k); //Bload
		mmB2 = _mm_load_ps(B + j2*n + k + 4);
		mmB3 = _mm_load_ps(B + j2*n + k + 8);
		mmB4 = _mm_load_ps(B + j2*n + k + 12);
		    
		mmA1 = _mm_load_ps(A + i2*n + k); //0ALoad
		mmA2 = _mm_load_ps(A + i2*n + k + 4);
		mmA3 = _mm_load_ps(A + i2*n + k + 8);
		mmA4 = _mm_load_ps(A + i2*n + k + 12);
		    
		mmProd1 = _mm_mul_ps(mmA1, mmB1); //0Product
		mmProd2 = _mm_mul_ps(mmA2, mmB2);
		mmProd3 = _mm_mul_ps(mmA3, mmB3);
		mmProd4 = _mm_mul_ps(mmA4, mmB4);

		mmA1 = _mm_load_ps(A + (i2 + 1)*n + k); //1A
		mmA2 = _mm_load_ps(A + (i2 + 1)*n + k + 4);
		mmA3 = _mm_load_ps(A + (i2 + 1)*n + k + 8);
		mmA4 = _mm_load_ps(A + (i2 + 1)*n + k + 12);
		
		mmProd1 = _mm_add_ps(mmProd1, mmProd2); //0Sum
		mmProd3 = _mm_add_ps(mmProd3, mmProd4);
		mmProd1 = _mm_add_ps(mmProd1, mmProd3);
		sum0 = _mm_add_ps(mmProd1, sum0);
		    
		mmProd1 = _mm_mul_ps(mmA1, mmB1); //1P
		mmProd2 = _mm_mul_ps(mmA2, mmB2);
		mmProd3 = _mm_mul_ps(mmA3, mmB3);
		mmProd4 = _mm_mul_ps(mmA4, mmB4);

		mmA1 = _mm_load_ps(A + (i2 + 2)*n + k); //2A
		mmA2 = _mm_load_ps(A + (i2 + 2)*n + k + 4);
		mmA3 = _mm_load_ps(A + (i2 + 2)*n + k + 8);
		mmA4 = _mm_load_ps(A + (i2 + 2)*n + k + 12);

		mmProd1 = _mm_add_ps(mmProd1, mmProd2); //1S
		mmProd3 = _mm_add_ps(mmProd3, mmProd4);
		mmProd1 = _mm_add_ps(mmProd1, mmProd3);
		sum1 = _mm_add_ps(mmProd1, sum1);

		mmProd1 = _mm_mul_ps(mmA1, mmB1); //2P
		mmProd2 = _mm_mul_ps(mmA2, mmB2);
		mmProd3 = _mm_mul_ps(mmA3, mmB3);
		mmProd4 = _mm_mul_ps(mmA4, mmB4);

		mmA1 = _mm_load_ps(A + (i2 + 3)*n + k); //3A
		mmA2 = _mm_load_ps(A + (i2 + 3)*n + k + 4);
		mmA3 = _mm_load_ps(A + (i2 + 3)*n + k + 8);
		mmA4 = _mm_load_ps(A + (i2 + 3)*n + k + 12);

		mmProd1 = _mm_add_ps(mmProd1, mmProd2); //2S
		mmProd3 = _mm_add_ps(mmProd3, mmProd4);
		mmProd1 = _mm_add_ps(mmProd1, mmProd3);
		sum2 = _mm_add_ps(mmProd1, sum2);

		mmProd1 = _mm_mul_ps(mmA1, mmB1); //3P
		mmProd2 = _mm_mul_ps(mmA2, mmB2);
		mmProd3 = _mm_mul_ps(mmA3, mmB3);
		mmProd4 = _mm_mul_ps(mmA4, mmB4);

		mmA1 = _mm_load_ps(A + (i2 + 4)*n + k); //4A
		mmA2 = _mm_load_ps(A + (i2 + 4)*n + k + 4);
		mmA3 = _mm_load_ps(A + (i2 + 4)*n + k + 8);
		mmA4 = _mm_load_ps(A + (i2 + 4)*n + k + 12);

		mmProd1 = _mm_add_ps(mmProd1, mmProd2); //3S
		mmProd3 = _mm_add_ps(mmProd3, mmProd4);
		mmProd1 = _mm_add_ps(mmProd1, mmProd3);
		sum3 = _mm_add_ps(mmProd1, sum3);

		mmProd1 = _mm_mul_ps(mmA1, mmB1); //4P
		mmProd2 = _mm_mul_ps(mmA2, mmB2);
		mmProd3 = _mm_mul_ps(mmA3, mmB3);
		mmProd4 = _mm_mul_ps(mmA4, mmB4);

		mmA1 = _mm_load_ps(A + (i2 + 5)*n + k); //5A
		mmA2 = _mm_load_ps(A + (i2 + 5)*n + k + 4);
		mmA3 = _mm_load_ps(A + (i2 + 5)*n + k + 8);
		mmA4 = _mm_load_ps(A + (i2 + 5)*n + k + 12);

		mmProd1 = _mm_add_ps(mmProd1, mmProd2); //4S
		mmProd3 = _mm_add_ps(mmProd3, mmProd4);
		mmProd1 = _mm_add_ps(mmProd1, mmProd3);
		sum4 = _mm_add_ps(mmProd1, sum4);

		mmProd1 = _mm_mul_ps(mmA1, mmB1); //5P
		mmProd2 = _mm_mul_ps(mmA2, mmB2);
		mmProd3 = _mm_mul_ps(mmA3, mmB3);
		mmProd4 = _mm_mul_ps(mmA4, mmB4);

		mmA1 = _mm_load_ps(A + (i2 + 6)*n + k); //6A
		mmA2 = _mm_load_ps(A + (i2 + 6)*n + k + 4);
		mmA3 = _mm_load_ps(A + (i2 + 6)*n + k + 8);
		mmA4 = _mm_load_ps(A + (i2 + 6)*n + k + 12);

		mmProd1 = _mm_add_ps(mmProd1, mmProd2); //5S
		mmProd3 = _mm_add_ps(mmProd3, mmProd4);
		mmProd1 = _mm_add_ps(mmProd1, mmProd3);
		sum5 = _mm_add_ps(mmProd1, sum5);

		mmProd1 = _mm_mul_ps(mmA1, mmB1); //6P
		mmProd2 = _mm_mul_ps(mmA2, mmB2);
		mmProd3 = _mm_mul_ps(mmA3, mmB3);
		mmProd4 = _mm_mul_ps(mmA4, mmB4);

		mmA1 = _mm_load_ps(A + (i2 + 7)*n + k); //7A
		mmA2 = _mm_load_ps(A + (i2 + 7)*n + k + 4);
		mmA3 = _mm_load_ps(A + (i2 + 7)*n + k + 8);
		mmA4 = _mm_load_ps(A + (i2 + 7)*n + k + 12);

		mmProd1 = _mm_add_ps(mmProd1, mmProd2); //6S
		mmProd3 = _mm_add_ps(mmProd3, mmProd4);
		mmProd1 = _mm_add_ps(mmProd1, mmProd3);
		sum6 = _mm_add_ps(mmProd1, sum6);

		mmProd1 = _mm_mul_ps(mmA1, mmB1); //7P
		mmProd2 = _mm_mul_ps(mmA2, mmB2);
		mmProd3 = _mm_mul_ps(mmA3, mmB3);
		mmProd4 = _mm_mul_ps(mmA4, mmB4);

		mmProd1 = _mm_add_ps(mmProd1, mmProd2); //7S
		mmProd3 = _mm_add_ps(mmProd3, mmProd4);
		mmProd1 = _mm_add_ps(mmProd1, mmProd3);
		sum7 = _mm_add_ps(mmProd1, sum7);	    
	    }
	    sum0 = _mm_hadd_ps(sum0, sum1);
	    sum2 = _mm_hadd_ps(sum2, sum3);
	    sum0 = _mm_hadd_ps(sum0, sum2);

	    sum4 = _mm_hadd_ps(sum4, sum5);
	    sum6 = _mm_hadd_ps(sum6, sum7);
	    sum4 = _mm_hadd_ps(sum4, sum6);

	    mmC = _mm_load_ps(C + j2*n + i2);
	    mmC = _mm_add_ps(mmC, sum0);
	    _mm_store_ps(C + j2*n + i2, mmC);

	    mmC = _mm_load_ps(C + j2*n + i2 + 4);
	    mmC = _mm_add_ps(mmC, sum4);
	    _mm_store_ps(C + j2*n + i2 + 4, mmC);
	}
    }
}

Пример #13

0

Показать файл

Файл: nx_sse_float.hpp Проект: mywoodstock/nxsimd

 inline vector4f operator*(const vector4f& lhs, const vector4f& rhs)
 {
     return _mm_mul_ps(lhs, rhs);
 }

Пример #14

0

Показать файл

Файл: analyzeFuncs_sse3.cpp Проект: whatmorereason/seti_fpga

int sse3_ChirpData_ak8(
    sah_complex * cx_DataArray,
    sah_complex * cx_ChirpDataArray,
    int chirp_rate_ind,
    double chirp_rate,
    int  ul_NumDataPoints,
    double sample_rate
) {
#ifdef USE_MANUAL_CALLSTACK
    call_stack.enter("sse3_ChirpData_ak8()");
#endif 
    int i;

    if (chirp_rate_ind == 0) {
      memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
#ifdef USE_MANUAL_CALLSTACK
      call_stack.exit();
#endif 
      return 0;
    }

    int vEnd;
    double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
    __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
    __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);
    __m128d DFOUR = _mm_set_pd(4.0, 4.0);


    // main vectorised loop
    vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
    __m128d di1 = _mm_set_pd(2.0, 0.0);                 // set time patterns for eventual moveldup/movehdup
    __m128d di2 = _mm_set_pd(3.0, 1.0);

    for (i = 0; i < vEnd; i += 4) {
      const float *d = (const float *) (cx_DataArray + i);
      float *cd = (float *) (cx_ChirpDataArray + i);

      __m128d a1, a2;

      __m128 d1, d2;
      __m128 cd1, cd2;
      __m128 td1, td2;

      __m128 x;
      __m128 y;
      __m128 z;
      __m128 s;
      __m128 c;
      __m128 m;

      // load the signal to be chirped
      d1 = _mm_load_ps(d);
      d2 = _mm_load_ps(d+4);

      // calculate the input angle
      a1 = _mm_mul_pd(_mm_mul_pd(di1, di1), rate);
      a2 = _mm_mul_pd(_mm_mul_pd(di2, di2), rate);

      // update times for next
      di1 = _mm_add_pd(di1, DFOUR);
      di2 = _mm_add_pd(di2, DFOUR);

      // reduce the angle to the range (-0.5, 0.5)
      a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
      a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

      // convert pair of packed double into packed single
      x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));               // 3   1   2   0

      // square to the range [0, 0.25)
      y = _mm_mul_ps(x, x);

      // perform the initial polynomial approximations, Estrin's method
      z = _mm_mul_ps(y, y);

      s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4F),
                                                      SS3F),
                                           z),
                                _mm_add_ps(_mm_mul_ps(y, SS2F),
                                           SS1F)),
                     x);
      c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3F),
                                           CC2F),
                                z),
                     _mm_add_ps(_mm_mul_ps(y, CC1F),
                                ONE));

      // perform first angle doubling
      x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
      y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

      // calculate scaling factor to correct the magnitude
      m = _mm_sub_ps(_mm_sub_ps(TWO, _mm_mul_ps(x, x)), _mm_mul_ps(y, y));

      // perform second angle doubling
      c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
      s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

      // correct the magnitude (final sine / cosine approximations)
      c = _mm_mul_ps(c, m);                                       // c3    c1    c2    c0
      s = _mm_mul_ps(s, m);

      // chirp the data
      cd1 = _mm_moveldup_ps(c);                                   // c1    c1    c0    c0
      cd2 = _mm_movehdup_ps(c);                                   // c3    c3    c2    c2
      cd1 = _mm_mul_ps(cd1, d1);                                  // c1.i1 c1.r1 c0.i0 c0.r0
      cd2 = _mm_mul_ps(cd2, d2);                                  // c3.i3 c3.r3 c2.i2 c2.r2
      d1 = _mm_shuffle_ps(d1, d1, 0xb1);
      d2 = _mm_shuffle_ps(d2, d2, 0xb1);
      td1 = _mm_moveldup_ps(s);
      td2 = _mm_movehdup_ps(s);
      td1 = _mm_mul_ps(td1, d1);
      td2 = _mm_mul_ps(td2, d2);
      cd1 = _mm_addsub_ps(cd1, td1);
      cd2 = _mm_addsub_ps(cd2, td2);

      // store chirped values
      _mm_stream_ps(cd, cd1);
      _mm_stream_ps(cd+4, cd2);
    }

    // handle tail elements with scalar code
    for (; i < ul_NumDataPoints; ++i) {
      double angle = srate * i * i * 0.5;
      double s = sin(angle);
      double c = cos(angle);

      float re = cx_DataArray[i][0];
      float im = cx_DataArray[i][1];

      cx_ChirpDataArray[i][0] = re * c - im * s;
      cx_ChirpDataArray[i][1] = re * s + im * c;
    }
    analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;
#ifdef USE_MANUAL_CALLSTACK
    call_stack.exit();
#endif 
    return 0;
}

Пример #15

0

Показать файл

Файл: sgemm-small.c Проект: efossier/Matrix-Multiply-CS61C

void sgemm( int m_a, int n_a, float *A, float *B, float *C ) {
  __m128 partialRowB1,partialRowB2,partialRowB3;
  __m128 partialSumC1,partialSumC2,partialSumC3;
  __m128 partialColA1,partialColA2,partialColA3,partialColA4,partialColA5,partialColA6,partialColA7,partialColA8,partialColA9;
 
  if(m_a%36 == 0 && n_a%36 == 0){ //Optimized for 36x36 matrix
	  for( int v = 0; v < m_a; v++){
		  int j;
		  //Break into 12x12 matrices
		  for( j = 0; j < n_a/3*3; j+= 3 ) {
			//load next 3 elems of row B
			partialRowB1 = _mm_load1_ps(B+(v+m_a*j)); 
			partialRowB2 = _mm_load1_ps(B+(v+m_a*(j+1)));
			partialRowB3 = _mm_load1_ps(B+(v+m_a*(j+2)));
			int i;
			for( i = 0; i < m_a/12*12; i += 12 ) {
			  //Load 12 elems from column 1 of A
			  partialColA1 = _mm_loadu_ps(A+(i+m_a*j)); 
			  partialColA2 = _mm_loadu_ps(A+(i+m_a*j+4));
			  partialColA3 = _mm_loadu_ps(A+(i+m_a*j+8));
			  //load 12 elems from column 2 of A
			  partialColA4 = _mm_loadu_ps(A+(i+m_a*(j+1)));
			  partialColA5 = _mm_loadu_ps(A+(i+m_a*(j+1)+4));
			  partialColA6 = _mm_loadu_ps(A+(i+m_a*(j+1)+8));
			  //load 12 elems from column 3 of A
			  partialColA7 = _mm_loadu_ps(A+(i+m_a*(j+2)));
			  partialColA8 = _mm_loadu_ps(A+(i+m_a*(j+2)+4));
			  partialColA9 = _mm_loadu_ps(A+(i+m_a*(j+2)+8));
			  
			  //Multiply first col of A with first elem of B & sum into respective elem of C
			  partialSumC1 = _mm_add_ps(_mm_loadu_ps(C+(i+v*m_a)), _mm_mul_ps(partialColA1, partialRowB1));
			  partialSumC2 = _mm_add_ps(_mm_loadu_ps(C+(i+v*m_a+4)), _mm_mul_ps(partialColA2, partialRowB1));
			  partialSumC3 = _mm_add_ps(_mm_loadu_ps(C+(i+v*m_a+8)), _mm_mul_ps(partialColA3, partialRowB1));
			  
			  //Multiply second col of A with second elem of B & sum into respective elem of C
			  partialSumC1 = _mm_add_ps(partialSumC1, _mm_mul_ps(partialColA4, partialRowB2));
			  partialSumC2 = _mm_add_ps(partialSumC2, _mm_mul_ps(partialColA5, partialRowB2));
			  partialSumC3 = _mm_add_ps(partialSumC3, _mm_mul_ps(partialColA6, partialRowB2));
			  
			  //Multiply last col of A with last elem of B & sum into respective C & store
			  _mm_storeu_ps(C+i+v*m_a, _mm_add_ps(partialSumC1, _mm_mul_ps(partialColA7, partialRowB3)));
			  _mm_storeu_ps(C+i+v*m_a+4, _mm_add_ps(partialSumC2, _mm_mul_ps(partialColA8, partialRowB3)));
			  _mm_storeu_ps(C+i+v*m_a+8, _mm_add_ps(partialSumC3, _mm_mul_ps(partialColA9, partialRowB3)));
			}
		  }
	  }
   }
   //Handles matrices of size other than 36x36
   else{
	  for(int v = 0; v < n_a; v++){ //goes through output column in C
		int m_axv = m_a*v;
		for( int j = 0; j < m_a; j++ ) {
			int jxm_a = j*m_a;
			partialRowB1 = _mm_load1_ps(B+(j+m_axv)); //load current elem of row in B
			int i;
			for( i = 0; i < m_a/16*16; i += 16 ) {
			  //load next 16 col elems of A into packed sp
			  partialColA1 = _mm_loadu_ps(A+(i+m_axv)); 
			  partialColA2 = _mm_loadu_ps(A+(i+m_axv+4));
			  partialColA3 = _mm_loadu_ps(A+(i+m_axv+8));
			  partialColA4 = _mm_loadu_ps(A+(i+m_axv+12));
			  
			  //Compute part of elem in C, store in C
			  _mm_storeu_ps((C + i + jxm_a), _mm_add_ps(_mm_loadu_ps(C+i+jxm_a), _mm_mul_ps(partialColA1, partialRowB1)));
			  _mm_storeu_ps((C + i + jxm_a+4), _mm_add_ps(_mm_loadu_ps(C+i+jxm_a+4), _mm_mul_ps(partialColA2, partialRowB1)));
			  _mm_storeu_ps((C + i + jxm_a+8), _mm_add_ps(_mm_loadu_ps(C+i+jxm_a+8), _mm_mul_ps(partialColA3, partialRowB1)));
			  _mm_storeu_ps((C + i + jxm_a+12), _mm_add_ps(_mm_loadu_ps(C+i+jxm_a+12), _mm_mul_ps(partialColA4, partialRowB1)));
			}
			//fringe case
			for( i = m_a/16*16; i < m_a/4*4; i += 4 ) {
			  partialColA1 = _mm_loadu_ps(A+(i+m_axv)); //load next 4 col elems of A into packed sp
			  _mm_storeu_ps((C + i + jxm_a), _mm_add_ps(_mm_loadu_ps(C+i+jxm_a), _mm_mul_ps(partialColA1, partialRowB1)));
			}
			//finish off matrices with dimensions %4 != 0
			for( i = m_a/4*4; i < m_a; i++){
				C[i + jxm_a] += A[i+m_axv] * B[j+m_axv];
			}
		  }
	  }
	}
}

Пример #16

0

Показать файл

Файл: analyzeFuncs_sse3.cpp Проект: whatmorereason/seti_fpga

// =============================================================================
//
// sse3_vChirpData
// version by: Alex Kan
//   http://tbp.berkeley.edu/~alexkan/seti/
//
int sse3_ChirpData_ak(
  sah_complex * cx_DataArray,
  sah_complex * cx_ChirpDataArray,
  int chirp_rate_ind,
  double chirp_rate,
  int  ul_NumDataPoints,
  double sample_rate
) {
  int i;
#ifdef USE_MANUAL_CALLSTACK
  call_stack.enter("sse3_ChirpData_ak()");
#endif 

  if (chirp_rate_ind == 0) {
    memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
#ifdef USE_MANUAL_CALLSTACK
    call_stack.exit();
#endif 
    return 0;
  }

  int vEnd;  
  double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
  __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
  __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);

  // main vectorised loop
  vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
  for (i = 0; i < vEnd; i += 4) {
    const float *data = (const float *) (cx_DataArray + i);
    float *chirped = (float *) (cx_ChirpDataArray + i);
    __m128d di = _mm_set1_pd(i);
    __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di);
    __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di);

    __m128 d1, d2;
    __m128 cd1, cd2;
    __m128 td1, td2;
    __m128 x;
    __m128 y;
    __m128 s;
    __m128 c;
    __m128 m;

    // load the signal to be chirped
    prefetchnta((const void *)( data+32 ));
    d1 = _mm_load_ps(data);
    d2 = _mm_load_ps(data+4);

    // calculate the input angle
    a1 = _mm_mul_pd(_mm_mul_pd(a1, a1), rate);
    a2 = _mm_mul_pd(_mm_mul_pd(a2, a2), rate);

    // reduce the angle to the range (-0.5, 0.5)
    a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
    a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

    // convert pair of packed double into packed single
    x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));

    // square to the range [0, 0.25)
    y = _mm_mul_ps(x, x);

    // perform the initial polynomial approximations
    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4),
                                    SS3),
                                y),
                          SS2),
                    y),
              SS1),
          x);
    c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3),
                                CC2),
                          y),
                    CC1),
              y),
          ONE);

    // perform first angle doubling
    x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
    y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

    // calculate scaling factor to correct the magnitude
    //      m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO));
    //      m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO));
    m = vec_recip3(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));

    // perform second angle doubling
    c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
    s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

    // correct the magnitude (final sine / cosine approximations)
    s = _mm_mul_ps(s, m);
    c = _mm_mul_ps(c, m);

    // chirp the data
    cd1 = _mm_shuffle_ps(c, c, 0x50);
    cd2 = _mm_shuffle_ps(c, c, 0xfa);
    cd1 = _mm_mul_ps(cd1, d1);
    cd2 = _mm_mul_ps(cd2, d2);
    d1 = _mm_shuffle_ps(d1, d1, 0xb1);
    d2 = _mm_shuffle_ps(d2, d2, 0xb1);
    td1 = _mm_shuffle_ps(s, s, 0x50);
    td2 = _mm_shuffle_ps(s, s, 0xfa);
    td1 = _mm_mul_ps(td1, d1);
    td2 = _mm_mul_ps(td2, d2);
    cd1 = _mm_addsub_ps(cd1, td1);
    cd2 = _mm_addsub_ps(cd2, td2);

    // store chirped values
    _mm_stream_ps(chirped, cd1);
    _mm_stream_ps(chirped+4, cd2);
  }
  _mm_sfence();

  // handle tail elements with scalar code
  for (   ; i < ul_NumDataPoints; ++i) {
    double angle = srate * i * i * 0.5;
    double s = sin(angle);
    double c = cos(angle);
    float re = cx_DataArray[i][0];
    float im = cx_DataArray[i][1];

    cx_ChirpDataArray[i][0] = re * c - im * s;
    cx_ChirpDataArray[i][1] = re * s + im * c;
  }
  analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

#ifdef USE_MANUAL_CALLSTACK
  call_stack.exit();
#endif 
  return 0;
}

Пример #17

0

Показать файл

Файл: Mat44.cpp Проект: wrdn/GameEngine

void Mat44::Cramers_Inverse_SSE(const Mat44 *out, f32 &detv) const
{
	f32 *src = (f32*)&mat;

	__m128 minor0=_mm_setzero_ps(), minor1=_mm_setzero_ps(), minor2=_mm_setzero_ps(), minor3=_mm_setzero_ps();
	__m128 row0=_mm_setzero_ps(),   row1=_mm_setzero_ps(),   row2=_mm_setzero_ps(),   row3=_mm_setzero_ps();
	__m128 det=_mm_setzero_ps(),    tmp1=_mm_setzero_ps();

	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src)), (__m64*)(src+ 4));
	row1 = _mm_loadh_pi(_mm_loadl_pi(row1, (__m64*)(src+8)), (__m64*)(src+12));
	row0 = _mm_shuffle_ps(tmp1, row1, 0x88);
	row1 = _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1 = _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64*)(src+ 2)), (__m64*)(src+ 6));
	row3 = _mm_loadh_pi(_mm_loadl_pi(row3, (__m64*)(src+10)), (__m64*)(src+14));
	row2 = _mm_shuffle_ps(tmp1, row3, 0x88);
	row3 = _mm_shuffle_ps(row3, tmp1, 0xDD);
	
	tmp1 = _mm_mul_ps(row2, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_mul_ps(row1, tmp1);
	minor1 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1 = _mm_shuffle_ps(minor1, minor1, 0x4E);
	
	tmp1 = _mm_mul_ps(row1, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3 = _mm_shuffle_ps(minor3, minor3, 0x4E);
	
	tmp1 = _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2 = _mm_shuffle_ps(row2, row2, 0x4E);
	minor0 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2 = _mm_mul_ps(row0, tmp1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0 = _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2 = _mm_shuffle_ps(minor2, minor2, 0x4E);
	
	tmp1 = _mm_mul_ps(row0, row1);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2 = _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));
	
	tmp1 = _mm_mul_ps(row0, row3);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2 = _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));
	
	tmp1 = _mm_mul_ps(row0, row2);
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1 = _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3 = _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1 = _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1 = _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3 = _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);
	
	det = _mm_mul_ps(row0, minor0);
	det = _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det = _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	tmp1 = _mm_rcp_ss(det);
	det = _mm_sub_ss(_mm_add_ss(tmp1, tmp1), _mm_mul_ss(det, _mm_mul_ss(tmp1, tmp1)));
	det = _mm_shuffle_ps(det, det, 0x00);
	
	_mm_store_ss(&detv, det);

	Mat44 t;
	if(out)
	{
		src = (f32*)out->mat;
	}
	else
	{
		src = t.mat;
	}

	minor0 = _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64*)(src), minor0);
	_mm_storeh_pi((__m64*)(src+2), minor0);
	
	minor1 = _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64*)(src+4), minor1);
	_mm_storeh_pi((__m64*)(src+6), minor1);
	
	minor2 = _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64*)(src+ 8), minor2);
	_mm_storeh_pi((__m64*)(src+10), minor2);
	
	minor3 = _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64*)(src+12), minor3);
	_mm_storeh_pi((__m64*)(src+14), minor3);
};

Пример #18

0

Показать файл

Файл: game.c Проект: iamgreaser/4

void game_player_tick(player_t *pl, float dt)
{
	if(pl->magic != 0xC4 && pl->magic != 0xC9 && pl->magic != 0x66 && pl->magic != 0x69)
		return;
	
	camera_t *cam = &(pl->cam);

	float vs = 0.12f*100.0f*dt;

	// trace motion
	v4f_t no, tno, tv;
	no.m = _mm_setzero_ps();
	if(pl->magic != 0x66 && pl->magic != 0x69)
	{
		if(pl->vflip)
		{
			no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.x.m, _mm_set1_ps(-pl->lv.v.x*vs)));
			no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.y.m, _mm_set1_ps(pl->lv.v.y*vs)));
			no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.z.m, _mm_set1_ps(pl->lv.v.w*vs)));
			no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.w.m, _mm_set1_ps(pl->lv.v.z*vs)));
		} else {
			no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.x.m, _mm_set1_ps(pl->lv.v.x*vs)));
			no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.y.m, _mm_set1_ps(pl->lv.v.y*vs)));
			no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.z.m, _mm_set1_ps(pl->lv.v.z*vs)));
			no.m = _mm_add_ps(no.m, _mm_mul_ps(cam->m.v.w.m, _mm_set1_ps(pl->lv.v.w*vs)));
		}
	}

	// add gravity
	pl->grav_v += 0.3f*vs;
	if(pl->grav_v > 2.5f)
		pl->grav_v = 2.5f;
	no.v.y = pl->grav_v*vs;

	// check distance
	v4f_t tv2;
	tv2.m = _mm_mul_ps(no.m, no.m);
	//float md = sqrtf(vx*vx + vy*vy + vz*vz + vw*vw)*vs;
	float md = tv2.v.x + tv2.v.y + tv2.v.z + tv2.v.w;

	if(md > 0.0000001f)
	{
		// normalise for direction
		tv.m = no.m;
		vec_norm(&tv);

		// cast a ray
		float r = 0.5f;
		tno.m = cam->o.m;
		int side;

		// trace away
		for(;;)
		{
			float d = trace_box(root, &tno, &tv, NULL, NULL, NULL, md + r, &side);

			//
			// apply collision
			//

			if(d < 0.0f)
			{
				// no collision. jump to point.
				cam->o.m = _mm_add_ps(cam->o.m,
					_mm_mul_ps(_mm_set1_ps(md), tv.m));

				break;
			} else {
				// we've hit a plane. slide back.
				if(side == F_YP)
				{
					if(pl->grav_v >= 0.0f)
					{
						pl->grav_v = 0.0f;
						pl->grounded = 1;
					}
				} else if(side == F_YN) {
					if(pl->grav_v <= 0.0f)
						pl->grav_v = 0.0f;
				}

				float dd = md - (d - r);

				cam->o.m = _mm_add_ps(cam->o.m,
					_mm_mul_ps(_mm_set1_ps(md - dd), tv.m));

				// mask out velocity.
				tv.a[side&3] = 0.0f;

				// reduce distance.
				md = dd;
			}
		}
	}
}

Пример #19

0

Показать файл

Файл: AEUtil.cpp Проект: AndyPeterman/mrmc

void CAEUtil::ClampArray(float *data, uint32_t count)
{
#if !defined(HAVE_SSE) || !defined(__SSE__)
  for (uint32_t i = 0; i < count; ++i)
    data[i] = SoftClamp(data[i]);

#else
  const __m128 c1 = _mm_set_ps1(27.0f);
  const __m128 c2 = _mm_set_ps1(27.0f + 9.0f);

  /* work around invalid alignment */
  while (((uintptr_t)data & 0xF) && count > 0)
  {
    data[0] = SoftClamp(data[0]);
    ++data;
    --count;
  }

  uint32_t even = count & ~0x3;
  for (uint32_t i = 0; i < even; i+=4, data+=4)
  {
    /* tanh approx clamp */
    __m128 dt = _mm_load_ps(data);
    __m128 tmp     = _mm_mul_ps(dt, dt);
    *(__m128*)data = _mm_div_ps(
      _mm_mul_ps(
        dt,
        _mm_add_ps(c1, tmp)
      ),
      _mm_add_ps(c2, tmp)
    );
  }

  if (even != count)
  {
    uint32_t odd = count - even;
    if (odd == 1)
      data[0] = SoftClamp(data[0]);
    else
    {
      __m128 dt;
      __m128 tmp;
      __m128 out;
      if (odd == 2)
      {
        /* tanh approx clamp */
        dt  = _mm_setr_ps(data[0], data[1], 0, 0);
        tmp = _mm_mul_ps(dt, dt);
        out = _mm_div_ps(
          _mm_mul_ps(
            dt,
            _mm_add_ps(c1, tmp)
          ),
          _mm_add_ps(c2, tmp)
        );

        data[0] = ((float*)&out)[0];
        data[1] = ((float*)&out)[1];
      }
      else
      {
        /* tanh approx clamp */
        dt  = _mm_setr_ps(data[0], data[1], data[2], 0);
        tmp = _mm_mul_ps(dt, dt);
        out = _mm_div_ps(
          _mm_mul_ps(
            dt,
            _mm_add_ps(c1, tmp)
          ),
          _mm_add_ps(c2, tmp)
        );

        data[0] = ((float*)&out)[0];
        data[1] = ((float*)&out)[1];
        data[2] = ((float*)&out)[2];
      }
    }
  }
#endif
}

Пример #20

0

Показать файл

Файл: m_matrix44_sse.c Проект: varialus/agar

M_Matrix44
M_MatrixInvert44_SSE(M_Matrix44 A)
{
	M_Matrix44 Ainv;
	float *src = &A.m[0][0];
	float *dst = &Ainv.m[0][0];
	__m128 minor0, minor1, minor2, minor3;
	__m128 row0, row1, row2, row3;
	__m128 det, tmp1;

	tmp1	= _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src)),
	                                          (__m64 *)(src+4));
	row1	= _mm_loadh_pi(_mm_loadl_pi(row1, (__m64 *)(src+8)),
	                                          (__m64 *)(src+12));
	row0	= _mm_shuffle_ps(tmp1, row1, 0x88);
	row1	= _mm_shuffle_ps(row1, tmp1, 0xDD);
	tmp1	= _mm_loadh_pi(_mm_loadl_pi(tmp1, (__m64 *)(src+2)),
	                                          (__m64 *)(src+6));
	row3	= _mm_loadh_pi(_mm_loadl_pi(row3, (__m64 *)(src+10)),
	                                          (__m64 *)(src+14));
	row2	= _mm_shuffle_ps(tmp1, row3, 0x88);
	row3	= _mm_shuffle_ps(row3, tmp1, 0xDD);

	tmp1	= _mm_mul_ps(row2, row3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0	= _mm_mul_ps(row1, tmp1);
	minor1	= _mm_mul_ps(row0, tmp1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0	= _mm_sub_ps(_mm_mul_ps(row1, tmp1), minor0);
	minor1	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor1);
	minor1	= _mm_shuffle_ps(minor1, minor1, 0x4E);

	tmp1	= _mm_mul_ps(row1, row2);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor0	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor0);
	minor3	= _mm_mul_ps(row0, tmp1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0	= _mm_sub_ps(minor0, _mm_mul_ps(row3, tmp1));
	minor3	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor3);
	minor3	= _mm_shuffle_ps(minor3, minor3, 0x4E);

	tmp1	= _mm_mul_ps(_mm_shuffle_ps(row1, row1, 0x4E), row3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	row2	= _mm_shuffle_ps(row2, row2, 0x4E);
	minor0	= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor0);
	minor2	= _mm_mul_ps(row0, tmp1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor0	= _mm_sub_ps(minor0, _mm_mul_ps(row2, tmp1));
	minor2	= _mm_sub_ps(_mm_mul_ps(row0, tmp1), minor2);
	minor2	= _mm_shuffle_ps(minor2, minor2, 0x4E);

	tmp1	= _mm_mul_ps(row0, row1);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor2	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3	= _mm_sub_ps(_mm_mul_ps(row2, tmp1), minor3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor2	= _mm_sub_ps(_mm_mul_ps(row3, tmp1), minor2);
	minor3	= _mm_sub_ps(minor3, _mm_mul_ps(row2, tmp1));

	tmp1	= _mm_mul_ps(row0, row3);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1	= _mm_sub_ps(minor1, _mm_mul_ps(row2, tmp1));
	minor2	= _mm_add_ps(_mm_mul_ps(row1, tmp1), minor2);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1	= _mm_add_ps(_mm_mul_ps(row2, tmp1), minor1);
	minor2	= _mm_sub_ps(minor2, _mm_mul_ps(row1, tmp1));

	tmp1	= _mm_mul_ps(row0, row2);
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0xB1);
	minor1	= _mm_add_ps(_mm_mul_ps(row3, tmp1), minor1);
	minor3	= _mm_sub_ps(minor3, _mm_mul_ps(row1, tmp1));
	tmp1	= _mm_shuffle_ps(tmp1, tmp1, 0x4E);
	minor1	= _mm_sub_ps(minor1, _mm_mul_ps(row3, tmp1));
	minor3	= _mm_add_ps(_mm_mul_ps(row1, tmp1), minor3);

	det	= _mm_mul_ps(row0, minor0);
	det	= _mm_add_ps(_mm_shuffle_ps(det, det, 0x4E), det);
	det	= _mm_add_ss(_mm_shuffle_ps(det, det, 0xB1), det);
	tmp1	= _mm_rcp_ss(det);
	det	= _mm_sub_ss(_mm_add_ss(tmp1, tmp1),
	                     _mm_mul_ss(det, _mm_mul_ss(tmp1,tmp1)));
	det	= _mm_shuffle_ps(det, det, 0x00);

	minor0	= _mm_mul_ps(det, minor0);
	_mm_storel_pi((__m64 *)(dst), minor0);
	_mm_storeh_pi((__m64 *)(dst+2), minor0);

	minor1	= _mm_mul_ps(det, minor1);
	_mm_storel_pi((__m64 *)(dst+4), minor1);
	_mm_storeh_pi((__m64 *)(dst+6), minor1);
	
	minor2	= _mm_mul_ps(det, minor2);
	_mm_storel_pi((__m64 *)(dst+8), minor2);
	_mm_storeh_pi((__m64 *)(dst+10), minor2);
	
	minor3	= _mm_mul_ps(det, minor3);
	_mm_storel_pi((__m64 *)(dst+12), minor3);
	_mm_storeh_pi((__m64 *)(dst+14), minor3);

	return (Ainv);
}

Пример #21

0

Показать файл

Файл: graduatednd.c Проект: EvilBit/darktable

void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_graduatednd_data_t *data = (dt_iop_graduatednd_data_t *)piece->data;
  const int ch = piece->colors;

  const int ix= (roi_in->x);
  const int iy= (roi_in->y);
  const float iw=piece->buf_in.width*roi_out->scale;
  const float ih=piece->buf_in.height*roi_out->scale;
  const float hw=iw/2.0;
  const float hh=ih/2.0;
  const float hw_inv=1.0/hw;
  const float hh_inv=1.0/hh;
  const float v=(-data->rotation/180)*M_PI;
  const float sinv=sin(v);
  const float cosv=cos(v);
  const float filter_radie=sqrt((hh*hh)+(hw*hw))/hh;
  const float offset=data->offset/100.0*2;

  float color[3];
  hsl2rgb(color,data->hue,data->saturation,0.5);
  if (data->density < 0)
    for ( int l=0; l<3; l++ )
      color[l] = 1.0-color[l];

#if 1
  const float filter_compression = 1.0/filter_radie/(1.0-(0.5+(data->compression/100.0)*0.9/2.0))*0.5;
#else
  const float compression = data->compression/100.0f;
  const float t = 1.0f - .8f/(.8f + compression);
  const float c = 1.0f + 1000.0f*powf(4.0, compression);
#endif


  if (data->density > 0)
  {
#ifdef _OPENMP
    #pragma omp parallel for default(none) shared(roi_out, color, data, ivoid, ovoid) schedule(static)
#endif
    for(int y=0; y<roi_out->height; y++)
    {
      int k=roi_out->width*y*ch;
      const float *in = (float*)ivoid + k;
      float *out = (float*)ovoid + k;

      float length = (sinv * (-1.0+ix*hw_inv) - cosv * (-1.0+(iy+y)*hh_inv) - 1.0 + offset) * filter_compression;
      const float length_inc = sinv * hw_inv * filter_compression;

      __m128 c = _mm_set_ps(0,color[2],color[1],color[0]);
      __m128 c1 = _mm_sub_ps(_mm_set1_ps(1.0f),c);

      for(int x=0; x<roi_out->width; x++, in+=ch, out+=ch)
      {
#if 1
        // !!! approximation is ok only when highest density is 8
        // for input x = (data->density * CLIP( 0.5+length ), calculate 2^x as (e^(ln2*x/8))^8
        // use exp2f approximation to calculate e^(ln2*x/8)
        // in worst case - density==8,CLIP(0.5-length) == 1.0 it gives 0.6% of error
        const float t = 0.693147181f /* ln2 */ * (data->density * CLIP( 0.5f+length )/8.0f);
        float d1 = t*t*0.5f;
        float d2 = d1*t*0.333333333f;
        float d3 = d2*t*0.25f;
        float d = 1+t+d1+d2+d3; /* taylor series for e^x till x^4 */
        //printf("%d %d  %f\n",y,x,d);
        __m128 density = _mm_set1_ps(d);
        density = _mm_mul_ps(density,density);
        density = _mm_mul_ps(density,density);
        density = _mm_mul_ps(density,density);
#else
        // use fair exp2f
        __m128 density = _mm_set1_ps(exp2f(data->density * CLIP( 0.5f+length )));
#endif

        /* max(0,in / (c + (1-c)*density)) */
        _mm_stream_ps(out,_mm_max_ps(_mm_set1_ps(0.0f),_mm_div_ps(_mm_load_ps(in),_mm_add_ps(c,_mm_mul_ps(c1,density)))));

        length += length_inc;
      }
    }
  }
  else
  {
#ifdef _OPENMP
    #pragma omp parallel for default(none) shared(roi_out, color, data, ivoid, ovoid) schedule(static)
#endif
    for(int y=0; y<roi_out->height; y++)
    {
      int k=roi_out->width*y*ch;
      const float *in = (float*)ivoid + k;
      float *out = (float*)ovoid + k;

      float length = (sinv * (-1.0f+ix*hw_inv) - cosv * (-1.0f+(iy+y)*hh_inv) - 1.0f + offset) * filter_compression;
      const float length_inc = sinv * hw_inv * filter_compression;

      __m128 c = _mm_set_ps(0,color[2],color[1],color[0]);
      __m128 c1 = _mm_sub_ps(_mm_set1_ps(1.0f),c);

      for(int x=0; x<roi_out->width; x++, in+=ch, out+=ch)
      {
#if 1
        // !!! approximation is ok only when lowest density is -8
        // for input x = (-data->density * CLIP( 0.5-length ), calculate 2^x as (e^(ln2*x/8))^8
        // use exp2f approximation to calculate e^(ln2*x/8)
        // in worst case - density==-8,CLIP(0.5-length) == 1.0 it gives 0.6% of error
        const float t = 0.693147181f /* ln2 */ * (-data->density * CLIP( 0.5f-length )/8.0f);
        float d1 = t*t*0.5f;
        float d2 = d1*t*0.333333333f;
        float d3 = d2*t*0.25f;
        float d = 1+t+d1+d2+d3; /* taylor series for e^x till x^4 */
        __m128 density = _mm_set1_ps(d);
        density = _mm_mul_ps(density,density);
        density = _mm_mul_ps(density,density);
        density = _mm_mul_ps(density,density);
#else
        __m128 density = _mm_set1_ps(exp2f(-data->density * CLIP( 0.5f-length )));
#endif

        /* max(0,in * (c + (1-c)*density)) */
        _mm_stream_ps(out,_mm_max_ps(_mm_set1_ps(0.0f),_mm_mul_ps(_mm_load_ps(in),_mm_add_ps(c,_mm_mul_ps(c1,density)))));

        length += length_inc;
      }
    }
  }
  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}