示例#1
0
void conv_filter_sse(int imgHeight, int imgWidth, int imgHeightF, int imgWidthF,
				 int imgFOfssetH, int imgFOfssetW,
				 float* filter, float *imgFloatSrc, float *imgFloatDst)
{

	//1.
	const register __declspec(align(16)) auto const_0 = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
	//2.
    const register __declspec(align(16)) auto const_255 = _mm_set_ps(255.0, 255.0, 255.0, 255.0);

	//3.
	__declspec(align(16)) __m128 filter_l[FILTER_SIZE];
#pragma omp parallel for
	for (auto i = 0; i < FILTER_SIZE; i++)
	{
		//mind a 4 floatba ugyanazt tölti
		// float -> m128 konverzió
		filter_l[i] = _mm_load_ps1(filter + i);
	}
	const auto rw_base = (imgFOfssetW + imgFOfssetH * imgWidthF) << 2;
	const auto imgWidthbyte = imgWidth << 2;
	const auto imgWidthFbyte = imgWidthF << 2;
	const auto imgLengthbyte = imgHeight * imgWidthbyte;
	//4.
	register __declspec(align(16)) __m128 a_sse;
	//8. reg
	register __declspec(align(16)) __m128 r_sse;

#pragma omp parallel for
	for (auto row = 0; row < imgLengthbyte; row += 4)
	{
		// RGBA komponensek akkumulátora
		r_sse = _mm_setzero_ps();
		// konvolúció minden komponensre
		for (auto y = 0; y < FILTER_H; y++ )
		{		
			r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (y * imgWidthFbyte)), filter_l[5 * y]));
			r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (4 + y * imgWidthFbyte)), filter_l[1 + 5 * y]));
			r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (8 + y * imgWidthFbyte)), filter_l[2 + 5 * y]));
			r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (12 + y * imgWidthFbyte)), filter_l[3 + 5 * y]));
			r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (16 + y * imgWidthFbyte)), filter_l[4 + 5 * y]));
		}
			
		a_sse = _mm_load_ps(imgFloatSrc + row + 8 + 2 * imgWidthFbyte);
		//számítás eredményének limitálása 0-255 közé
		// kimenetí pixel írása
		_mm_store_ps(imgFloatDst + rw_base + row, _mm_min_ps(const_255, _mm_add_ps(a_sse, _mm_max_ps(const_0, _mm_sub_ps(a_sse, _mm_min_ps(const_255, _mm_max_ps(const_0, r_sse)))))));
	}
}
示例#2
0
/*
 * Walks up to 100000 floats of `a` four at a time, accumulating
 * _mm_pow2_ps(sqrt(x), x) per lane, and returns the sum of the four lane
 * totals. The walk stops early as soon as _mm_compare_ps(x, 0) is non-zero.
 *
 * `a` must be 16-byte aligned (aligned loads).
 * NOTE(review): _mm_pow2_ps and _mm_compare_ps are not standard intrinsics;
 * their exact semantics come from elsewhere in the project -- confirm.
 */
float sumar(float *a){
    float lanes[4] __attribute__((aligned(16)));
    __m128 total = _mm_set1_ps(0); /* per-lane running sums, start at zero */
    __m128 chunk;
    __m128 term;
    int pos = 0;

    while (pos < 100000) {
        chunk = _mm_load_ps(&a[pos]);
        /* square roots of the four floats, computed in parallel */
        term = _mm_sqrt_ps(chunk);
        /* the project-specific pow2 helper was chosen for precision */
        term = _mm_pow2_ps(term, chunk);
        /* leave the loop once the array has no more valid values */
        if (_mm_compare_ps(chunk, _mm_set1_ps(0))) {
            break;
        }
        total = _mm_add_ps(total, term);
        pos += 4;
    }

    _mm_store_ps(lanes, total);
    return lanes[0]+lanes[1]+lanes[2]+lanes[3];
}
/*
 * aOutput[i] += aInput[i] * aScale for i in [0, aSize), processed in groups
 * of sixteen floats (four SSE vectors).
 *
 * Both buffers are checked with ASSERT_ALIGNED16 and aSize with
 * ASSERT_MULTIPLE16 before any work is done.
 */
void AudioBufferAddWithScale_SSE(const float* aInput, float aScale,
                                 float* aOutput, uint32_t aSize) {
  enum { kVectorsPerGroup = 4, kFloatsPerVector = 4 };
  __m128 scaled[kVectorsPerGroup];
  __m128 mixed[kVectorsPerGroup];
  __m128 gain;
  unsigned v;

  ASSERT_ALIGNED16(aInput);
  ASSERT_ALIGNED16(aOutput);
  ASSERT_MULTIPLE16(aSize);

  gain = _mm_load1_ps(&aScale);

  for (unsigned base = 0; base < aSize;
       base += kVectorsPerGroup * kFloatsPerVector) {
    // All loads of a group happen before any store of that group.
    for (v = 0; v < kVectorsPerGroup; ++v) {
      scaled[v] = _mm_mul_ps(
          _mm_load_ps(&aInput[base + v * kFloatsPerVector]), gain);
    }
    for (v = 0; v < kVectorsPerGroup; ++v) {
      mixed[v] = _mm_add_ps(
          _mm_load_ps(&aOutput[base + v * kFloatsPerVector]), scaled[v]);
    }
    for (v = 0; v < kVectorsPerGroup; ++v) {
      _mm_store_ps(&aOutput[base + v * kFloatsPerVector], mixed[v]);
    }
  }
}
示例#4
0
文件: simd.c 项目: krakjoe/SIMD
/*
 * Float32x4::__construct -- takes four PHP doubles, narrows each to float and
 * stores them as one __m128 in freshly allocated, 16-byte aligned memory
 * owned by the object (p->v).
 *
 * Throws (via zend_parse_parameters_throw) when the arguments are not four
 * numbers, and php_float32x4_exception_ce when the aligned allocation fails.
 */
PHP_METHOD(Float32x4, __construct) {
	double lanes[4] = php_float32x4_empty;
	float  flanes[4] = php_float32x4_empty;
	php_float32x4_t *p = php_float32x4_fetch();
	
	if (zend_parse_parameters_throw(ZEND_NUM_ARGS(), "dddd", &lanes[0], &lanes[1], &lanes[2], &lanes[3]) != SUCCESS) {
		return;
	}
	
	flanes[0] = (float) lanes[0];
	flanes[1] = (float) lanes[1];
	flanes[2] = (float) lanes[2];
	flanes[3] = (float) lanes[3];
	
	if (posix_memalign(
		(void**) &p->v, 16, sizeof(__m128)) != SUCCESS) {
		zend_throw_exception_ex(php_float32x4_exception_ce, 0, "memory alignment error");
		/* p->v is not usable after a failed allocation: do not store through it. */
		return;
	}
	
	/* flanes is an ordinary stack array with no 16-byte alignment guarantee,
	 * so it must be read with the unaligned load. */
	*p->v = _mm_loadu_ps (flanes);
}
示例#5
0
/*
 * Scales row `vm` of matrix `v` in place so that its Euclidean (L2) norm
 * becomes 1. A row whose squared norm is not positive is left untouched.
 *
 * With NV_ENABLE_SSE2 the row is processed four floats at a time using
 * aligned loads/stores, followed by a scalar loop for the last n % 4
 * elements; otherwise the work is delegated to nv_vector_norm and
 * nv_vector_muls.
 */
void 
nv_vector_normalize_L2(nv_matrix_t *v, int vm)
{
#if NV_ENABLE_SSE2
	{
		const int i_lp = (v->n & 0xfffffffc);
		__m128 x, u;
		NV_ALIGNED(float, mm[4], 16);
		int i;
		float dp;
		float scale;
		
		/* squared norm: four partial sums, then the scalar remainder */
		u = _mm_setzero_ps();
		for (i = 0; i < i_lp; i += 4) {
			x = _mm_load_ps(&NV_MAT_V(v, vm, i));
			u = _mm_add_ps(u, _mm_mul_ps(x, x));
		}
		_mm_store_ps(mm, u);
		dp = mm[0] + mm[1] + mm[2] + mm[3];
		for (i = i_lp; i < v->n; ++i) {
			dp += NV_MAT_V(v, vm, i) * NV_MAT_V(v, vm, i);
		}
		if (dp > 0.0f) {
			scale = 1.0f / sqrtf(dp);
			x = _mm_set1_ps(scale);
			for (i = 0; i < i_lp; i += 4) {
				_mm_store_ps(&NV_MAT_V(v, vm, i),
							 _mm_mul_ps(*(const __m128*)&NV_MAT_V(v, vm, i), x));
			}
			/* The tail must use the same factor as the vector part;
			 * multiplying by dp (the squared norm) would corrupt the
			 * last n % 4 elements. */
			for (i = i_lp; i < v->n; ++i) {
				NV_MAT_V(v, vm, i) *= scale;
			}
		}
	}
#else
	float norm = nv_vector_norm(v, vm);
	if (norm > 0.0f) {
		float scale = 1.0f / norm;
		nv_vector_muls(v, vm, v, vm, scale);
	}
#endif
}
示例#6
0
// Evaluates the cosine series  sum_{i=0}^{nCoeffs-1} coeffs[i] * cos(i * phi).
//
// Two implementations are selected at compile time by FOURIER_SCALAR:
//  - scalar: steps cos(i*phi) with the recurrence
//        cos((i+1)phi) = 2 cos(phi) cos(i phi) - cos((i-1)phi)
//    in double precision;
//  - vector: handles coeffs[0] separately and then consumes four
//    coefficients per iteration starting at index 1.
//
// NOTE(review): the vector path uses the aligned load _mm_load_ps on
// coeffs + i with i = 1, 5, 9, ..., so it presumably expects coeffs + 1 to be
// 16-byte aligned -- confirm against the allocation site.
// NOTE(review): when (nCoeffs - 1) is not a multiple of 4 the last iteration
// reads past coeffs[nCoeffs - 1]; presumably the buffer is padded (with
// zeros) -- confirm.
Float evalFourier(const float *coeffs, size_t nCoeffs, Float phi) {
    #if FOURIER_SCALAR == 1
        // cosPhi_cur starts at cos(0) = 1; cosPhi_prev starts at cos(-phi) = cos(phi)
        double cosPhi      = std::cos((double) phi),
               cosPhi_prev = cosPhi,
               cosPhi_cur  = 1.0,
               value       = 0.0;

        for (size_t i=0; i<nCoeffs; ++i) {
            value += coeffs[i] * cosPhi_cur;

            // advance the recurrence by one multiple of phi
            double cosPhi_next = 2.0*cosPhi*cosPhi_cur - cosPhi_prev;
            cosPhi_prev = cosPhi_cur; cosPhi_cur = cosPhi_next;
        }

        return (Float) value;
    #else
        double cosPhi = std::cos((double) phi);

        // value starts out holding coeffs[0] (the i = 0 term); the loop below
        // therefore begins at i = 1
        __m256d
            cosPhi_prev = _mm256_set1_pd(cosPhi),
            cosPhi_cur  = _mm256_set1_pd(1.0),
            value       = _mm256_set_sd((double) coeffs[0]),
            factorPhi_prev, factorPhi_cur;

        // fills the two factor vectors used to produce four consecutive
        // cosine terms from the previous two (helper defined elsewhere)
        initializeRecurrence(cosPhi, factorPhi_prev, factorPhi_cur);

        for (size_t i=1; i<nCoeffs; i+=4) {
            // four float coefficients widened to double
            __m256d coeff = _mm256_cvtps_pd(_mm_load_ps(coeffs+i));

            __m256d cosPhi_next = _mm256_add_pd(_mm256_mul_pd(factorPhi_prev, cosPhi_prev),
                    _mm256_mul_pd(factorPhi_cur,  cosPhi_cur));
            value = _mm256_add_pd(value, _mm256_mul_pd(cosPhi_next, coeff));
            // presumably broadcasts lanes 2 and 3 of cosPhi_next as the new
            // "previous" and "current" terms -- confirm helper semantics
            cosPhi_prev = _mm256_splat2_pd(cosPhi_next);
            cosPhi_cur = _mm256_splat3_pd(cosPhi_next);
        }

        // horizontal sum of the four partial sums
        return (Float) simd::hadd(value);
    #endif
}
示例#7
0
/*
 * Returns 1 when row j1 of vec1 and row j2 of vec2 hold exactly the same
 * floats, 0 as soon as one element differs. Both matrices must have the
 * same column count n (asserted).
 *
 * With NV_ENABLE_SSE2 the leading multiple-of-four part is compared four
 * lanes at a time using aligned loads; the remaining elements (or all of
 * them without SSE2) go through the scalar loop.
 */
int
nv_vector_eq(const nv_matrix_t *vec1, int j1, const nv_matrix_t *vec2, int j2)
{
	int i = 0;

	NV_ASSERT(vec1->n == vec2->n);

#if NV_ENABLE_SSE2
	{
		const int packed_end = (vec1->n & 0xfffffffc);

		for (; i < packed_end; i += 4) {
			const __m128 rhs = _mm_load_ps(&NV_MAT_V(vec2, j2, i));
			const __m128 lhs = _mm_load_ps(&NV_MAT_V(vec1, j1, i));

			/* any set mask bit means at least one lane differs */
			if (_mm_movemask_ps(_mm_cmpneq_ps(rhs, lhs)) != 0) {
				return 0;
			}
		}
	}
#endif
	for (; i < vec1->n; ++i) {
		if (NV_MAT_V(vec1, j1, i) != NV_MAT_V(vec2, j2, i)) {
			return 0;
		}
	}
	return 1;
}
示例#8
0
文件: Play.cpp 项目: zhangce/nn
/*
 * Returns the sum of the first _n floats of `a`.
 *
 * The largest multiple-of-four prefix is summed four lanes at a time, the
 * four lane totals are then combined as (l0 + l1) + (l2 + l3), and the
 * remaining _n % 4 elements are added one by one.
 *
 * `a` must be 16-byte aligned (asserted). A non-positive _n yields 0.
 */
float vsum(const float *a, int _n)
{
    /* Vector part covers a multiple of FOUR elements (one SSE register). */
    const int n = _n - _n % 4;
    __m128 acc = _mm_setzero_ps();
    __m128 swapped;
    float sum;
    int i;

    assert(((uintptr_t)a & 15) == 0);

    for (i = 0; i < n; i += 4)
    {
        acc = _mm_add_ps(acc, _mm_load_ps(&a[i]));
    }

    /* Horizontal reduction using SSE1 only; the pairing matches two rounds
     * of horizontal add: lane0 = (l0 + l1) + (l2 + l3). */
    swapped = _mm_shuffle_ps(acc, acc, _MM_SHUFFLE(2, 3, 0, 1));
    acc = _mm_add_ps(acc, swapped);
    acc = _mm_add_ss(acc, _mm_movehl_ps(acc, acc));
    _mm_store_ss(&sum, acc);

    /* scalar remainder */
    for (i = n; i < _n; i++)
    {
        sum += a[i];
    }

    return sum;
}
示例#9
0
// For every sample, four at a time: a sample is "bad" when its mask entry is
// set or its value is not finite. Good samples are copied to outputImage
// with weight 1 in weightsOutput; bad samples produce 0 in both.
//
// Rows are read and written with aligned 4-float accesses and always in
// whole groups of four, i.e. up to the next multiple of four at or past the
// image width.
void HighPassFilter::setFlaggedValuesToZeroAndMakeWeightsSSE(const Image2DCPtr &inputImage, const Image2DPtr &outputImage, const Mask2DCPtr &inputMask, const Image2DPtr &weightsOutput)
{
	const size_t width = inputImage->Width();
	const __m128i allZeroInts = _mm_setzero_si128();
	const __m128 allOnes = _mm_set1_ps(1.0f);

	for(size_t y=0;y<inputImage->Height();++y)
	{
		const bool *flags = inputMask->ValuePtr(0, y);
		const float *in = inputImage->ValuePtr(0, y);
		float *out = outputImage->ValuePtr(0, y);
		float *weights = weightsOutput->ValuePtr(0, y);

		for(size_t x=0;x<width;x+=4)
		{
			// One integer per sample: non-zero when the sample must be dropped.
			const int bad0 = flags[x]   || !isfinite(in[x]);
			const int bad1 = flags[x+1] || !isfinite(in[x+1]);
			const int bad2 = flags[x+2] || !isfinite(in[x+2]);
			const int bad3 = flags[x+3] || !isfinite(in[x+3]);

			// Comparing with zero turns "good" (0) into an all-ones lane and
			// "bad" into an all-zeros lane.
			const __m128 keep = _mm_castsi128_ps(
				_mm_cmpeq_epi32(_mm_set_epi32(bad3, bad2, bad1, bad0), allZeroInts));

			// AND with the lane mask selects the value for good samples and
			// leaves +0.0f for bad ones.
			_mm_store_ps(weights + x, _mm_and_ps(keep, allOnes));
			_mm_store_ps(out + x, _mm_and_ps(keep, _mm_load_ps(in + x)));
		}
	}
}
示例#10
0
/*
 * Converts `samples` floats from gamma 2.2 to linear, src -> dst, and
 * returns the number of samples requested.
 *
 * While more than four samples remain they are converted four at a time
 * with gamma_2_2_to_linear_sse2 (aligned accesses when both pointers are
 * 16-byte aligned, unaligned accesses otherwise); whatever is left is
 * converted one value at a time with babl_gamma_2_2_to_linear.
 */
static inline long
conv_yF_gamma_yF_linear (const float *src, float *dst, long samples)
{
  const long total = samples;
  const int both_aligned =
    (((uintptr_t)src % 16) + ((uintptr_t)dst % 16) == 0);

  if (both_aligned)
    {
      for (; samples > 4; samples -= 4, src += 4, dst += 4)
        {
          __v4sf rgba0 = _mm_load_ps (src);
          rgba0 = gamma_2_2_to_linear_sse2 (rgba0);
          _mm_store_ps (dst, rgba0);
        }
    }
  else
    {
      for (; samples > 4; samples -= 4, src += 4, dst += 4)
        {
          __v4sf rgba0 = _mm_loadu_ps (src);
          rgba0 = gamma_2_2_to_linear_sse2 (rgba0);
          _mm_storeu_ps (dst, rgba0);
        }
    }

  /* scalar remainder (between one and four samples when samples > 0) */
  for (; samples != 0; samples--)
    {
      *dst++ = babl_gamma_2_2_to_linear (*src++);
    }

  return total;
}
示例#11
0
// Element-wise power: dest[i] = src[i] ^ a for a CV_32F matrix.
//
// dest is (re)allocated as CV_32F with the size of src; Mat::create is a
// no-op when dest already has that size and type, so in-place use and
// correctly pre-allocated outputs keep their buffer.
//
// Four elements at a time go through _mm_pow_ps with aligned loads/stores,
// the remaining (< 4) elements through cv::pow.
//
// NOTE(review): the data is walked as one flat array starting at row 0, so
// both matrices are presumably expected to be continuous and 16-byte
// aligned -- confirm for ROI/sub-matrix callers.
void pow_fmath(const Mat& src, const float a, Mat & dest)
{
    // Always make sure dest matches src: writing src.size().area() floats
    // into a smaller or differently typed dest would run past its buffer.
    dest.create(src.size(), CV_32F);

    const int size = src.size().area();
    const float* s = src.ptr<float>(0);
    float* d = dest.ptr<float>(0);
    const __m128 ma = _mm_set1_ps(a);

    int i = 0;
    for (; i <= size - 4; i += 4)
    {
        _mm_store_ps(d + i, _mm_pow_ps(_mm_load_ps(s + i), ma));
    }
    for (; i < size; i++)
    {
        d[i] = cv::pow(s[i], a);
    }
}
示例#12
0
/*
 * Adds one pixel to an interleaved histogram.
 *
 * The four floats at `pixel` (16-byte aligned, asserted) are multiplied by
 * histogram_params->mul, limited to [0, bins_count - 1] and converted to
 * integer bin indices. For each of the first three components c the
 * counter histogram[4 * bin + c] is incremented; the fourth component is
 * not counted.
 */
inline static void histogram_helper_cs_rgb_helper_process_pixel_m128(
    const dt_dev_histogram_collection_params_t *const histogram_params, const float *pixel, uint32_t *histogram)
{
  const __m128 gain = _mm_set1_ps(histogram_params->mul);
  const __m128 lowest = _mm_setzero_ps();
  const __m128 highest = _mm_set1_ps(histogram_params->bins_count - 1);

  assert(dt_is_aligned(pixel, 16));

  const __m128 scaled = _mm_mul_ps(_mm_load_ps(pixel), gain);
  const __m128i bins = _mm_cvtps_epi32(_mm_max_ps(_mm_min_ps(scaled, highest), lowest));

  /* spill the four indices so they can be used as array subscripts */
  uint32_t bin[4] __attribute__((aligned(16)));
  _mm_store_si128((__m128i *)bin, bins);

  for(int c = 0; c < 3; c++)
  {
    histogram[4 * bin[c] + c]++;
  }
}
示例#13
0
/*
 * Multiplies the first totalFloats values of buffer by mulVal, in place.
 *
 * When buffer is 16-byte aligned the leading multiple-of-four part is
 * scaled with SSE; everything else (the remainder, or the whole buffer when
 * it is not aligned) is scaled one float at a time.
 */
void MultiplyAudioBuffer(float *buffer, int totalFloats, float mulVal)
{
    float *tail = buffer;
    int tailCount = totalFloats;

    if((UPARAM(buffer) & 0xF) == 0)
    {
        const UINT vectorFloats = totalFloats & 0xFFFFFFFC;
        const __m128 factor = _mm_set_ps1(mulVal);

        for(UINT pos=0; pos<vectorFloats; pos += 4)
        {
            _mm_store_ps(buffer+pos, _mm_mul_ps(_mm_load_ps(buffer+pos), factor));
        }

        tail      += vectorFloats;
        tailCount -= vectorFloats;
    }

    for(int k=0; k<tailCount; k++)
    {
        tail[k] *= mulVal;
    }
}
示例#14
0
/*
 * Smoke test for the alignment of heap_new(): fills a 16 MiB block with
 * 16-, 32- and 64-bit stores and then copies it onto itself with aligned
 * 128-bit SSE loads/stores, which require every 16-byte step of the block
 * to be 16-byte aligned.
 *
 * Returns 1 when the allocation fails, 0 after all passes completed.
 */
int main() {
	const uint32_t len = 16 * 1024 * 1024;
	uint32_t off;
	__m128 v;
	uint8_t* p;

	p = (uint8_t*)heap_new(len);
	if (!p) {
		printf("out of memory!\n");
		return 1;
	}

	printf("Test 16bit stores\n");
	for (off = 0; off < len; off += 2) {
		*(uint16_t*)&p[off] = off & 0xffff;
	}

	printf("Test 32bit stores\n");
	for (off = 0; off < len; off += 4) {
		*(uint32_t*)&p[off] = off;
	}

	printf("Test 64bit stores\n");
	for (off = 0; off < len; off += 8) {
		*(uint64_t*)&p[off] = off;
	}

	printf("Test 128bit loads/stores\n");
	for (off = 0; off < len; off += 16) {
		v = _mm_load_ps((float*)(p + off));
		_mm_store_ps((float*)(p + off), v);
	}

	heap_delete(p);

	printf("Got no unaligned addr exception!\n");
	return 0;
}
示例#15
0
// calculate 2^n for each input sample.
// 
void MLProcExp2::process(const int frames)
{
	static const ml::Symbol preciseSym("precise");
	const MLSignal& x1 = getInput(1);
	MLSignal& y1 = getOutput();
	
	if (mParamsChanged) 
	{
		mPrecise = getParam(preciseSym);
		mParamsChanged = false;
	}	

	if(mPrecise) // scalar code
	{
		for (int n=0; n<frames; ++n)
		{
			y1[n] = pow(2.f, x1[n]);
		}
	}
	else 
	{
		const MLSample* px1 = x1.getConstBuffer();
		MLSample* py1 = y1.getBuffer();
			
		int c = frames >> kMLSamplesPerSSEVectorBits;
		__m128 vx1, vr; 	
		
		for (int n = 0; n < c; ++n)
		{
			vx1 = _mm_load_ps(px1);		
			vr = exp2Approx4(vx1);
			_mm_store_ps(py1, vr);
			px1 += kSSEVecSize;
			py1 += kSSEVecSize;
		}
	}
}
示例#16
0
/*
 * Exposure: out = (in - black) * 1/(white - black) on every pixel of the
 * output region, with white derived from d->exposure.
 *
 * In deflicker mode commit_params_late() is run first. After the pixel
 * pass the alpha channel is copied when the pipe displays masks, the first
 * three entries of processed_maximum are scaled by the same factor and,
 * for the GUI preview pipe, the exposure used is stored in the GUI data.
 *
 * Pixels are read and written as aligned groups of four floats.
 * NOTE(review): the row offset uses piece->colors while the pixel step is
 * fixed at 4 floats, so this presumably expects 4-channel buffers --
 * confirm.
 */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *i, void *o, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  dt_iop_exposure_data_t *d = (dt_iop_exposure_data_t *)piece->data;
  dt_iop_exposure_gui_data_t *g = (dt_iop_exposure_gui_data_t *)self->gui_data;

  if(d->mode == EXPOSURE_MODE_DEFLICKER)
    commit_params_late(self, piece);

  const float black = d->black;
  const float white = exposure2white(d->exposure);
  const int ch = piece->colors;
  const float scale = 1.0/(white - black);
  const __m128 blackv = _mm_set1_ps(black);
  const __m128 scalev = _mm_set1_ps(scale);
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(roi_out,i,o) schedule(static)
#endif
  for(int k=0; k<roi_out->height; k++)
  {
    const size_t row_start = (size_t)ch*k*roi_out->width;
    const float *in_row = ((float *)i) + row_start;
    float *out_row = ((float *)o) + row_start;
    for (int j=0; j<roi_out->width; j++)
    {
      const __m128 shifted = _mm_sub_ps(_mm_load_ps(in_row + 4*j), blackv);
      _mm_store_ps(out_row + 4*j, _mm_mul_ps(shifted, scalev));
    }
  }

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(i, o, roi_out->width, roi_out->height);

  for(int c=0; c<3; c++)
    piece->pipe->processed_maximum[c] *= scale;

  if(g != NULL && self->dev->gui_attached && piece->pipe->type == DT_DEV_PIXELPIPE_PREVIEW)
    g->deflicker_computed_exposure = d->exposure;
}
示例#17
0
int main(int argc, const char * argv[])
{
    ALIGN32 float a1[ 16 ] = {
        0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15
    };
    ALIGN32 float a2[ 16 ] = {
        15, 12, 4, 7, 9, 0, 3, 13, 6, 10, 1, 8, 5, 11, 2, 14
    };
    ALIGN32 float aout[ 16 ];
    
    __m128 x1[ 4 ] = { _mm_load_ps( a1 ), _mm_load_ps( a1 + 4 ), _mm_load_ps( a1 + 8 ), _mm_load_ps( a1 + 12 ) };
    __m128 x2[ 4 ] = { _mm_load_ps( a2 ), _mm_load_ps( a2 + 4 ), _mm_load_ps( a2 + 8 ), _mm_load_ps( a2 + 12 ) };
    __m128 xout[ 4 ];
    
    __m256 y1[ 2 ] = { _mm256_load_ps( a1 ), _mm256_load_ps( a1 + 8 ) };
    __m256 y2[ 2 ] = { _mm256_load_ps( a2 ), _mm256_load_ps( a2 + 8 ) };
    __m256 yout[ 2 ];
    
    std::cout << "FPU Mult" << std::endl;
    mul( a1, a2, aout );
    trace( aout, 4 );
    
    std::cout << "SSE Mult" << std::endl;
    mulX4( x1, x2, xout );
    trace( xout, 4 );
    
    std::cout << "AVX2 Mult" << std::endl;
    mulX8( y1, y2, yout );
    trace( yout, 4 );
    
    std::cout << "FPU Transpose" << std::endl;
    transpose( a1, aout );
    trace( aout, 4 );
    
    std::cout << "SSE Transpose" << std::endl;
    transposeX4( x1, xout );
    trace( xout, 4 );
    
    std::cout << "AVX Transpose" << std::endl;
    transposeX8( y1, yout );
    trace( yout, 4 );
    
    return 0;
}
示例#18
0
/*
 * vec0[m0][n] = vec1[m1][n] + v for every column n.
 * Both matrices must have the same column count (asserted).
 *
 * With NV_ENABLE_SSE2 the leading multiple-of-four part is done four
 * floats at a time using aligned loads/stores; the remaining columns (or
 * all of them without SSE2) are handled by the scalar loop.
 */
void 
nv_vector_adds(nv_matrix_t *vec0, int m0,
			  const nv_matrix_t *vec1, int m1,
			   float v)
{
	int n = 0;

	NV_ASSERT(vec1->n == vec0->n);

#if NV_ENABLE_SSE2
	{
		const int packed_end = (vec1->n & 0xfffffffc);
		const __m128 addend = _mm_set1_ps(v);

		for (; n < packed_end; n += 4) {
			const __m128 x = _mm_load_ps(&NV_MAT_V(vec1, m1, n));
			_mm_store_ps(&NV_MAT_V(vec0, m0, n), _mm_add_ps(x, addend));
		}
	}
#endif
	for (; n < vec1->n; ++n) {
		NV_MAT_V(vec0, m0, n) = NV_MAT_V(vec1, m1, n) + v;
	}
}
    // Velocity update of one particle container against one neighbor
    // container, four particles of oldSelf per SSE vector.
    //
    // For every neighbor particle j and every group i of four own particles:
    //     delta  = oldSelf.pos - neighbor.pos[j]
    //     dist2  = FORCE_OFFSET + |delta|^2
    //     force  = approximate 1/sqrt(dist2)   (_mm_rsqrt_ps)
    //     target->vel[i..i+3] = oldSelf.vel[i..i+3] + force * delta
    //
    // All position/velocity arrays are accessed with aligned loads/stores.
    //
    // NOTE(review): every j iteration reads the velocity from oldSelf and
    // stores to the same target->vel slots, so contributions only accumulate
    // over j if target aliases oldSelf -- confirm the intended aliasing.
    // NOTE(review): with the OpenMP pragma active, different j iterations
    // write the same target->vel addresses from different threads; this
    // looks like a data race -- confirm.
    void operator()(CONTAINER *target, const CONTAINER& oldSelf, const CONTAINER& neighbor)
    {
        __m128 forceOffset = _mm_set1_ps(FORCE_OFFSET);
#ifndef NO_OMP
#pragma omp parallel for schedule(static)
#endif
        for (int j = 0; j < CONTAINER_SIZE; ++j) {
            // broadcast the j-th neighbor position to all four lanes
            __m128 neighborPosX = _mm_set1_ps(neighbor.posX[j]);
            __m128 neighborPosY = _mm_set1_ps(neighbor.posY[j]);
            __m128 neighborPosZ = _mm_set1_ps(neighbor.posZ[j]);

            for (int i = 0; i < CONTAINER_SIZE; i+=4) {
                __m128 oldSelfPosX = _mm_load_ps(oldSelf.posX + i);
                __m128 oldSelfPosY = _mm_load_ps(oldSelf.posY + i);
                __m128 oldSelfPosZ = _mm_load_ps(oldSelf.posZ + i);
                __m128 myVelX = _mm_load_ps(oldSelf.velX + i);
                __m128 myVelY = _mm_load_ps(oldSelf.velY + i);
                __m128 myVelZ = _mm_load_ps(oldSelf.velZ + i);

                __m128 deltaX = _mm_sub_ps(oldSelfPosX, neighborPosX);
                __m128 deltaY = _mm_sub_ps(oldSelfPosY, neighborPosY);
                __m128 deltaZ = _mm_sub_ps(oldSelfPosZ, neighborPosZ);
                // squared distance plus FORCE_OFFSET
                __m128 dist2 = _mm_add_ps(forceOffset,
                                          _mm_mul_ps(deltaX, deltaX));
                dist2 = _mm_add_ps(dist2,
                                   _mm_mul_ps(deltaY, deltaY));
                dist2 = _mm_add_ps(dist2,
                                   _mm_mul_ps(deltaZ, deltaZ));
                // approximate reciprocal square root
                __m128 force = _mm_rsqrt_ps(dist2);
                myVelX = _mm_add_ps(myVelX, _mm_mul_ps(force, deltaX));
                myVelY = _mm_add_ps(myVelY, _mm_mul_ps(force, deltaY));
                myVelZ = _mm_add_ps(myVelZ, _mm_mul_ps(force, deltaZ));
                
                _mm_store_ps(target->velX + i, myVelX);
                _mm_store_ps(target->velY + i, myVelY);
                _mm_store_ps(target->velZ + i, myVelZ);
            }
        }
    }
// =============================================================================
//
// sse3_vChirpData
// version by: Alex Kan
//   http://tbp.berkeley.edu/~alexkan/seti/
//
/*
 * Multiplies each complex input sample by a unit phasor whose phase grows
 * with the square of the sample index ("chirp"), writing the result to
 * cx_ChirpDataArray. Always returns 0.
 *
 *  - chirp_rate_ind == 0: the input is copied unchanged.
 *  - otherwise: four complex samples per iteration. The per-sample value
 *    i*i * chirp_rate*0.5/sample_rate^2 is computed in double precision,
 *    reduced by subtracting its rounded value, and its sine/cosine are
 *    obtained from polynomials (SS1..SS4, CC1..CC3) followed by two
 *    angle-doubling steps and a magnitude correction. Results are written
 *    with non-temporal stores, followed by a store fence.
 *  - the remaining ul_NumDataPoints % 4 samples go through a scalar loop.
 *
 * Input and output are accessed with aligned 16-byte loads/stores.
 *
 * NOTE(review): the scalar tail computes srate * i * i * 0.5 (srate already
 * contains the 0.5 factor) and passes it straight to sin()/cos(), whereas
 * the vector path reduces the value to (-0.5, 0.5) before its polynomial
 * approximation. The two paths do not look equivalent -- confirm against
 * the reference implementation. The tail is only reached when
 * ul_NumDataPoints is not a multiple of 4.
 */
int sse3_ChirpData_ak(
  sah_complex * cx_DataArray,
  sah_complex * cx_ChirpDataArray,
  int chirp_rate_ind,
  double chirp_rate,
  int  ul_NumDataPoints,
  double sample_rate
) {
  int i;

  // chirp index 0: nothing to apply, plain copy
  if (chirp_rate_ind == 0) {
    memcpy(cx_ChirpDataArray, cx_DataArray,  (int)ul_NumDataPoints * sizeof(sah_complex)  );
    return 0;
  }

  int vEnd;  
  double srate = chirp_rate * 0.5 / (sample_rate * sample_rate);
  __m128d rate = _mm_set1_pd(chirp_rate * 0.5 / (sample_rate * sample_rate));
  // adding and then subtracting this constant is used below to round to an
  // integer; its sign follows the sign of the rate
  __m128d roundVal = _mm_set1_pd(srate >= 0.0 ? TWO_TO_52 : -TWO_TO_52);

  // main vectorised loop
  vEnd = ul_NumDataPoints - (ul_NumDataPoints & 3);
  for (i = 0; i < vEnd; i += 4) {
    const float *data = (const float *) (cx_DataArray + i);
    float *chirped = (float *) (cx_ChirpDataArray + i);
    // sample indices i, i+1 (a1) and i+2, i+3 (a2) as doubles
    __m128d di = _mm_set1_pd(i);
    __m128d a1 = _mm_add_pd(_mm_set_pd(1.0, 0.0), di);
    __m128d a2 = _mm_add_pd(_mm_set_pd(3.0, 2.0), di);

    __m128 d1, d2;
    __m128 cd1, cd2;
    __m128 td1, td2;
    __m128 x;
    __m128 y;
    __m128 s;
    __m128 c;
    __m128 m;

    // load the signal to be chirped
    prefetchnta((const void *)( data+32 ));
    d1 = _mm_load_ps(data);
    d2 = _mm_load_ps(data+4);

    // calculate the input angle
    a1 = _mm_mul_pd(_mm_mul_pd(a1, a1), rate);
    a2 = _mm_mul_pd(_mm_mul_pd(a2, a2), rate);

    // reduce the angle to the range (-0.5, 0.5)
    a1 = _mm_sub_pd(a1, _mm_sub_pd(_mm_add_pd(a1, roundVal), roundVal));
    a2 = _mm_sub_pd(a2, _mm_sub_pd(_mm_add_pd(a2, roundVal), roundVal));

    // convert pair of packed double into packed single
    x = _mm_movelh_ps(_mm_cvtpd_ps(a1), _mm_cvtpd_ps(a2));

    // square to the range [0, 0.25)
    y = _mm_mul_ps(x, x);

    // perform the initial polynomial approximations
    s = _mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, SS4),
                                    SS3),
                                y),
                          SS2),
                    y),
              SS1),
          x);
    c = _mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(_mm_add_ps(_mm_mul_ps(y, CC3),
                                CC2),
                          y),
                    CC1),
              y),
          ONE);

    // perform first angle doubling
    x = _mm_sub_ps(_mm_mul_ps(c, c), _mm_mul_ps(s, s));
    y = _mm_mul_ps(_mm_mul_ps(s, c), TWO);

    // calculate scaling factor to correct the magnitude
    //      m1 = vec_nmsub(y1, y1, vec_nmsub(x1, x1, TWO));
    //      m2 = vec_nmsub(y2, y2, vec_nmsub(x2, x2, TWO));
    m = vec_recip3(_mm_add_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y)));

    // perform second angle doubling
    c = _mm_sub_ps(_mm_mul_ps(x, x), _mm_mul_ps(y, y));
    s = _mm_mul_ps(_mm_mul_ps(y, x), TWO);

    // correct the magnitude (final sine / cosine approximations)
    s = _mm_mul_ps(s, m);
    c = _mm_mul_ps(c, m);

    // chirp the data
    // complex multiply (re + j*im) * (c + j*s): duplicate c and s per
    // sample, swap re/im of the data, then addsub combines the products
    cd1 = _mm_shuffle_ps(c, c, 0x50);
    cd2 = _mm_shuffle_ps(c, c, 0xfa);
    cd1 = _mm_mul_ps(cd1, d1);
    cd2 = _mm_mul_ps(cd2, d2);
    d1 = _mm_shuffle_ps(d1, d1, 0xb1);
    d2 = _mm_shuffle_ps(d2, d2, 0xb1);
    td1 = _mm_shuffle_ps(s, s, 0x50);
    td2 = _mm_shuffle_ps(s, s, 0xfa);
    td1 = _mm_mul_ps(td1, d1);
    td2 = _mm_mul_ps(td2, d2);
    cd1 = _mm_addsub_ps(cd1, td1);
    cd2 = _mm_addsub_ps(cd2, td2);

    // store chirped values
    _mm_stream_ps(chirped, cd1);
    _mm_stream_ps(chirped+4, cd2);
  }
  // order the non-temporal stores before anything that follows
  _mm_sfence();

  // handle tail elements with scalar code
  for (   ; i < ul_NumDataPoints; ++i) {
    double angle = srate * i * i * 0.5;
    double s = sin(angle);
    double c = cos(angle);
    float re = cx_DataArray[i][0];
    float im = cx_DataArray[i][1];

    cx_ChirpDataArray[i][0] = re * c - im * s;
    cx_ChirpDataArray[i][1] = re * s + im * c;
  }
  // bookkeeping: 12 floating point operations are accounted per data point
  analysis_state.FLOP_counter+=12.0*ul_NumDataPoints;

  return 0;
}
void alphaBlend(const Mat& src1, const Mat& src2, const Mat& alpha,Mat& dest)
{
	int T;
	Mat s1,s2;
	if(src1.channels()<=src2.channels())T=src2.type();
	else T=src1.type();
	if(dest.empty())dest=Mat::zeros(src1.size(),T);
	if(src1.channels()==src2.channels())
	{
		s1=src1;
		s2=src2;
	}
	else if(src2.channels()==3)
	{
		cvtColor(src1,s1,CV_GRAY2BGR);
		s2=src2;
	}
	else
	{
		cvtColor(src2,s2,CV_GRAY2BGR);
		s1=src1;
	}
	Mat a;
	if(alpha.type()==CV_8U)
		alpha.convertTo(a,CV_32F,1.0/255.0);
	else if(alpha.type()==CV_32F || alpha.type()==CV_64F)
		alpha.convertTo(a,CV_32F);

	if(dest.channels()==3)
	{
		vector<Mat> ss1,ss2;
		vector<Mat> ss1f(3),ss2f(3);
		split(s1,ss1);
		split(s2,ss2);	
		for(int c=0;c<3;c++)
		{
			ss1[c].convertTo(ss1f[c],CV_32F);
			ss2[c].convertTo(ss2f[c],CV_32F);
		}
		{
			float* s1r = ss1f[0].ptr<float>(0);
			float* s2r = ss2f[0].ptr<float>(0);

			float* s1g = ss1f[1].ptr<float>(0);
			float* s2g = ss2f[1].ptr<float>(0);

			float* s1b = ss1f[2].ptr<float>(0);
			float* s2b = ss2f[2].ptr<float>(0);


			float* al = a.ptr<float>(0);
			const int size = src1.size().area()/4;

			const __m128 ones = _mm_set1_ps(1.0f);

			for(int i=size;i--;)
			{
				const __m128 msa = _mm_load_ps(al);
				const __m128 imsa = _mm_sub_ps(ones,msa);
				__m128 ms1 = _mm_load_ps(s1r);
				__m128 ms2 = _mm_load_ps(s2r);
				ms1 = _mm_mul_ps(ms1,msa);
				ms2 = _mm_mul_ps(ms2,imsa);
				ms1 = _mm_add_ps(ms1,ms2);
				_mm_store_ps(s1r,ms1);//store ss1f

				ms1 = _mm_load_ps(s1g);
				ms2 = _mm_load_ps(s2g);
				ms1 = _mm_mul_ps(ms1,msa);
				ms2 = _mm_mul_ps(ms2,imsa);
				ms1 = _mm_add_ps(ms1,ms2);
				_mm_store_ps(s1g,ms1);//store ss1f

				ms1 = _mm_load_ps(s1b);
				ms2 = _mm_load_ps(s2b);
				ms1 = _mm_mul_ps(ms1,msa);
				ms2 = _mm_mul_ps(ms2,imsa);
				ms1 = _mm_add_ps(ms1,ms2);
				_mm_store_ps(s1b,ms1);//store ss1f

				al+=4,s1r+=4,s2r+=4,s1g+=4,s2g+=4,s1b+=4,s2b+=4;
			}
			for(int c=0;c<3;c++)
			{
				ss1f[c].convertTo(ss1[c],CV_8U);
			}
			merge(ss1,dest);
		}
	}
	else if(dest.channels()==1)
	{
		Mat ss1f,ss2f;
		s1.convertTo(ss1f,CV_32F);
		s2.convertTo(ss2f,CV_32F);
		{
			float* s1r = ss1f.ptr<float>(0);
			float* s2r = ss2f.ptr<float>(0);
			float* al = a.ptr<float>(0);
			const int size = src1.size().area()/4;
			const int nn = src1.size().area() - size*4;
			const __m128 ones = _mm_set1_ps(1.0f);
			for(int i=size;i--;)
			{
				const __m128 msa = _mm_load_ps(al);
				const __m128 imsa = _mm_sub_ps(ones,msa);
				__m128 ms1 = _mm_load_ps(s1r);
				__m128 ms2 = _mm_load_ps(s2r);
				ms1 = _mm_mul_ps(ms1,msa);
				ms2 = _mm_mul_ps(ms2,imsa);
				ms1 = _mm_add_ps(ms1,ms2);
				_mm_store_ps(s1r,ms1);//store ss1f

				al+=4,s1r+=4,s2r+=4;
			}
			for(int i=nn;i--;)
			{
				*s1r = *al * *s1r + (1.0f-*al)* *s2r;
				al++,s1r++,s2r++;
			}
			ss1f.convertTo(dest,CV_8U);
		}
	}
}
示例#22
0
/*
 * Zone system: remaps the first channel of every pixel through a piecewise
 * linear zone map and scales the whole pixel by the resulting factor.
 *
 * Steps:
 *  1. For the GUI preview pipe, (re)allocate g->preview_buffer under
 *     g->lock and record the preview dimensions.
 *  2. Compute the zone map from the module data.
 *  3. When a preview buffer exists: blur channel 0 of the input with a
 *     Gaussian kernel into the output buffer, turn the blurred values into
 *     zone indexes stored in the preview buffer, then release g->lock.
 *  4. For every output pixel, pick its zone from channel 0, derive a scale
 *     factor and write the input pixel multiplied by it (non-temporal
 *     store); the blurred values from step 3 are overwritten here.
 *  5. Copy alpha when the pipe displays masks.
 *
 * Pixels are accessed as aligned groups of four floats in step 4.
 *
 * NOTE(review): the preview buffer is sized from roi_in but filled with
 * roi_out->width * roi_out->height entries -- confirm the two always match.
 * NOTE(review): g->lock is taken in step 1 but only released inside the
 * `buffer` branch of step 3; if the allocation yields NULL the unlock is
 * skipped -- confirm this cannot happen.
 * NOTE(review): `ch` and `size` are used inside default(none) parallel
 * regions without appearing in a data-sharing clause -- confirm this builds
 * with the targeted compilers.
 */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  float *in;
  float *out;
  dt_iop_zonesystem_gui_data_t *g = NULL;
  dt_iop_zonesystem_data_t *data = (dt_iop_zonesystem_data_t*)piece->data;

  guchar *buffer = NULL;
  if( self->dev->gui_attached && piece->pipe->type == DT_DEV_PIXELPIPE_PREVIEW )
  {
    g = (dt_iop_zonesystem_gui_data_t *)self->gui_data;
    dt_pthread_mutex_lock(&g->lock);
    if(g->preview_buffer)
      g_free (g->preview_buffer);

    /* one byte (zone index) per preview pixel */
    buffer = g->preview_buffer = g_malloc (roi_in->width*roi_in->height);
    g->preview_width=roi_out->width;
    g->preview_height=roi_out->height;
  }

  /* calculate zonemap */
  const int size = data->size;
  float zonemap[MAX_ZONE_SYSTEM_SIZE]= {-1};
  _iop_zonesystem_calculate_zonemap (data, zonemap);
  const int ch = piece->colors;

  /* if gui and have buffer lets gaussblur and fill buffer with zone indexes */
  if( self->dev->gui_attached && g && buffer)
  {
    /* setup gaussian kernel */
    const int radius = 8;
    const int rad = MIN(radius, ceilf(radius * roi_in->scale / piece->iscale));
    const int wd = 2*rad+1;
    float mat[wd*wd];
    float *m;
    const float sigma2 = (2.5*2.5)*(radius*roi_in->scale/piece->iscale)*(radius*roi_in->scale/piece->iscale);
    float weight = 0.0f;

    memset(mat, 0, wd*wd*sizeof(float));

    /* fill the kernel and accumulate its total weight ... */
    m = mat;
    for(int l=-rad; l<=rad; l++) for(int k=-rad; k<=rad; k++,m++)
        weight += *m = expf(- (l*l + k*k)/(2.f*sigma2));
    /* ... then normalize so the kernel sums to 1 */
    m = mat;
    for(int l=-rad; l<=rad; l++) for(int k=-rad; k<=rad; k++,m++)
        *m /= weight;

    /* gauss blur the L channel */
    /* only pixels at least `rad` away from every border are written */
#ifdef _OPENMP
    #pragma omp parallel for default(none) private(in, out, m) shared(mat, ivoid, ovoid, roi_out, roi_in) schedule(static)
#endif
    for(int j=rad; j<roi_out->height-rad; j++)
    {
      in  = ((float *)ivoid) + ch*(j*roi_in->width  + rad);
      out = ((float *)ovoid) + ch*(j*roi_out->width + rad);
      for(int i=rad; i<roi_out->width-rad; i++)
      {
        for(int c=0; c<3; c++) out[c] = 0.0f;
        float sum = 0.0;
        m = mat;
        for(int l=-rad; l<=rad; l++)
        {
          /* first pixel of kernel row l, channel 0 */
          float *inrow = in + ch*(l*roi_in->width-rad);
          for(int k=-rad; k<=rad; k++,inrow+=ch,m++)
            sum += *m * inrow[0];
        }
        out[0] = sum;
        out += ch;
        in += ch;
      }
    }

    /* create zonemap preview */
//     in  = (float *)ivoid;
    out = (float *)ovoid;
#ifdef _OPENMP
    #pragma omp parallel for default(none) shared(roi_out,out,buffer,g,zonemap) schedule(static)
#endif
    for (int k=0; k<roi_out->width*roi_out->height; k++)
    {
      /* channel 0 is divided by 100 before the zone lookup */
      buffer[k] = _iop_zonesystem_zone_index_from_lightness (out[ch*k]/100.0f, zonemap, size);
    }

    dt_pthread_mutex_unlock(&g->lock);
  }

  /* process the image */
  in  = (float *)ivoid;
  out = (float *)ovoid;

  /* maps channel 0 (divided by 100) onto size-1 zone intervals */
  const float rzscale = (size-1)/100.0f;

  float zonemap_offset[MAX_ZONE_SYSTEM_SIZE]= {-1};
  float zonemap_scale[MAX_ZONE_SYSTEM_SIZE]= {-1};

  // precompute scale and offset
  for (int k=0; k < size-1; k++) zonemap_scale[k]  = (zonemap[k+1]-zonemap[k])*(size-1);
  for (int k=0; k < size-1; k++) zonemap_offset[k] = 100.0f * ((k+1)*zonemap[k] - k*zonemap[k+1]) ;

#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(roi_out, in, out, zonemap_scale,zonemap_offset) schedule(static)
#endif
  for (int j=0; j<roi_out->height; j++)
    for (int i=0; i<roi_out->width; i++)
    {
      /* remap lightness into zonemap and apply lightness */
      const float *inp = in + ch*(j*roi_out->width+i);
      float *outp = out + ch*(j*roi_out->width+i);

      const int rz = CLAMPS(inp[0]*rzscale, 0, size-2);  // zone index

      /* per-pixel factor; the offset term is skipped for zone 0 */
      const float zs = ((rz > 0) ? (zonemap_offset[rz]/inp[0]) : 0) + zonemap_scale[rz];

      /* scale all four floats of the pixel, bypassing the cache on write */
      _mm_stream_ps(outp,_mm_mul_ps(_mm_load_ps(inp),_mm_set1_ps(zs)));
    }

  /* make the non-temporal stores visible before continuing */
  _mm_sfence();

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
示例#23
0
/*
 * Radix-2 butterfly pass, SSE version.
 *
 * For every k in [0, l1) and every i in [0, ido) this writes
 *   ch[k*ido + i]          = cc[2*k*ido + i] + cc[2*k*ido + ido + i]
 *   ch[k*ido + i + l1*ido] = (cc[2*k*ido + i] - cc[2*k*ido + ido + i]) * wa[i]
 * where the last product is a complex multiplication.
 *
 * Four complex values are handled per inner iteration (i advances by 4 and
 * there is no scalar tail), and all loads/stores are the aligned variants.
 * NOTE(review): this assumes complex_t is two adjacent floats with RE() first,
 * so that one __m128 covers two complex values, that ido is a multiple of 4,
 * and that the addressed elements are 16-byte aligned - confirm with callers.
 * NOTE(review): ah/ac are uint16_t, so k*ido and 2*k*ido are truncated to
 * 16 bits on assignment - presumably the transform sizes keep them in range.
 */
static void passf2pos_sse_ido(const uint16_t ido, const uint16_t l1, const complex_t *cc,
                              complex_t *ch, const complex_t *wa)
{
    uint16_t i, k, ah, ac;

    for (k = 0; k < l1; k++)
    {
        /* ah: base index into the output, ac: base index into the input */
        ah = k*ido;
        ac = 2*k*ido;

        for (i = 0; i < ido; i+=4)
        {
            /* Two independent streams are interleaved below: m1..m14 work on
             * elements i, i+1 and m5/m6/m15..m24 on elements i+2, i+3. */
            __m128 m1, m2, m3, m4, m5, m6, m7, m8, m9, m10, m11, m12, m13, m14;
            __m128 m15, m16, m17, m18, m19, m20, m21, m22, m23, m24;
            __m128 w1, w2, w3, w4;

            m1 = _mm_load_ps(&RE(cc[ac+i]));
            m2 = _mm_load_ps(&RE(cc[ac+ido+i]));
            m5 = _mm_load_ps(&RE(cc[ac+i+2]));
            m6 = _mm_load_ps(&RE(cc[ac+ido+i+2]));
            w1 = _mm_load_ps(&RE(wa[i]));
            w3 = _mm_load_ps(&RE(wa[i+2]));

            /* butterfly sums */
            m3 = _mm_add_ps(m1, m2);
            m15 = _mm_add_ps(m5, m6);

            /* butterfly differences (multiplied by the twiddles below) */
            m4 = _mm_sub_ps(m1, m2);
            m16 = _mm_sub_ps(m5, m6);

            _mm_store_ps(&RE(ch[ah+i]), m3);
            _mm_store_ps(&RE(ch[ah+i+2]), m15);


            /* w2/w4: twiddles with the two floats of each pair swapped */
            w2 = _mm_shuffle_ps(w1, w1, _MM_SHUFFLE(2, 3, 0, 1));
            w4 = _mm_shuffle_ps(w3, w3, _MM_SHUFFLE(2, 3, 0, 1));

            /* m7  = (d0r*w0r, d0i*w0i, d1r*w1r, d1i*w1i)
             * m8  = (d0r*w0i, d0i*w0r, d1r*w1i, d1i*w1r)   (d = difference) */
            m7 = _mm_mul_ps(m4, w1);
            m17 = _mm_mul_ps(m16, w3);
            m8 = _mm_mul_ps(m4, w2);
            m18 = _mm_mul_ps(m16, w4);

            /* gather even lanes (m9) and odd lanes (m10) of the two products */
            m9  = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(2, 0, 2, 0));
            m19 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(2, 0, 2, 0));
            m10 = _mm_shuffle_ps(m7, m8, _MM_SHUFFLE(3, 1, 3, 1));
            m20 = _mm_shuffle_ps(m17, m18, _MM_SHUFFLE(3, 1, 3, 1));

            /* upper lanes of m11: dr*wi + di*wr; lower lanes of m12: dr*wr - di*wi */
            m11 = _mm_add_ps(m9, m10);
            m21 = _mm_add_ps(m19, m20);
            m12 = _mm_sub_ps(m9, m10);
            m22 = _mm_sub_ps(m19, m20);

            /* bring the two upper lanes of m11 down to lanes 0 and 1 */
            m13 = _mm_shuffle_ps(m11, m11, _MM_SHUFFLE(0, 0, 3, 2));
            m23 = _mm_shuffle_ps(m21, m21, _MM_SHUFFLE(0, 0, 3, 2));

            /* re-interleave into (dr*wr - di*wi, dr*wi + di*wr) pairs */
            m14 = _mm_unpacklo_ps(m12, m13);
            m24 = _mm_unpacklo_ps(m22, m23);

            _mm_store_ps(&RE(ch[ah+i+l1*ido]), m14);
            _mm_store_ps(&RE(ch[ah+i+2+l1*ido]), m24);
        }
    }
}
示例#24
0
// Builds a perceptual hash of a pixmap: the image is scaled to 32x32,
// converted to a weighted grayscale intensity, projected onto the 8x8
// lowest-frequency cosine basis functions, and every resulting coefficient
// contributes one bit (set when the coefficient exceeds the mean of
// coefficients 1..63).
phash phash_for_pixmap(const QPixmap& pixmap) {
  static bool cos_table_initialized = false;

  ALIGN(16, static float cos_table[8][8][32][32]);
  ALIGN(16, float intensity[32][32]);

  // The cosine basis is computed once, on the first call, and then reused.
  if(!cos_table_initialized) {
    cos_table_initialized = true;

    // Only the 8x8 lowest frequencies of the 32x32 transform are tabulated.
    for(int freq_x = 0; freq_x < 8; freq_x++) {
      for(int freq_y = 0; freq_y < 8; freq_y++) {
        for(int row = 0; row < 32; row++) {
          for(int col = 0; col < 32; col++) {
            cos_table[freq_y][freq_x][row][col] = cosf(M_PI / 32.0f * (col + 0.5f) * freq_x)
                                                * cosf(M_PI / 32.0f * (row + 0.5f) * freq_y);
          }
        }
      }
    }
  }

  // Shrink the input to the fixed 32x32 working size.
  QImage small = pixmap.scaled(32, 32, Qt::IgnoreAspectRatio, Qt::SmoothTransformation).toImage();

  // Grayscale conversion; lanes from low to high are blue, green, red, unused.
  const __m128 weights = _mm_set_ps(.0f, 0.2126f, 0.7152f, 0.0722f);

  for(int row = 0; row < 32; row++) {
    for(int col = 0; col < 32; col++) {
      const QRgb rgb = small.pixel(col, row);

      const __m128 channels = _mm_set_ps(0, qRed(rgb), qGreen(rgb), qBlue(rgb));
      const __m128 weighted = _mm_mul_ps(weights, channels);
      // Horizontal sum of the four lanes into lane 0.
      const __m128 pairs = _mm_add_ps(weighted, _mm_movehl_ps(weighted, weighted));
      const __m128 total = _mm_add_ss(pairs, _mm_shuffle_ps(pairs, pairs, 1));
      _mm_store_ss(&intensity[row][col], total);
    }
  }

  // Project the intensity image onto each tabulated basis function.
  float dct[64];

  for(int freq_x = 0; freq_x < 8; freq_x++) {
    for(int freq_y = 0; freq_y < 8; freq_y++) {
      __m128 acc = _mm_setzero_ps();

      for(int row = 0; row < 32; row++) {
        for(int col = 0; col < 32; col += 4) {
          const __m128 pixels = _mm_load_ps(&intensity[row][col]);
          const __m128 basis = _mm_load_ps(&cos_table[freq_y][freq_x][row][col]);
          const __m128 product = _mm_mul_ps(pixels, basis);
          acc = _mm_add_ps(product, acc);
        }
      }

      const __m128 pairs = _mm_add_ps(acc, _mm_movehl_ps(acc, acc));
      const __m128 total = _mm_add_ss(pairs, _mm_shuffle_ps(pairs, pairs, 1));
      // Coefficients are stored with freq_x as the slow index.
      _mm_store_ss(&dct[freq_x * 8 + freq_y], total);
    }
  }

  // Mean of all coefficients except the first one.
  float mean = 0.0;
  for(int idx = 1; idx < 64; idx++) {
    mean += dct[idx];
  }
  mean /= 63;

  // One hash bit per coefficient: set when the coefficient is above the mean.
  phash hash = 0;

  for(int bit = 0; bit < 64; bit++) {
    if(dct[bit] > mean) {
      hash |= static_cast<phash>(1) << bit;
    }
  }

  return hash;
}
示例#25
0
float FFM::wTx(const ffm_node *instance, const unsigned int & size, float kappa, float eta, float lambda, bool do_update)
{
  long long align0 = (long long)parameters.num_factors*2;
  long long align1 = (long long)num_fields*align0;

  __m128 XMMkappa = _mm_set1_ps(kappa);
  __m128 XMMeta = _mm_set1_ps(eta);
  __m128 XMMlambda = _mm_set1_ps(lambda);

  __m128 XMMt = _mm_setzero_ps();

  for( unsigned int n1 = 0; n1 < size; n1++ )
  {
    int j1 = instance[n1].index;
    int f1 = instance[n1].field_index;
    float v1 = instance[n1].value;
    if(j1 >= num_features || f1 >= num_fields)
	continue;

    for( unsigned int n2 = n1 + 1; n2 < size; n2++ )
    {
      int j2 = instance[n2].index;
      int f2 = instance[n2].field_index;
      float v2 = instance[n2].value;
      if(j2 >= num_features || f2 >= num_fields)
	  continue;

      float *w1 = W + j1*align1 + f2*align0;
      float *w2 = W + j2*align1 + f1*align0;

      __m128 XMMv = _mm_set1_ps(v1*v2);

      if(do_update)
      {
	__m128 XMMkappav = _mm_mul_ps(XMMkappa, XMMv);

	float *wg1 = w1 + parameters.num_factors;
	float *wg2 = w2 + parameters.num_factors;
	for(int d = 0; d < parameters.num_factors; d += 4)
	{
	  __m128 XMMw1 = _mm_load_ps(w1+d);
	  __m128 XMMw2 = _mm_load_ps(w2+d);

	  __m128 XMMwg1 = _mm_load_ps(wg1+d);
	  __m128 XMMwg2 = _mm_load_ps(wg2+d);

	  __m128 XMMg1 = _mm_add_ps(_mm_mul_ps(XMMlambda, XMMw1), _mm_mul_ps(XMMkappav, XMMw2));
	  __m128 XMMg2 = _mm_add_ps(_mm_mul_ps(XMMlambda, XMMw2), _mm_mul_ps(XMMkappav, XMMw1));

	  XMMwg1 = _mm_add_ps(XMMwg1, _mm_mul_ps(XMMg1, XMMg1));
	  XMMwg2 = _mm_add_ps(XMMwg2, _mm_mul_ps(XMMg2, XMMg2));

	  XMMw1 = _mm_sub_ps(XMMw1, _mm_mul_ps(XMMeta, _mm_mul_ps(_mm_rsqrt_ps(XMMwg1), XMMg1)));
	  XMMw2 = _mm_sub_ps(XMMw2, _mm_mul_ps(XMMeta, _mm_mul_ps(_mm_rsqrt_ps(XMMwg2), XMMg2)));

	  _mm_store_ps(w1+d, XMMw1);
	  _mm_store_ps(w2+d, XMMw2);

	  _mm_store_ps(wg1+d, XMMwg1);
	  _mm_store_ps(wg2+d, XMMwg2);
	}
      }
      else
      {
	for(int d = 0; d < parameters.num_factors; d += 4)
	{
	  __m128  XMMw1 = _mm_load_ps(w1+d);
	  __m128  XMMw2 = _mm_load_ps(w2+d);

	  XMMt = _mm_add_ps(XMMt, _mm_mul_ps(_mm_mul_ps(XMMw1, XMMw2), XMMv));
	}
      }
    }
  }

  if(do_update)
      return 0;

  XMMt = _mm_hadd_ps(XMMt, XMMt);
  XMMt = _mm_hadd_ps(XMMt, XMMt);
  float t;
  _mm_store_ss(&t, XMMt);

  return t;
}
示例#26
0
/*
 * Converts the Lab input buffer `ivoid` into the output colour space and
 * writes the result to `ovoid` (both hold `piece->colors` floats per pixel).
 *
 * Two code paths exist:
 *  - d->cmatrix[0] is not NaN: each pixel goes through dt_Lab_to_XYZ_SSE(),
 *    is multiplied by the 3x3 matrix d->cmatrix, and then every channel that
 *    has a LUT (d->lut[c][0] >= 0) is mapped through it; values >= 1.0 use
 *    the extrapolation in d->unbounded_coeffs instead.
 *  - otherwise: each row is packed into 3-float pixels, run through
 *    cmsDoTransform() with d->xform and unpacked again. With gamut check
 *    enabled, pixels having a negative component are replaced by (0, 1, 1).
 *    This path only writes the first three floats of each output pixel.
 *
 * If the pipe is in mask display mode the alpha channel is copied from the
 * input at the end.
 */
void
process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
  const int ch = piece->colors;
  const int gamutcheck = (d->softproof_enabled == DT_SOFTPROOF_GAMUTCHECK);

  if(!isnan(d->cmatrix[0]))
  {
    //fprintf(stderr,"Using cmatrix codepath\n");
    // convert to rgb using matrix
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {

      float *in  = (float*)ivoid + ch*roi_in->width *j;
      float *out = (float*)ovoid + ch*roi_out->width*j;
      // m0..m2 hold the matrix columns: lane r of mN is cmatrix[3*r + N]
      const __m128 m0 = _mm_set_ps(0.0f,d->cmatrix[6],d->cmatrix[3],d->cmatrix[0]);
      const __m128 m1 = _mm_set_ps(0.0f,d->cmatrix[7],d->cmatrix[4],d->cmatrix[1]);
      const __m128 m2 = _mm_set_ps(0.0f,d->cmatrix[8],d->cmatrix[5],d->cmatrix[2]);

      for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
      {
        const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
        // matrix * xyz: broadcast each of X, Y, Z and scale the matching column
        const __m128 t = _mm_add_ps(_mm_mul_ps(m0,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(0,0,0,0))),_mm_add_ps(_mm_mul_ps(m1,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(1,1,1,1))),_mm_mul_ps(m2,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(2,2,2,2)))));

        _mm_stream_ps(out,t);
      }
    }
    // streaming stores above must be visible before the output is read back
    _mm_sfence();
    // apply profile
#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
    for(int j=0; j<roi_out->height; j++)
    {
      // this pass works in place on the output row; the input is not needed
      float *out = (float*)ovoid + ch*roi_out->width*j;

      for(int i=0; i<roi_out->width; i++, out+=ch )
      {
        // separate channel index, so the pixel index i is not shadowed
        for(int c=0; c<3; c++)
        {
          if (d->lut[c][0] >= 0.0f)
          {
            out[c] = (out[c] < 1.0f) ? lerp_lut(d->lut[c], out[c]) : dt_iop_eval_exp(d->unbounded_coeffs[c], out[c]);
          }
        }
      }
    }
  }
  else
  {
    float *in  = (float*)ivoid;
    float *out = (float*)ovoid;
    const int rowsize=roi_out->width * 3;
    //fprintf(stderr,"Using xform codepath\n");

#ifdef _OPENMP
    #pragma omp parallel for schedule(static) default(none) shared(out, roi_out, in)
#endif
    for (int k=0; k<roi_out->height; k++)
    {
      // per-row scratch buffers with 3 floats per pixel
      float Lab[rowsize];
      float rgb[rowsize];

      // m: offset of row k in the ch-float-per-pixel buffers
      const int m=(k*(roi_out->width*ch));
      for (int l=0; l<roi_out->width; l++)
      {
        int li=3*l,ii=ch*l;
        Lab[li+0] = in[m+ii+0];
        Lab[li+1] = in[m+ii+1];
        Lab[li+2] = in[m+ii+2];
      }

      cmsDoTransform (d->xform, Lab, rgb, roi_out->width);

      for (int l=0; l<roi_out->width; l++)
      {
        int oi=ch*l, ri=3*l;
        if(gamutcheck && (rgb[ri+0] < 0.0f || rgb[ri+1] < 0.0f || rgb[ri+2] < 0.0f))
        {
          // flag pixels with a negative component using a fixed marker colour
          out[m+oi+0] = 0.0f;
          out[m+oi+1] = 1.0f;
          out[m+oi+2] = 1.0f;
        }
        else
        {
          out[m+oi+0] = rgb[ri+0];
          out[m+oi+1] = rgb[ri+1];
          out[m+oi+2] = rgb[ri+2];
        }
      }
    }
  }

  if(piece->pipe->mask_display)
    dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
示例#27
0
/*
 * Recursive gaussian blur of a 4-channel float image.
 *
 * The image is filtered in two passes: first every column (in -> g->buf),
 * then every row (g->buf -> out). Each pass runs a forward and a backward
 * recursive filter whose outputs are added. Samples are clamped per channel
 * to [g->min, g->max] before they enter either filter. Filter coefficients
 * come from compute_gauss_params(g->sigma, g->order, ...).
 *
 * All pixel accesses use aligned SSE loads/stores.
 * NOTE(review): presumably in, out and g->buf are 16-byte aligned and hold
 * width*height*4 floats - confirm with the allocation code.
 */
void
dt_gaussian_blur_4c(
    dt_gaussian_t *g,
    float    *in,
    float    *out)
{

  const int width = g->width;
  const int height = g->height;
  const int ch = 4;

  assert(g->channels == 4);

  float a0, a1, a2, a3, b1, b2, coefp, coefn;

  compute_gauss_params(g->sigma, g->order, &a0, &a1, &a2, &a3, &b1, &b2, &coefp, &coefn);

  const __m128 Labmax = _mm_set_ps(g->max[3], g->max[2], g->max[1], g->max[0]);
  const __m128 Labmin = _mm_set_ps(g->min[3], g->min[2], g->min[1], g->min[0]);

  float *temp = g->buf;


  // pass 1: filter along each column, result goes to temp
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(in,out,temp,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
  for(int i=0; i<width; i++)
  {
    // coefficients broadcast to all four channels
    const __m128 ka0 = _mm_set_ps1(a0);
    const __m128 ka1 = _mm_set_ps1(a1);
    const __m128 ka2 = _mm_set_ps1(a2);
    const __m128 ka3 = _mm_set_ps1(a3);
    const __m128 kb1 = _mm_set_ps1(b1);
    const __m128 kb2 = _mm_set_ps1(b2);

    // forward (top to bottom): state is seeded from the first pixel
    __m128 x_prev  = MMCLAMPPS(_mm_load_ps(in+i*ch), Labmin, Labmax);
    __m128 y_prev2 = _mm_mul_ps(_mm_set_ps1(coefp), x_prev);
    __m128 y_prev  = y_prev2;

    for(int j=0; j<height; j++)
    {
      const int offset = (i + j * width)*ch;

      const __m128 x_cur = MMCLAMPPS(_mm_load_ps(in+offset), Labmin, Labmax);

      // y = a0*x + (a1*x[-1] - (b1*y[-1] + b2*y[-2]))
      const __m128 feedback = _mm_add_ps(_mm_mul_ps(y_prev, kb1), _mm_mul_ps(y_prev2, kb2));
      const __m128 history  = _mm_sub_ps(_mm_mul_ps(x_prev, ka1), feedback);
      const __m128 y_cur    = _mm_add_ps(_mm_mul_ps(x_cur, ka0), history);

      _mm_store_ps(temp+offset, y_cur);

      x_prev  = x_cur;
      y_prev2 = y_prev;
      y_prev  = y_cur;
    }

    // backward (bottom to top): state is seeded from the last pixel
    __m128 x_next  = MMCLAMPPS(_mm_load_ps(in+((height - 1) * width + i)*ch), Labmin, Labmax);
    __m128 x_next2 = x_next;
    __m128 y_next  = _mm_mul_ps(_mm_set_ps1(coefn), x_next);
    __m128 y_next2 = y_next;

    for(int j=height - 1; j >= 0; j--)
    {
      const int offset = (i + j * width)*ch;

      const __m128 x_cur = MMCLAMPPS(_mm_load_ps(in+offset), Labmin, Labmax);

      // y = a2*x[+1] + (a3*x[+2] - (b1*y[+1] + b2*y[+2]))
      const __m128 feedback = _mm_add_ps(_mm_mul_ps(y_next, kb1), _mm_mul_ps(y_next2, kb2));
      const __m128 history  = _mm_sub_ps(_mm_mul_ps(x_next2, ka3), feedback);
      const __m128 y_cur    = _mm_add_ps(_mm_mul_ps(x_next, ka2), history);

      x_next2 = x_next;
      x_next  = x_cur;
      y_next2 = y_next;
      y_next  = y_cur;

      // add the backward response to the stored forward response
      _mm_store_ps(temp+offset, _mm_add_ps(_mm_load_ps(temp+offset), y_cur));
    }
  }

  // pass 2: filter along each row of temp, result goes to out
#ifdef _OPENMP
  #pragma omp parallel for default(none) shared(out,temp,a0,a1,a2,a3,b1,b2,coefp,coefn) schedule(static)
#endif
  for(int j=0; j<height; j++)
  {
    // coefficients broadcast to all four channels
    const __m128 ka0 = _mm_set_ps1(a0);
    const __m128 ka1 = _mm_set_ps1(a1);
    const __m128 ka2 = _mm_set_ps1(a2);
    const __m128 ka3 = _mm_set_ps1(a3);
    const __m128 kb1 = _mm_set_ps1(b1);
    const __m128 kb2 = _mm_set_ps1(b2);

    // forward (left to right): state is seeded from the first pixel of the row
    __m128 x_prev  = MMCLAMPPS(_mm_load_ps(temp+j*width*ch), Labmin, Labmax);
    __m128 y_prev2 = _mm_mul_ps(_mm_set_ps1(coefp), x_prev);
    __m128 y_prev  = y_prev2;

    for(int i=0; i<width; i++)
    {
      const int offset = (i + j * width)*ch;

      const __m128 x_cur = MMCLAMPPS(_mm_load_ps(temp+offset), Labmin, Labmax);

      const __m128 feedback = _mm_add_ps(_mm_mul_ps(y_prev, kb1), _mm_mul_ps(y_prev2, kb2));
      const __m128 history  = _mm_sub_ps(_mm_mul_ps(x_prev, ka1), feedback);
      const __m128 y_cur    = _mm_add_ps(_mm_mul_ps(x_cur, ka0), history);

      _mm_store_ps(out+offset, y_cur);

      x_prev  = x_cur;
      y_prev2 = y_prev;
      y_prev  = y_cur;
    }

    // backward (right to left): state is seeded from the last pixel of the row
    __m128 x_next  = MMCLAMPPS(_mm_load_ps(temp+((j + 1)*width - 1)*ch), Labmin, Labmax);
    __m128 x_next2 = x_next;
    __m128 y_next  = _mm_mul_ps(_mm_set_ps1(coefn), x_next);
    __m128 y_next2 = y_next;

    for(int i=width - 1; i >= 0; i--)
    {
      const int offset = (i + j * width)*ch;

      const __m128 x_cur = MMCLAMPPS(_mm_load_ps(temp+offset), Labmin, Labmax);

      const __m128 feedback = _mm_add_ps(_mm_mul_ps(y_next, kb1), _mm_mul_ps(y_next2, kb2));
      const __m128 history  = _mm_sub_ps(_mm_mul_ps(x_next2, ka3), feedback);
      const __m128 y_cur    = _mm_add_ps(_mm_mul_ps(x_next, ka2), history);

      x_next2 = x_next;
      x_next  = x_cur;
      y_next2 = y_next;
      y_next  = y_cur;

      // add the backward response to the stored forward response
      _mm_store_ps(out+offset, _mm_add_ps(_mm_load_ps(out+offset), y_cur));
    }
  }
}
示例#28
0
// Butterfly stage over the 128-float array `a`, SSE2 version.
//
// The work is split into two parts that share the same load/add/sub pattern:
//  - j0 = 0, 2, 4, 6   : elements a[j0 .. j0+57], twiddle from cftmdl_wk1r;
//  - j0 = 64, 66, 68, 70: elements a[j0 .. j0+57], twiddles from the
//    rdft_wk1r/wk1i/wk2r/wk2i/wk3r/wk3i tables at offset k2 = 4.
// Each iteration loads two-float pairs with 64-bit loads and packs the pair at
// offset X together with the pair at offset X+32 into one register, so two
// butterflies are computed at once. Results are written back in place.
// NOTE(review): the two floats of a pair are presumably the real and
// imaginary part of one complex value, as the x..r../x..i.. names suggest.
static void cftmdl_128_SSE2(float* a) {
  const int l = 8;
  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
  int j0;

  __m128 wk1rv = _mm_load_ps(cftmdl_wk1r);
  for (j0 = 0; j0 < l; j0 += 2) {
    // Pack (a[j0+0..1], a[j0+32..33]) and (a[j0+8..9], a[j0+40..41]).
    const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
    const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
    const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
    const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
    const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
                                          _mm_castsi128_ps(a_32),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
                                          _mm_castsi128_ps(a_40),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
    const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

    // Same packing for offsets 16/48 and 24/56.
    const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
    const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
    const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
    const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
    const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
                                          _mm_castsi128_ps(a_48),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
                                          _mm_castsi128_ps(a_56),
                                          _MM_SHUFFLE(1, 0, 1, 0));
    const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
    const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

    // Second level: sum and difference of x0 and x2.
    const __m128 xx0 = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
    const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);

    // Swap the two floats of each x3 pair, then scale by k_swap_sign.
    const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
        _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
    const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
    const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
    const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

    // yy0 = (add[2], add[2], sub[2], sub[2]); yy1 uses lane 3 the same way.
    const __m128 yy0 =
        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(2, 2, 2, 2));
    const __m128 yy1 =
        _mm_shuffle_ps(x1_x3_add, x1_x3_sub, _MM_SHUFFLE(3, 3, 3, 3));
    const __m128 yy2 = _mm_mul_ps(mm_swap_sign, yy1);
    const __m128 yy3 = _mm_add_ps(yy0, yy2);
    const __m128 yy4 = _mm_mul_ps(wk1rv, yy3);

    // Lower half of xx0 goes to a[j0..], upper half to a[j0+32..].
    _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx0));
    _mm_storel_epi64(
        (__m128i*)&a[j0 + 32],
        _mm_shuffle_epi32(_mm_castps_si128(xx0), _MM_SHUFFLE(3, 2, 3, 2)));

    // Upper half of xx1 is stored with its two floats swapped, and the first
    // of them is negated afterwards.
    _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx1));
    _mm_storel_epi64(
        (__m128i*)&a[j0 + 48],
        _mm_shuffle_epi32(_mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 2, 3)));
    a[j0 + 48] = -a[j0 + 48];

    // Only the lower halves of x1_x3_add / x1_x3_sub are stored directly; the
    // upper halves reach memory through yy4.
    _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(x1_x3_add));
    _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(x1_x3_sub));

    _mm_storel_epi64((__m128i*)&a[j0 + 40], _mm_castps_si128(yy4));
    _mm_storel_epi64(
        (__m128i*)&a[j0 + 56],
        _mm_shuffle_epi32(_mm_castps_si128(yy4), _MM_SHUFFLE(2, 3, 2, 3)));
  }

  {
    // Second part: start index k = 64, twiddle table offset k2 = 2 * k1 = 4.
    int k = 64;
    int k1 = 2;
    int k2 = 2 * k1;
    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2 + 0]);
    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2 + 0]);
    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2 + 0]);
    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2 + 0]);
    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2 + 0]);
    wk1rv = _mm_load_ps(&rdft_wk1r[k2 + 0]);
    for (j0 = k; j0 < l + k; j0 += 2) {
      // Same packing and first-level butterflies as in the first loop.
      const __m128i a_00 = _mm_loadl_epi64((__m128i*)&a[j0 + 0]);
      const __m128i a_08 = _mm_loadl_epi64((__m128i*)&a[j0 + 8]);
      const __m128i a_32 = _mm_loadl_epi64((__m128i*)&a[j0 + 32]);
      const __m128i a_40 = _mm_loadl_epi64((__m128i*)&a[j0 + 40]);
      const __m128 a_00_32 = _mm_shuffle_ps(_mm_castsi128_ps(a_00),
                                            _mm_castsi128_ps(a_32),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 a_08_40 = _mm_shuffle_ps(_mm_castsi128_ps(a_08),
                                            _mm_castsi128_ps(a_40),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      __m128 x0r0_0i0_0r1_x0i1 = _mm_add_ps(a_00_32, a_08_40);
      const __m128 x1r0_1i0_1r1_x1i1 = _mm_sub_ps(a_00_32, a_08_40);

      const __m128i a_16 = _mm_loadl_epi64((__m128i*)&a[j0 + 16]);
      const __m128i a_24 = _mm_loadl_epi64((__m128i*)&a[j0 + 24]);
      const __m128i a_48 = _mm_loadl_epi64((__m128i*)&a[j0 + 48]);
      const __m128i a_56 = _mm_loadl_epi64((__m128i*)&a[j0 + 56]);
      const __m128 a_16_48 = _mm_shuffle_ps(_mm_castsi128_ps(a_16),
                                            _mm_castsi128_ps(a_48),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 a_24_56 = _mm_shuffle_ps(_mm_castsi128_ps(a_24),
                                            _mm_castsi128_ps(a_56),
                                            _MM_SHUFFLE(1, 0, 1, 0));
      const __m128 x2r0_2i0_2r1_x2i1 = _mm_add_ps(a_16_48, a_24_56);
      const __m128 x3r0_3i0_3r1_x3i1 = _mm_sub_ps(a_16_48, a_24_56);

      // xx4 = xx1 * wk2rv + swap_pairs(xx1) * wk2iv
      const __m128 xx = _mm_add_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const __m128 xx1 = _mm_sub_ps(x0r0_0i0_0r1_x0i1, x2r0_2i0_2r1_x2i1);
      const __m128 xx2 = _mm_mul_ps(xx1, wk2rv);
      const __m128 xx3 =
          _mm_mul_ps(wk2iv,
                     _mm_castsi128_ps(_mm_shuffle_epi32(
                         _mm_castps_si128(xx1), _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx4 = _mm_add_ps(xx2, xx3);

      const __m128 x3i0_3r0_3i1_x3r1 = _mm_castsi128_ps(_mm_shuffle_epi32(
          _mm_castps_si128(x3r0_3i0_3r1_x3i1), _MM_SHUFFLE(2, 3, 0, 1)));
      const __m128 x3_swapped = _mm_mul_ps(mm_swap_sign, x3i0_3r0_3i1_x3r1);
      const __m128 x1_x3_add = _mm_add_ps(x1r0_1i0_1r1_x1i1, x3_swapped);
      const __m128 x1_x3_sub = _mm_sub_ps(x1r0_1i0_1r1_x1i1, x3_swapped);

      // xx12 = x1_x3_add * wk1rv + swap_pairs(x1_x3_add) * wk1iv
      const __m128 xx10 = _mm_mul_ps(x1_x3_add, wk1rv);
      const __m128 xx11 = _mm_mul_ps(
          wk1iv,
          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_add),
                                             _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx12 = _mm_add_ps(xx10, xx11);

      // xx22 = x1_x3_sub * wk3rv + swap_pairs(x1_x3_sub) * wk3iv
      const __m128 xx20 = _mm_mul_ps(x1_x3_sub, wk3rv);
      const __m128 xx21 = _mm_mul_ps(
          wk3iv,
          _mm_castsi128_ps(_mm_shuffle_epi32(_mm_castps_si128(x1_x3_sub),
                                             _MM_SHUFFLE(2, 3, 0, 1))));
      const __m128 xx22 = _mm_add_ps(xx20, xx21);

      // Each result is split: lower half to offset X, upper half to X + 32.
      _mm_storel_epi64((__m128i*)&a[j0 + 0], _mm_castps_si128(xx));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 32],
          _mm_shuffle_epi32(_mm_castps_si128(xx), _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 16], _mm_castps_si128(xx4));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 48],
          _mm_shuffle_epi32(_mm_castps_si128(xx4), _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 8], _mm_castps_si128(xx12));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 40],
          _mm_shuffle_epi32(_mm_castps_si128(xx12), _MM_SHUFFLE(3, 2, 3, 2)));

      _mm_storel_epi64((__m128i*)&a[j0 + 24], _mm_castps_si128(xx22));
      _mm_storel_epi64(
          (__m128i*)&a[j0 + 56],
          _mm_shuffle_epi32(_mm_castps_si128(xx22), _MM_SHUFFLE(3, 2, 3, 2)));
    }
  }
}
示例#29
0
// In-place pass over the 128-float array `a` that combines each pair
// (a[j2], a[j2+1]) for even j2 in [2, 62] with its mirror pair
// (a[128-j2], a[129-j2]), using coefficients from the table at rdft_w + 32.
// a[1] and a[65] are negated. Pairs for j2 = 2..56 are processed four at a
// time with SSE; the remaining ones (j2 = 58, 60, 62) use the scalar formula.
static void rftbsub_128_SSE2(float* a) {
  const float* c = rdft_w + 32;
  const __m128 half = _mm_set1_ps(0.5f);
  int j1, j2;

  a[1] = -a[1];

  // SIMD part. Index remarks refer to the first iteration (j1 = 1, j2 = 2).
  for (j1 = 1, j2 = 2; j2 + 7 < 64; j1 += 4, j2 += 8) {
    // wki = c[j1 .. j1+3]; wkr = 0.5 - c[32-j1 .. 29-j1], i.e. the block
    // loaded from c[29 - j1] in reversed lane order.
    const __m128 wki4 = _mm_loadu_ps(&c[j1]);             //  1,  2,  3,  4
    const __m128 c_mirror = _mm_loadu_ps(&c[29 - j1]);    // 28, 29, 30, 31
    const __m128 half_minus_c = _mm_sub_ps(half, c_mirror);
    const __m128 wkr4 = _mm_shuffle_ps(half_minus_c, half_minus_c,
                                       _MM_SHUFFLE(0, 1, 2, 3));  // 31 .. 28

    // Load eight consecutive floats from each end and de-interleave them
    // into even-index and odd-index vectors; the mirrored end is reversed.
    const __m128 lo_a = _mm_loadu_ps(&a[0 + j2]);      //   2 ..   5
    const __m128 lo_b = _mm_loadu_ps(&a[4 + j2]);      //   6 ..   9
    const __m128 hi_a = _mm_loadu_ps(&a[122 - j2]);    // 120 .. 123
    const __m128 hi_b = _mm_loadu_ps(&a[126 - j2]);    // 124 .. 127
    const __m128 lo_even =
        _mm_shuffle_ps(lo_a, lo_b, _MM_SHUFFLE(2, 0, 2, 0));  //   2,   4,   6,   8
    const __m128 lo_odd =
        _mm_shuffle_ps(lo_a, lo_b, _MM_SHUFFLE(3, 1, 3, 1));  //   3,   5,   7,   9
    const __m128 hi_even =
        _mm_shuffle_ps(hi_b, hi_a, _MM_SHUFFLE(0, 2, 0, 2));  // 126, 124, 122, 120
    const __m128 hi_odd =
        _mm_shuffle_ps(hi_b, hi_a, _MM_SHUFFLE(1, 3, 1, 3));  // 127, 125, 123, 121

    // xr = a[j2] - a[k2];  xi = a[j2 + 1] + a[k2 + 1]
    const __m128 xr4 = _mm_sub_ps(lo_even, hi_even);
    const __m128 xi4 = _mm_add_ps(lo_odd, hi_odd);

    // yr = wkr * xr + wki * xi;  yi = wkr * xi - wki * xr
    const __m128 wkr_xr = _mm_mul_ps(wkr4, xr4);
    const __m128 wki_xi = _mm_mul_ps(wki4, xi4);
    const __m128 wkr_xi = _mm_mul_ps(wkr4, xi4);
    const __m128 wki_xr = _mm_mul_ps(wki4, xr4);
    const __m128 yr4 = _mm_add_ps(wkr_xr, wki_xi);
    const __m128 yi4 = _mm_sub_ps(wkr_xi, wki_xr);

    // a[j2] -= yr;  a[j2 + 1] = yi - a[j2 + 1];
    // a[k2] += yr;  a[k2 + 1] = yi - a[k2 + 1]
    const __m128 lo_even_new = _mm_sub_ps(lo_even, yr4);
    const __m128 lo_odd_new = _mm_sub_ps(yi4, lo_odd);
    const __m128 hi_even_new = _mm_add_ps(hi_even, yr4);
    const __m128 hi_odd_new = _mm_sub_ps(yi4, hi_odd);

    // Re-interleave. The mirrored vectors come out as swapped pairs
    // (122, 123, 120, 121 / 126, 127, 124, 125) and need one more shuffle.
    const __m128 lo_a_new = _mm_unpacklo_ps(lo_even_new, lo_odd_new);
    const __m128 lo_b_new = _mm_unpackhi_ps(lo_even_new, lo_odd_new);
    const __m128 hi_a_pairs = _mm_unpackhi_ps(hi_even_new, hi_odd_new);
    const __m128 hi_b_pairs = _mm_unpacklo_ps(hi_even_new, hi_odd_new);
    const __m128 hi_a_new =
        _mm_shuffle_ps(hi_a_pairs, hi_a_pairs, _MM_SHUFFLE(1, 0, 3, 2));
    const __m128 hi_b_new =
        _mm_shuffle_ps(hi_b_pairs, hi_b_pairs, _MM_SHUFFLE(1, 0, 3, 2));

    _mm_storeu_ps(&a[0 + j2], lo_a_new);
    _mm_storeu_ps(&a[4 + j2], lo_b_new);
    _mm_storeu_ps(&a[122 - j2], hi_a_new);
    _mm_storeu_ps(&a[126 - j2], hi_b_new);
  }

  // Scalar part: continues from the j1/j2 values left by the loop above.
  for (; j2 < 64; j1 += 1, j2 += 2) {
    const int k2 = 128 - j2;
    const int k1 = 32 - j1;
    const float wkr = 0.5f - c[k1];
    const float wki = c[j1];
    const float xr = a[j2 + 0] - a[k2 + 0];
    const float xi = a[j2 + 1] + a[k2 + 1];
    const float yr = wkr * xr + wki * xi;
    const float yi = wkr * xi - wki * xr;
    a[j2 + 0] = a[j2 + 0] - yr;
    a[j2 + 1] = yi - a[j2 + 1];
    a[k2 + 0] = yr + a[k2 + 0];
    a[k2 + 1] = yi - a[k2 + 1];
  }

  a[65] = -a[65];
}
示例#30
0
static void cft1st_128_SSE2(float* a) {
  const __m128 mm_swap_sign = _mm_load_ps(k_swap_sign);
  int j, k2;

  for (k2 = 0, j = 0; j < 128; j += 16, k2 += 4) {
    __m128 a00v = _mm_loadu_ps(&a[j + 0]);
    __m128 a04v = _mm_loadu_ps(&a[j + 4]);
    __m128 a08v = _mm_loadu_ps(&a[j + 8]);
    __m128 a12v = _mm_loadu_ps(&a[j + 12]);
    __m128 a01v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 a23v = _mm_shuffle_ps(a00v, a08v, _MM_SHUFFLE(3, 2, 3, 2));
    __m128 a45v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(1, 0, 1, 0));
    __m128 a67v = _mm_shuffle_ps(a04v, a12v, _MM_SHUFFLE(3, 2, 3, 2));

    const __m128 wk1rv = _mm_load_ps(&rdft_wk1r[k2]);
    const __m128 wk1iv = _mm_load_ps(&rdft_wk1i[k2]);
    const __m128 wk2rv = _mm_load_ps(&rdft_wk2r[k2]);
    const __m128 wk2iv = _mm_load_ps(&rdft_wk2i[k2]);
    const __m128 wk3rv = _mm_load_ps(&rdft_wk3r[k2]);
    const __m128 wk3iv = _mm_load_ps(&rdft_wk3i[k2]);
    __m128 x0v = _mm_add_ps(a01v, a23v);
    const __m128 x1v = _mm_sub_ps(a01v, a23v);
    const __m128 x2v = _mm_add_ps(a45v, a67v);
    const __m128 x3v = _mm_sub_ps(a45v, a67v);
    __m128 x0w;
    a01v = _mm_add_ps(x0v, x2v);
    x0v = _mm_sub_ps(x0v, x2v);
    x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
    {
      const __m128 a45_0v = _mm_mul_ps(wk2rv, x0v);
      const __m128 a45_1v = _mm_mul_ps(wk2iv, x0w);
      a45v = _mm_add_ps(a45_0v, a45_1v);
    }
    {
      __m128 a23_0v, a23_1v;
      const __m128 x3w = _mm_shuffle_ps(x3v, x3v, _MM_SHUFFLE(2, 3, 0, 1));
      const __m128 x3s = _mm_mul_ps(mm_swap_sign, x3w);
      x0v = _mm_add_ps(x1v, x3s);
      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
      a23_0v = _mm_mul_ps(wk1rv, x0v);
      a23_1v = _mm_mul_ps(wk1iv, x0w);
      a23v = _mm_add_ps(a23_0v, a23_1v);

      x0v = _mm_sub_ps(x1v, x3s);
      x0w = _mm_shuffle_ps(x0v, x0v, _MM_SHUFFLE(2, 3, 0, 1));
    }
    {
      const __m128 a67_0v = _mm_mul_ps(wk3rv, x0v);
      const __m128 a67_1v = _mm_mul_ps(wk3iv, x0w);
      a67v = _mm_add_ps(a67_0v, a67_1v);
    }

    a00v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(1, 0, 1, 0));
    a04v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(1, 0, 1, 0));
    a08v = _mm_shuffle_ps(a01v, a23v, _MM_SHUFFLE(3, 2, 3, 2));
    a12v = _mm_shuffle_ps(a45v, a67v, _MM_SHUFFLE(3, 2, 3, 2));
    _mm_storeu_ps(&a[j + 0], a00v);
    _mm_storeu_ps(&a[j + 4], a04v);
    _mm_storeu_ps(&a[j + 8], a08v);
    _mm_storeu_ps(&a[j + 12], a12v);
  }
}