Example #1
__m128 _mm_pow_ps(__m128 a, __m128 b){ // computes a[i]^b[i] for real a[i] and integer exponents b[i] >= 1
    __m128 cero = _mm_setzero_ps();
    __m128 uno = _mm_set1_ps(1.0f); // the constants [0,0,0,0] and [1,1,1,1], used throughout

    b = _mm_cvtepi32_ps(_mm_cvttps_epi32(b)); // truncate b to its integer part to avoid trouble with fractional exponents
    b = _mm_sub_ps(b, uno); // b[i]--
    b = _mm_max_ps(b, cero); // clamp at 0 so we can later tell which lanes still need multiplying

    __m128 activo = _mm_min_ps(b, uno); // 1 in lanes whose remaining exponent is > 0, 0 elsewhere

    __m128 resul = a; // the result starts out equal to the input a
    __m128 act_x_a;   // scratch variable used inside the loop

    while (_mm_movemask_ps(_mm_cmpneq_ps(b, cero))) { // loop while any lane's exponent is still nonzero
        __m128 mask = _mm_cmpneq_ps(activo, cero);     // full-width mask of the active lanes
        act_x_a = _mm_or_ps(_mm_and_ps(mask, a),       // active lanes contribute a as the factor,
                            _mm_andnot_ps(mask, uno)); // finished lanes contribute 1 so their result is unchanged
        resul = _mm_mul_ps(resul, act_x_a); // the base is multiplied in, in parallel, as many times as the exponent demands

        b = _mm_sub_ps(b, uno);  // b[i]--
        b = _mm_max_ps(b, cero); // if the decrement made a lane negative, clamp it back to 0

        activo = _mm_mul_ps(activo, b); // recompute the active set from the remaining exponents
        activo = _mm_min_ps(activo, uno);
    }

    return resul; // a vector with the requested powers
}
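A minimal scalar cross-check for the routine above (my own harness, not part of the original snippet; it assumes only <emmintrin.h> and the function as defined above):

#include <emmintrin.h>
#include <stdio.h>

int main(void)
{
    __m128 a = _mm_set_ps(0.5f, -3.0f, 2.0f, 1.5f);  // lanes 0..3 = 1.5, 2, -3, 0.5
    __m128 b = _mm_set_ps(3.0f, 2.0f, 10.0f, 1.0f);  // lanes 0..3 = 1, 10, 2, 3
    float r[4];
    _mm_storeu_ps(r, _mm_pow_ps(a, b));
    // expected (scalar): 1.5^1 = 1.5, 2^10 = 1024, (-3)^2 = 9, 0.5^3 = 0.125
    printf("%g %g %g %g\n", r[0], r[1], r[2], r[3]);
    return 0;
}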
Example #2
static void
minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
  for (; n >= 8; n -= 8) {
    __m128 xmm0, xmm1;
    xmm0 = _mm_loadu_ps(src1);
    xmm1 = _mm_loadu_ps(src2);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    xmm0 = _mm_loadu_ps(src1 + 4);
    xmm1 = _mm_loadu_ps(src2 + 4);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest + 4, xmm0);
    dest += 8;
    src1 += 8;
    src2 += 8;
  }
  for (; n > 0; n--) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    src1++;
    src2++;
  }
}
Example #3
/*---------------------------------------------------------------------------*/
__m128 TTriangle::NearestTest4(__m128 mask, const TPoint4& orig, __m128 radius, HitResult4* result) const
{
    TPoint4 p0(pos0);
    TPoint4 p1(pos1);
    TPoint4 p2(pos2);

    __m128 u0, u1, u2;

    __m128 d0 = ::CalcDistance4(p0, p1, orig, &u0);
    __m128 d1 = ::CalcDistance4(p1, p2, orig, &u1);
    __m128 d2 = ::CalcDistance4(p2, p0, orig, &u2);

    __m128 minimum = _mm_min_ps(d0, _mm_min_ps(d1, d2));

    __m128 minMask;

    result->u = u0;
    result->v = g_zero4;
    result->t = d0;

    minMask = (minimum == d1);
    result->u = _mm_merge_ps(minMask, result->u, (g_one4 - u1));
    result->v = _mm_merge_ps(minMask, result->v, u1);
    result->t = _mm_merge_ps(minMask, result->t, d1);

    minMask = (minimum == d2);
    result->u = _mm_merge_ps(minMask, result->u, g_zero4);
    result->v = _mm_merge_ps(minMask, result->v, g_one4 - u2);
    result->t = _mm_merge_ps(minMask, result->t, d2);

    return mask & (result->t < radius);
}
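_mm_merge_ps is not a standard intrinsic; it is presumably this codebase's mask-select helper. A minimal sketch of what it would have to do, assuming the usual all-ones/all-zeros comparison masks:

static inline __m128 _mm_merge_ps(__m128 mask, __m128 a, __m128 b)
{
    // per lane: take b where the mask is set, otherwise keep a
    return _mm_or_ps(_mm_andnot_ps(mask, a), _mm_and_ps(mask, b));
}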
Example #4
/*
====================
idMD5Mesh::CalculateBounds
====================
*/
void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds ) const {

	__m128 minX = vector_float_posInfinity;
	__m128 minY = vector_float_posInfinity;
	__m128 minZ = vector_float_posInfinity;
	__m128 maxX = vector_float_negInfinity;
	__m128 maxY = vector_float_negInfinity;
	__m128 maxZ = vector_float_negInfinity;
	for ( int i = 0; i < numMeshJoints; i++ ) {
		const idJointMat & joint = entJoints[meshJoints[i]];
		__m128 x = _mm_load_ps( joint.ToFloatPtr() + 0 * 4 );
		__m128 y = _mm_load_ps( joint.ToFloatPtr() + 1 * 4 );
		__m128 z = _mm_load_ps( joint.ToFloatPtr() + 2 * 4 );
		minX = _mm_min_ps( minX, x );
		minY = _mm_min_ps( minY, y );
		minZ = _mm_min_ps( minZ, z );
		maxX = _mm_max_ps( maxX, x );
		maxY = _mm_max_ps( maxY, y );
		maxZ = _mm_max_ps( maxZ, z );
	}
	__m128 expand = _mm_splat_ps( _mm_load_ss( & maxJointVertDist ), 0 );
	minX = _mm_sub_ps( minX, expand );
	minY = _mm_sub_ps( minY, expand );
	minZ = _mm_sub_ps( minZ, expand );
	maxX = _mm_add_ps( maxX, expand );
	maxY = _mm_add_ps( maxY, expand );
	maxZ = _mm_add_ps( maxZ, expand );
	_mm_store_ss( bounds.ToFloatPtr() + 0, _mm_splat_ps( minX, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 1, _mm_splat_ps( minY, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 2, _mm_splat_ps( minZ, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 3, _mm_splat_ps( maxX, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) );
	_mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) );

}
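Neither _mm_splat_ps nor the vector_float_* constants are standard SSE names; the latter are presumably constant +/-infinity vectors, and in id's codebase _mm_splat_ps is conventionally a broadcast shuffle, roughly along these lines (an assumption, not verified against the engine source):

#define _mm_splat_ps( x, i )  _mm_shuffle_ps( (x), (x), _MM_SHUFFLE( (i), (i), (i), (i) ) )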
Example #5
// The data length must be 32 floats; this function produces 8 sorted
// sequences of length 4
void simdOddEvenSort(__m128 *rData)
{    //odd even sort lanes, then transpose them
    const int pairSize = rArrayLen >> 1;
    __m128 temp[pairSize];
    for (int i = 0; i < pairSize; ++i) temp[i] = rData[2 * i];
    for (int i = 0; i < rArrayLen; i += 2)
        rData[i] = _mm_min_ps(rData[i], rData[i + 1]);
    for (int i = 1; i < rArrayLen; i += 2)
        rData[i] = _mm_max_ps(rData[i], temp[i >> 1]);
	
    for (int i = 0; i < pairSize; i += 2)
    {
        temp[i] = rData[i * 2];
        temp[i + 1] = rData[i * 2 + 1];
    }
	
    for (int i = 0; i < rArrayLen; i += 4)
    {
        rData[i] = _mm_min_ps(rData[i], rData[i + 2]);
        rData[i + 1] = _mm_min_ps(rData[i + 1], rData[i + 3]);
    }
	
    for (int i = 2; i < rArrayLen; i += 4)
    {
        rData[i] = _mm_max_ps(rData[i], temp[(i >> 1) - 1]);
        rData[i + 1] = _mm_max_ps(rData[i + 1], temp[i >> 1]);
    }
	
    //TODO:portability?
    for (int i = 0; i < rArrayLen >> 2; ++i) temp[i] = rData[i * 4 + 1];
    for (int i = 1; i < rArrayLen; i += 4)
        rData[i] = _mm_min_ps(rData[i], rData[i + 1]);
    for (int i = 2; i < rArrayLen; i += 4)
        rData[i] = _mm_max_ps(rData[i], temp[i >> 2]);
	
    //temp,0,1,2,3
    for (int i = 0; i < rArrayLen; i += 2)
        temp[i >> 1] = _mm_shuffle_ps(rData[i], rData[i + 1], 0x44);
    //rdata,1,3,5,7
    for (int i = 1; i < rArrayLen; i += 2)
        rData[i] = _mm_shuffle_ps(rData[i], rData[i - 1], 0xee);
    //rdata,0,4
    for (int i = 0; i < pairSize; i += 2)
        rData[i * 2] = _mm_shuffle_ps(temp[i], temp[i + 1], 0x88);
    //rdata,2,6,depend,1,3,5,7
    for (int i = 2; i < rArrayLen; i += 4)
        rData[i] = _mm_shuffle_ps(rData[i - 1], rData[i + 1], 0x22);
    //rdata,3,7,depend,1,5
    for (int i = 3; i < rArrayLen; i += 4)
        rData[i] = _mm_shuffle_ps(rData[i - 2], rData[i], 0x77);
    //rdata,1,5
    for (int i = 0; i < pairSize; i += 2)
        rData[i * 2 + 1] = _mm_shuffle_ps(temp[i], temp[i + 1], 0xdd);
	
}
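Example #6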
    SIMDValue SIMDFloat32x4Operation::OpMinNum(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        X86SIMDValue mask, mask2, t1, t2;

        // This is the correct result, or b if either input is NaN or both are +/-0.0
        x86Result.m128_value = _mm_min_ps(tmpaValue.m128_value, tmpbValue.m128_value);
        // Find NaNs in b
        mask.m128_value  = _mm_cmpunord_ps(tmpbValue.m128_value, tmpbValue.m128_value);
        // Find -0.0 in a
        mask2.m128i_value = _mm_cmpeq_epi32(tmpaValue.m128i_value, X86_TWO_31_I4.m128i_value);
        // mask2 is -0.0 where a is -0.0
        mask2.m128_value = _mm_and_ps(mask2.m128_value, X86_TWO_31_I4.m128_value);
        // For lanes where a is -0.0, the result is either correct (negative), or b which is possibly +0.0
        // Safe to force sign to negative for those lanes, +0.0 becomes -0.0.
        x86Result.m128_value = _mm_or_ps(x86Result.m128_value, mask2.m128_value);
        // For NaNs in b, choose a, else keep result.
        t1.m128_value = _mm_and_ps(tmpaValue.m128_value, mask.m128_value);
        t2.m128_value = _mm_andnot_ps(mask.m128_value, x86Result.m128_value);
        x86Result.m128_value = _mm_or_ps(t1.m128_value, t2.m128_value);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
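Example #7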
    SIMDValue SIMDFloat32x4Operation::OpMin(const SIMDValue& aValue, const SIMDValue& bValue)
    {
        X86SIMDValue x86Result;
        X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
        X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);
        X86SIMDValue tmp1, tmp2;

        // if tmp1 and tmp2 are not identical then either
        // 1) at least one value is NaN, then the OR will set that lane to NaN
        // 2) one value is 0.0 and the other is -0.0, the OR will set the sign bit to have -0.0
        tmp1.m128_value = _mm_min_ps(tmpaValue.m128_value, tmpbValue.m128_value);
        tmp2.m128_value = _mm_min_ps(tmpbValue.m128_value, tmpaValue.m128_value);
        x86Result.m128_value = _mm_or_ps(tmp1.m128_value, tmp2.m128_value);

        return X86SIMDValue::ToSIMDValue(x86Result);
    }
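A lane-by-lane walk-through of the double-min-plus-OR trick in OpMin (illustrative values, not part of the original source):

// a = -0.0f, b = +0.0f:
//   _mm_min_ps(a, b) returns b = +0.0f (the comparison a < b is false)
//   _mm_min_ps(b, a) returns a = -0.0f
//   OR-ing +0.0f with -0.0f keeps the sign bit, giving -0.0f as desired
// a = 1.0f, b = NaN:
//   _mm_min_ps(a, b) returns b = NaN; _mm_min_ps(b, a) returns a = 1.0f
//   OR-ing the NaN bits with 1.0f keeps the all-ones exponent and a
//   nonzero mantissa, so the lane stays NaN and the NaN propagates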
Example #8
static inline __m128 sigmoid_positive_ps( __m128 xin )
{
	union {
		__m128i i;
		int32_t i32[4];
	} i;
	__m128 ex;
	float *ex_elem = (float*) &ex;
	__m128 x1 = _mm_min_ps( xin, tens.ps );

	x1 = _mm_mul_ps( x1, tens.ps );
	i.i  = _mm_cvttps_epi32( x1 );
 	ex_elem[0] = e[i.i32[0]];
	ex_elem[1] = e[i.i32[1]];
	ex_elem[2] = e[i.i32[2]];
	ex_elem[3] = e[i.i32[3]];
	x1 = _mm_sub_ps( x1, _mm_cvtepi32_ps( i.i ) ); 
	x1 = _mm_add_ps( x1, tens.ps );
	x1 = _mm_mul_ps( x1, ex );
	x1 = _mm_add_ps( x1, ones.ps ); 
#ifdef __FAST_MATH__
	return _mm_rcp_ps( x1 );
#else
	return _mm_div_ps( ones.ps, x1 );
#endif
}
Example #9
void auryn_vector_float_clip( auryn_vector_float * v, const float a, const float b ) {
#ifdef CODE_USE_SIMD_INSTRUCTIONS_EXPLICITLY
	#ifdef CODE_ACTIVATE_CILK_INSTRUCTIONS
	for ( NeuronID i = 0 ; i < v->size ; ++i ) {
		if ( v->data[i] < a ) {
			v->data[i] = a;
		} else 
			if ( v->data[i] > b ) 
				v->data[i] = b;
	}
	#else
	const __m128 lo = _mm_set1_ps(a);
	const __m128 hi = _mm_set1_ps(b);
	for ( float * i = v->data ; i != v->data+v->size ; i += SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS )
	{
		__m128 chunk = sse_load( i );
		__m128 result = _mm_min_ps(chunk, hi);
		result = _mm_max_ps(result, lo);
		sse_store( i, result );
	}
	#endif /* CODE_ACTIVATE_CILK_INSTRUCTIONS */
#else
	for ( NeuronID i = 0 ; i < v->size ; ++i ) {
		if ( v->data[i] < a ) {
			v->data[i] = a;
		} else 
			if ( v->data[i] > b ) 
				v->data[i] = b;
	}
#endif
}
Example #10
/** process, all real work is done here. */
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *i, void *o, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
  // this is called for preview and full pipe separately, each with its own pixelpipe piece.
  assert(dt_iop_module_colorspace(self) == iop_cs_Lab);
  // get our data struct:
  dt_iop_colorcontrast_params_t *d = (dt_iop_colorcontrast_params_t *)piece->data;
  // how many colors in our buffer?
  const int ch = piece->colors;
  // iterate over all output pixels (same coordinates as input)
#ifdef _OPENMP
  // optional: parallelize it!
  #pragma omp parallel for default(none) schedule(static) shared(i,o,roi_in,roi_out,d)
#endif
  for(int j=0; j<roi_out->height; j++)
  {

    float *in  = ((float *)i) + ch*roi_in->width *j;
    float *out = ((float *)o) + ch*roi_out->width*j;

    const __m128 scale = _mm_set_ps(0.0f,d->b_steepness,d->a_steepness,1.0f);
    const __m128 offset = _mm_set_ps(0.0f,d->b_offset,d->a_offset,0.0f);
    const __m128 min = _mm_set_ps(0.0f,-128.0f,-128.0f, -INFINITY);
    const __m128 max = _mm_set_ps(0.0f, 128.0f, 128.0f,  INFINITY);


    for(int i=0; i<roi_out->width; i++)
    {
      _mm_stream_ps(out,_mm_min_ps(max,_mm_max_ps(min,_mm_add_ps(offset,_mm_mul_ps(scale,_mm_load_ps(in))))));
      in+=ch;
      out+=ch;
    }
  }
  _mm_sfence();
}
Example #11
static void
clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
{
  __m128 xmm1;
  float max = *src2_1;

  /* Initial operations to align the destination pointer */
  for (; ((long)dest & 15) && (n > 0); n--) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
  xmm1 = _mm_set_ps1(max);
  for (; n >= 4; n -= 4) {
    __m128 xmm0;
    xmm0 = _mm_loadu_ps(src1);
    xmm0 = _mm_min_ps(xmm0, xmm1);
    _mm_store_ps(dest, xmm0);
    dest += 4;
    src1 += 4;
  }
  for (; n > 0; n--) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
  }
}
Example #12
__m128 exp_128(
  const __m128& x) {

  //! Clip the value
  __m128 y = _mm_max_ps(_mm_min_ps(x, _mm_set1_ps(88.3762626647949f)),
                                      _mm_set1_ps(-88.3762626647949f));

  //! Express exp(x) as exp(g + n * log(2))
  __m128 fx = y * _mm_set1_ps(1.44269504088896341) + _mm_set1_ps(0.5f);

  //! Floor
  const __m128 tmp = _mm_round_ps(fx, _MM_FROUND_TO_ZERO);

  //! If greater, subtract 1
  const __m128 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), _mm_set1_ps(1.f));
  fx = tmp - mask;

  y -= fx * _mm_set1_ps(0.693359375 - 2.12194440e-4);
  const __m128 z = y * y;


  const __m128 t = (((((_mm_set1_ps(1.9875691500E-4)  * y +
                        _mm_set1_ps(1.3981999507E-3)) * y +
                        _mm_set1_ps(8.3334519073E-3)) * y +
                        _mm_set1_ps(4.1665795894E-2)) * y +
                        _mm_set1_ps(1.6666665459E-1)) * y +
                        _mm_set1_ps(5.0000001201E-1)) * z + y + _mm_set1_ps(1.f);

  //! Build 2^n
  const __m128i emm0 = _mm_add_epi32(_mm_cvttps_epi32(fx), _mm_set1_epi32(0x7f));

  //! Return the result
  return t * _mm_castsi128_ps(_mm_slli_epi32(emm0, 23));
}
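A quick accuracy probe for exp_128 (my own harness, not from the original source; it assumes the function above is visible and that the file is compiled with SSE4.1 for _mm_round_ps):

#include <cmath>
#include <cstdio>
#include <smmintrin.h>

int main()
{
    const float in[4] = { -5.0f, 0.0f, 1.0f, 10.0f };
    float out[4];
    _mm_storeu_ps(out, exp_128(_mm_loadu_ps(in)));
    for (int k = 0; k < 4; ++k)
        std::printf("exp_128(%g) = %g   std::exp = %g\n", in[k], out[k], std::exp(in[k]));
    return 0;
}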
Example #13
static void inline histogram_helper_cs_rgb_helper_process_pixel_m128(
    const dt_dev_histogram_collection_params_t *const histogram_params, const float *pixel,
    uint32_t *histogram)
{
  const float fscale = (float)(histogram_params->bins_count);

  const __m128 scale = _mm_set1_ps(fscale);
  const __m128 val_min = _mm_setzero_ps();
  const __m128 val_max = _mm_set1_ps(histogram_params->bins_count - 1);

  assert(dt_is_aligned(pixel, 16));
  const __m128 input = _mm_load_ps(pixel);
  const __m128 scaled = _mm_mul_ps(input, scale);
  const __m128 clamped = _mm_max_ps(_mm_min_ps(scaled, val_max), val_min);

  const __m128i indexes = _mm_cvtps_epi32(clamped);

  __m128i values __attribute__((aligned(16)));
  _mm_store_si128(&values, indexes);

  const uint32_t *valuesi = (uint32_t *)(&values);

  histogram[4 * valuesi[0]]++;
  histogram[4 * valuesi[1] + 1]++;
  histogram[4 * valuesi[2] + 2]++;
}
Example #14
// Computes the element-wise minimum of two arrays of SSE Packed Singles (= 4 floats) into a third array
// The size is given as a number of vectors (not as a number of floats)
// The arrays must be aligned on 16-byte boundaries
void minimumVecteur_Et_Dans_DeTaille(__m128 *source1, __m128 *source2, __m128 *destination, int taille)
{
	int compteur;
	
	for (compteur = 0; compteur < taille; ++compteur)
		*destination++ = _mm_min_ps(*source1++, *source2++);
}
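A minimal call site for the helper above (my own sketch; the arrays must honour the stated 16-byte alignment requirement, hence alignas):

#include <xmmintrin.h>

void demo()
{
    alignas(16) float a[8] = { 1, 5, 3, 7, 2, 6, 4, 8 };
    alignas(16) float b[8] = { 4, 2, 6, 0, 5, 1, 7, 3 };
    alignas(16) float r[8];
    // taille counts vectors, so 8 floats = 2 vectors
    minimumVecteur_Et_Dans_DeTaille((__m128 *)a, (__m128 *)b, (__m128 *)r, 2);
}

Example #15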
static inline __m128 
exp2f4(__m128 x)
{
	__m128i ipart;
	__m128 fpart, expipart, expfpart;

	x = _mm_min_ps(x, _mm_load_ps(_one29_ps));
	x = _mm_max_ps(x, _mm_load_ps(_minusone27_ps));

	/* ipart = int(x - 0.5) */
	ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_load_ps(_half_ps)));

	/* fpart = x - ipart */
	fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart));

	/* expipart = (float) (1 << ipart) */
	expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_load_si128((__m128i*)_one27)), 23));

	/* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */
#if EXP_POLY_DEGREE == 5
	expfpart = POLY5(fpart, exp_p5_0, exp_p5_1, exp_p5_2, exp_p5_3, exp_p5_4, exp_p5_5);
#elif EXP_POLY_DEGREE == 4
	expfpart = POLY4(fpart, exp_p4_0, exp_p4_1, exp_p4_2, exp_p4_3, exp_p4_4);
#elif EXP_POLY_DEGREE == 3
	expfpart = POLY3(fpart, exp_p3_0, exp_p3_1, exp_p3_2, exp_p3_3);
#elif EXP_POLY_DEGREE == 2
	expfpart = POLY2(fpart, exp_p2_0, exp_p2_1, exp_p2_2);
#else
#error
#endif

	return _mm_mul_ps(expipart, expfpart);
}
Example #16
	// ----------------------------------------------------------
	//  Name:   matrix::MinValue
	//  Desc:   Returns the absolute minimum element of the
	//          matrix.
	// ----------------------------------------------------------
	float matrix::MinValue() {
#ifdef _M_IX86
		F32vec4 min1 = _mm_min_ps(_mm_abs_ps(_L1), _mm_abs_ps(_L2));
		F32vec4 min2 = _mm_min_ps(_mm_abs_ps(_L3), _mm_abs_ps(_L4));
		F32vec4 min = _mm_min_ps(min1, min2);
		min = _mm_min_ps(min, _mm_movehl_ps(min,min));
		min = _mm_min_ss(min, _mm_shuffle_ps(min,min,0x01));
		return min[0];
#else
		float min = this->operator()(0, 0);
		for (int i = 0; i < 4; ++i)
			for (int j = 0; j < 4; ++j)
				if (this->operator()(i, j) < min) 
					min = this->operator()(i, j);
		return min;
#endif // _M_IX86 
	}
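_mm_abs_ps is not a standard SSE intrinsic; the snippet presumably relies on a helper that clears the sign bit, along these lines (a sketch under that assumption):

static inline __m128 _mm_abs_ps(__m128 x)
{
    // clear the IEEE-754 sign bit in every lane
    return _mm_andnot_ps(_mm_set1_ps(-0.0f), x);
}

Example #17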
v4sf DisplayFunctionGGBA::inv_display( v4sf L )
{
  const v4sf voffset = _mm_set1_ps(L_offset);
  const v4sf vmax = _mm_set1_ps(L_max);
  L = _mm_max_ps(L, voffset);
  L = _mm_min_ps(L, voffset+vmax);
  return _mm_pow_ps((L - voffset)/(vmax-_mm_set1_ps(L_black)), _mm_set1_ps(1.0f/gamma));
}
Example #18
pixel_t min_layer_mode_fun::operator()( const pixel_t& back, const pixel_t& front) const
{
    __m128 b = _mm_load_ps( reinterpret_cast<const float*>( &back));
    __m128 f = _mm_load_ps( reinterpret_cast<const float*>( &front));

    f = lerp( _mm_min_ps( f, b), b);
    float *p = reinterpret_cast<float*>( &f);
    return pixel_t( p[0], p[1], p[2], p[3]);
}
Example #19
static float Atan(float y, float x) // #include <emmintrin.h>, #include <xmmintrin.h>
{
	const __m128 _ps_atan_p0 = _mm_set1_ps(-0.0464964749f);
	const __m128 _ps_atan_p1 = _mm_set1_ps(0.15931422f);
	const __m128 _ps_atan_p2 = _mm_set1_ps(0.327622764f);

	const __m128 _ps_pi    = _mm_set1_ps(pi);
	const __m128 _ps_pi0p5 = _mm_set1_ps(pi0p5);

	const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000));
	const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000));

	__m128 mm1, mm2, mm3;
	__m128 axm, aym;

	__m128 xm = _mm_set1_ps(x);
	__m128 ym = _mm_set1_ps(y);

	axm = _mm_and_ps(xm, _mask_sign_inv);
	aym = _mm_and_ps(ym, _mask_sign_inv);

	mm1 = _mm_min_ps(axm, aym);
	mm2 = _mm_max_ps(axm, aym);
	mm1 = _mm_div_ps(mm1, mm2);
	mm2 = _mm_mul_ps(mm1, mm1);

	mm3 = _mm_mul_ps(mm2, _ps_atan_p0);
	mm3 = _mm_add_ps(mm3, _ps_atan_p1);
	mm3 = _mm_mul_ps(mm3, mm2);
	mm3 = _mm_sub_ps(mm3, _ps_atan_p2);
	mm3 = _mm_mul_ps(mm3, mm2);
	mm3 = _mm_mul_ps(mm3, mm1);
	mm3 = _mm_add_ps(mm3, mm1);

	__m128 mask;

	/* |y| > |x| */
	mask = _mm_cmpgt_ss(aym, axm);
	mm2 = _mm_and_ps(_ps_pi0p5, mask);
	mm1 = _mm_and_ps(_mask_sign_raw, mask);
	mm3 = _mm_xor_ps(mm3, mm1);
	mm3 = _mm_add_ps(mm3, mm2);

	/* x < 0 */
	mask = _mm_and_ps(xm, _mask_sign_raw);
	mm3 = _mm_xor_ps(mm3, mask);
	mm1 = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mm3), 30));
	mm1 = _mm_and_ps(_ps_pi, mm1);
	mm3 = _mm_add_ps(mm3, mm1);

	/* y < 0 */
	mm1 = _mm_and_ps(ym, _mask_sign_raw);
	mm3 = _mm_xor_ps(mm3, mm1);

	return _mm_cvtss_f32(mm3);
}
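A usage sketch for Atan (my own harness; pi and pi0p5 are assumed to be float constants for π and π/2 defined in the same translation unit as the function above):

#include <cmath>
#include <cstdio>

int main()
{
    const float y = 1.0f, x = -1.0f;
    // both should print roughly 2.356 (3π/4)
    std::printf("Atan = %g, atan2f = %g\n", Atan(y, x), std::atan2(y, x));
    return 0;
}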
Example #20
// SIMD min
__SIMD _SIMD_min_ps(__SIMD a, __SIMD b)
{
#ifdef  USE_SSE
  return _mm_min_ps(a,b); 
#elif defined USE_AVX
  return _mm256_min_ps(a,b); 
#elif defined USE_IBM
  return vec_min(a,b);
#endif
}
Example #21
static void update_bounding_box (struct vox_box *box, const vox_dot dot)
{
    __v4sf d = _mm_load_ps (dot);
    __v4sf d_max = d + _mm_load_ps (vox_voxel);

    __v4sf box_min = _mm_min_ps (d, _mm_load_ps (box->min));
    __v4sf box_max = _mm_max_ps (d_max, _mm_load_ps (box->max));

    _mm_store_ps (box->min, box_min);
    _mm_store_ps (box->max, box_max);
}
Example #22
	INLINE void SVec4::Min(const SVec4 &other)
	{
#ifdef USE_SSE
		m_128 = _mm_min_ps( m_128, other.m_128 );
#else
        m_x = math::Min(m_x, other.X());
        m_y = math::Min(m_y, other.Y());
        m_z = math::Min(m_z, other.Z());
        m_w = math::Min(m_w, other.W());
#endif
	}
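Example #23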
inline void GDALCopy2WordsSSE(const float* pValueIn, Tout* const &pValueOut)
{
    float fMaxVal, fMinVal;
    GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
    __m128 xmm = _mm_set_ps(0, 0, pValueIn[1], pValueIn[0]);
    __m128 xmm_min = _mm_set_ps(0, 0, fMinVal, fMinVal);
    __m128 xmm_max = _mm_set_ps(0, 0, fMaxVal, fMaxVal);
    xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
    pValueOut[0] = _mm_cvtss_si32(xmm);
    pValueOut[1] = _mm_cvtss_si32(_mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(0, 0, 0, 1)));
}
Example #24
static inline __m128 curve_vec4(
    const __m128 x,
    const __m128 g,
    const __m128 sigma,
    const __m128 shadows,
    const __m128 highlights,
    const __m128 clarity)
{
  // TODO: pull these non-data-dependent constants out of the loop to see
  // whether the compiler fails to do so
  const __m128 const0 = _mm_set_ps1(0x3f800000u);
  const __m128 const1 = _mm_set_ps1(0x402DF854u); // for e^x
  const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31
  const __m128 one = _mm_set1_ps(1.0f);
  const __m128 two = _mm_set1_ps(2.0f);
  const __m128 twothirds = _mm_set1_ps(2.0f/3.0f);
  const __m128 twosig = _mm_mul_ps(two, sigma);
  const __m128 sigma2 = _mm_mul_ps(sigma, sigma);
  const __m128 s22 = _mm_mul_ps(twothirds, sigma2);

  const __m128 c = _mm_sub_ps(x, g);
  const __m128 select = _mm_cmplt_ps(c, _mm_setzero_ps());
  // select shadows or highlights as multiplier for linear part, based on c < 0
  const __m128 shadhi = _mm_or_ps(_mm_andnot_ps(select, shadows), _mm_and_ps(select, highlights));
  // flip sign bit of sigma based on c < 0 (c < 0 ? - sigma : sigma)
  const __m128 ssigma = _mm_xor_ps(sigma, _mm_and_ps(select, sign_mask));
  // this contains the linear parts valid for c > 2*sigma or c < - 2*sigma
  const __m128 vlin = _mm_add_ps(g, _mm_add_ps(ssigma, _mm_mul_ps(shadhi, _mm_sub_ps(c, ssigma))));

  const __m128 t = _mm_min_ps(one, _mm_max_ps(_mm_setzero_ps(),
        _mm_div_ps(c, _mm_mul_ps(two, ssigma))));
  const __m128 t2 = _mm_mul_ps(t, t);
  const __m128 mt = _mm_sub_ps(one, t);

  // midtone value fading over to linear part, without local contrast:
  const __m128 vmid = _mm_add_ps(g,
      _mm_add_ps(_mm_mul_ps(_mm_mul_ps(ssigma, two), _mm_mul_ps(mt, t)),
        _mm_mul_ps(t2, _mm_add_ps(ssigma, _mm_mul_ps(ssigma, shadhi)))));

  // c > 2*sigma?
  const __m128 linselect = _mm_cmpgt_ps(_mm_andnot_ps(sign_mask, c), twosig);
  const __m128 val = _mm_or_ps(_mm_and_ps(linselect, vlin), _mm_andnot_ps(linselect, vmid));

  // midtone local contrast
  // dt_fast_expf in sse:
  const __m128 arg = _mm_xor_ps(sign_mask, _mm_div_ps(_mm_mul_ps(c, c), s22));
  const __m128 k0 = _mm_add_ps(const0, _mm_mul_ps(arg, _mm_sub_ps(const1, const0)));
  const __m128 k = _mm_max_ps(k0, _mm_setzero_ps());
  const __m128i ki = _mm_cvtps_epi32(k);
  const __m128 gauss = _mm_load_ps((float*)&ki);
  const __m128 vcon = _mm_mul_ps(clarity, _mm_mul_ps(c, gauss));
  return _mm_add_ps(val, vcon);
}
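The gaussian above is computed with a fast-exp bit trick: const0 and const1 are the IEEE-754 bit patterns of 1.0f and e, and linear interpolation between them in integer space approximates e^x. In scalar form the same idea looks roughly like this (a sketch, not necessarily the project's exact helper):

static inline float fast_expf(const float x)
{
    const int i1 = 0x3f800000;                 // bits of 1.0f
    const int i2 = 0x402df854;                 // bits of expf(1.0f)
    int k = i1 + (int)(x * (float)(i2 - i1));  // interpolate between the two bit patterns
    if (k < 0) k = 0;                          // same clamp as the _mm_max_ps above
    union { int i; float f; } u;
    u.i = k;
    return u.f;                                // reinterpret the bits as a float
}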
Example #25
__m128 exp_ps(__m128 x) {
    typedef __m128 v4sf;
    typedef __m128i v4si;

    v4sf tmp = _mm_setzero_ps(), fx;
    v4si emm0;
    v4sf one = constants::ps_1.ps;

    x = _mm_min_ps(x, constants::exp_hi.ps);
    x = _mm_max_ps(x, constants::exp_lo.ps);

    /* express exp(x) as exp(g + n*log(2)) */
    fx = _mm_mul_ps(x,  constants::cephes_LOG2EF.ps);
    fx = _mm_add_ps(fx, constants::ps_0p5.ps);

    /* how to perform a floorf with SSE: just below */
    emm0 = _mm_cvttps_epi32(fx);
    tmp  = _mm_cvtepi32_ps(emm0);
    /* if greater, subtract 1 */
    v4sf mask = _mm_cmpgt_ps(tmp, fx);
    mask = _mm_and_ps(mask, one);
    fx = _mm_sub_ps(tmp, mask);

    tmp = _mm_mul_ps(fx, constants::cephes_exp_C1.ps);
    v4sf z = _mm_mul_ps(fx, constants::cephes_exp_C2.ps);
    x = _mm_sub_ps(x, tmp);
    x = _mm_sub_ps(x, z);

    z = _mm_mul_ps(x,x);

    v4sf y = constants::cephes_exp_p0.ps;
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p1.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p2.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p3.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p4.ps);
    y = _mm_mul_ps(y, x);
    y = _mm_add_ps(y, constants::cephes_exp_p5.ps);
    y = _mm_mul_ps(y, z);
    y = _mm_add_ps(y, x);
    y = _mm_add_ps(y, one);

    /* build 2^n */
    emm0 = _mm_cvttps_epi32(fx);
    emm0 = _mm_add_epi32(emm0, constants::pi32_0x7f.pi);
    emm0 = _mm_slli_epi32(emm0, 23);
    v4sf pow2n = _mm_castsi128_ps(emm0);
    y = _mm_mul_ps(y, pow2n);
    return y;
}
Example #26
	INLINE SVec4 SVec4::Min(const SVec4 &vec1, const SVec4 &vec2)
	{
#ifdef USE_SSE
		return SVec4( _mm_min_ps( vec1.m_128, vec2.m_128 ) );
#else
        return{
            math::Min(vec1.X(), vec2.X()),
            math::Min(vec1.Y(), vec2.Y()),
            math::Min(vec1.Z(), vec2.Z()),
            math::Min(vec1.W(), vec2.W())
        };
#endif
	}
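Example #27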
static void process_clip_sse2(dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid,
                              const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
                              const float clip)
{
  if(piece->pipe->dsc.filters)
  { // raw mosaic
    const __m128 clipm = _mm_set1_ps(clip);
    const size_t n = (size_t)roi_out->height * roi_out->width;
    float *const out = (float *)ovoid;
    float *const in = (float *)ivoid;
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
#endif
    for(size_t j = 0; j < (n & ~3u); j += 4) _mm_stream_ps(out + j, _mm_min_ps(clipm, _mm_load_ps(in + j)));
    _mm_sfence();
    // let's see if there's a non-multiple-of-four rest to process:
    if(n & 3)
      for(size_t j = n & ~3u; j < n; j++) out[j] = MIN(clip, in[j]);
  }
  else
  {
    const __m128 clipm = _mm_set1_ps(clip);
    const int ch = piece->colors;

#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;
      float *in = (float *)ivoid + (size_t)ch * roi_in->width * j;
      for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
      {
        _mm_stream_ps(out, _mm_min_ps(clipm, _mm_set_ps(in[3], in[2], in[1], in[0])));
      }
    }
    _mm_sfence();
  }
}
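Example #28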
void conv_filter_sse(int imgHeight, int imgWidth, int imgHeightF, int imgWidthF,
				 int imgFOfssetH, int imgFOfssetW,
				 float* filter, float *imgFloatSrc, float *imgFloatDst)
{

	//1.
	const register __declspec(align(16)) auto const_0 = _mm_set_ps(0.0, 0.0, 0.0, 0.0);
	//2.
    const register __declspec(align(16)) auto const_255 = _mm_set_ps(255.0, 255.0, 255.0, 255.0);

	//3.
	__declspec(align(16)) __m128 filter_l[FILTER_SIZE];
#pragma omp parallel for
	for (auto i = 0; i < FILTER_SIZE; i++)
	{
		// broadcast the same value into all 4 floats
		// float -> m128 conversion
		filter_l[i] = _mm_load_ps1(filter + i);
	}
	const auto rw_base = (imgFOfssetW + imgFOfssetH * imgWidthF) << 2;
	const auto imgWidthbyte = imgWidth << 2;
	const auto imgWidthFbyte = imgWidthF << 2;
	const auto imgLengthbyte = imgHeight * imgWidthbyte;
	//4.
	register __declspec(align(16)) __m128 a_sse;
	//8. reg
	register __declspec(align(16)) __m128 r_sse;

#pragma omp parallel for
	for (auto row = 0; row < imgLengthbyte; row += 4)
	{
		// accumulator for the RGBA components
		r_sse = _mm_setzero_ps();
		// convolution over every component
		for (auto y = 0; y < FILTER_H; y++ )
		{		
			r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (y * imgWidthFbyte)), filter_l[5 * y]));
			r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (4 + y * imgWidthFbyte)), filter_l[1 + 5 * y]));
			r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (8 + y * imgWidthFbyte)), filter_l[2 + 5 * y]));
			r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (12 + y * imgWidthFbyte)), filter_l[3 + 5 * y]));
			r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (16 + y * imgWidthFbyte)), filter_l[4 + 5 * y]));
		}
			
		a_sse = _mm_load_ps(imgFloatSrc + row + 8 + 2 * imgWidthFbyte);
		// clamp the computed value to the 0-255 range
		// write the output pixel
		_mm_store_ps(imgFloatDst + rw_base + row, _mm_min_ps(const_255, _mm_add_ps(a_sse, _mm_max_ps(const_0, _mm_sub_ps(a_sse, _mm_min_ps(const_255, _mm_max_ps(const_0, r_sse)))))));
	}
}
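Example #29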
/* Adjust MBR to fit all child MBRs */
inline 
void 
adjustMbrArraySTRNode(ArraySTRNode nodes[], ulong_t cur)
{
	ArraySTRNode *node, *child;
	ulong_t k;
	
	node = &nodes[cur];
	child = &nodes[node->pos];
	
	/* enlarge mbr to include all children's mbrs */
#ifdef ENABLE_SSE_ADJUST 
	{
		__m128 v_nlow = _mm_load_ps(child[0].mbr.low);
		__m128 v_nupp = _mm_load_ps(child[0].mbr.upp);
		for (k = 1; k < node->len; k++) {
			v_nlow = _mm_min_ps(v_nlow, _mm_load_ps(child[k].mbr.low));
			v_nupp = _mm_max_ps(v_nupp, _mm_load_ps(child[k].mbr.upp));
		}
		_mm_store_ps(node->mbr.low, v_nlow);
		_mm_store_ps(node->mbr.upp, v_nupp);
	}
#else
#ifdef ENABLE_AVX_TEST1	
	{
		__m256 v_nmbr = _mm256_loadu_ps((float *)&child[0].mbr);
		for (k = 1; k < node->len; k++) {
			__m256 v_cmbr = _mm256_loadu_ps((float *)&child[k].mbr);
			__m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr);
			__m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr);
			v_nmbr = _mm256_permute2f128_ps(v_min, v_max, 0x12);
		}
		_mm256_storeu_ps((float *)&node->mbr, v_nmbr);
	}
#else
	/* copy first child's mbr */
	node->mbr = child[0].mbr;
	for (k = 1; k < node->len; k++) {
		int i;
		for (i = 0; i < NDIMS; i++) {
			if (node->mbr.low[i] > child[k].mbr.low[i])
				node->mbr.low[i] = child[k].mbr.low[i];
			if (node->mbr.upp[i] < child[k].mbr.upp[i])
				node->mbr.upp[i] = child[k].mbr.upp[i];
		}
	}
#endif
#endif
}
Example #30
static void foo(
	float(&inout)[8]) {

	__m128& a = reinterpret_cast<__m128&>(inout[0]);
	__m128& b = reinterpret_cast<__m128&>(inout[4]);

	__m128 min = _mm_min_ps(a, b);
	__m128 max = _mm_max_ps(a, b);

	a = _mm_shuffle_ps(min, max, _MM_SHUFFLE(0, 1, 0, 1));
	b = _mm_shuffle_ps(min, max, _MM_SHUFFLE(2, 3, 2, 3));
	min = _mm_min_ps(a, b);
	max = _mm_max_ps(a, b);

	a = _mm_shuffle_ps(min, max, _MM_SHUFFLE(0, 2, 0, 2));
	b = _mm_shuffle_ps(min, max, _MM_SHUFFLE(1, 3, 1, 3));
	min = _mm_min_ps(a, b);
	max = _mm_max_ps(a, b);

	a = _mm_shuffle_ps(min, max, _MM_SHUFFLE(0, 1, 0, 1));
	b = _mm_shuffle_ps(min, max, _MM_SHUFFLE(2, 3, 2, 3));

	const size_t idx[][2] = {
		{ 1, 2 },{ 5, 6 },
		{ 2, 4 },{ 3, 5 },
		{ 1, 2 },{ 3, 4 },{ 5, 6 }
	};

	for (size_t i = 0; i < sizeof(idx) / sizeof(idx[0]); ++i) {
		const float x = inout[idx[i][0]];
		const float y = inout[idx[i][1]];

		inout[idx[i][0]] = std::min(x, y);
		inout[idx[i][1]] = std::max(x, y);
	}
}
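A harness for the routine above (my own sketch; the reinterpret_casts require 16-byte alignment, hence alignas). The scalar pair list matches the final stages of an 8-input odd-even merge network, so the expected output is the input values in ascending order:

#include <cstdio>

int main()
{
    alignas(16) float v[8] = { 7.0f, 3.0f, 5.0f, 1.0f, 8.0f, 2.0f, 6.0f, 4.0f };
    foo(v);
    for (int k = 0; k < 8; ++k)
        std::printf("%g ", v[k]); // expected: 1 2 3 4 5 6 7 8
    std::printf("\n");
    return 0;
}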