__m128 _mm_pow_ps(__m128 a, __m128 b){//eleva a[i]^b[i] para todo a real, y b entero < 0 __m128 cero = _mm_set1_ps(0); __m128 uno = _mm_set1_ps(1.0);//se declaran las constantes [0,0,0,0] y [1,1,1,1] para ser usadas luego b = _mm_parteentera_ps(b);// para evitar problemas con decimales. b = _mm_sub_ps(b, uno);//b[i]-- b = _mm_max_ps(b, cero);// se busca el maximo con 0 para luego comparar y saber cuales deben o no deben ser __m128 activo= _mm_mul_ps(_mm_set1_ps(1.0), b);//elimina de los activos a los exponentes menores que 1 activo= _mm_min_ps(activo , uno);//con esto se setean en numeros 1 o 0 declarando activos a quienes su exponente sea mayor que 0 __m128 resul= a;//el resultado se iguala a la entrada a __m128 act_x_a;//se declara variable para ser usarda en el bucle while (!_mm_compare_ps(b, cero)) {//luego lo explico act_x_a = _mm_mul_ps(a, activo);//se elimina los activos de las variables a multiplicar para poder ejecutar las multiplicaciones en paralelo act_x_a = _mm_max_ps(act_x_a, uno);//las variables eliminadas se setean en 1 para que el resultado quede como en el estado anterior resul= _mm_mul_ps(resul, act_x_a);// se multiplican en paralelo lo la base por si misma tantas veces como sea indicado por el exponente b = _mm_sub_ps(b, uno);//b[i]-- b = _mm_max_ps(b, cero);// en caso de que la resta deje algun valor en negativo este se setea en 0 activo= _mm_mul_ps(activo, b);// se realiza nuevamente la busqueda de los que ya no estan activos que depende de los valores en los exponentes. activo= _mm_min_ps(activo , uno); } return resul;// se retorna un vector con los valores pedidos }

/*
 * Element-wise minimum of two float arrays: dest[i] = min(src1[i], src2[i])
 * for i in [0, n).
 *
 * The sources are read with unaligned loads; the destination is brought to a
 * 16-byte boundary with a scalar prologue so the vector part can use aligned
 * stores. The vector loop handles eight floats per iteration.
 */
static void
minimum_f32_sse_unroll2 (float *dest, float *src1, float *src2, int n)
{
  /* Scalar prologue until dest sits on a 16-byte boundary. */
  while (((long)dest & 15) && (n > 0)) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    ++src1;
    ++src2;
    --n;
  }

  /* Vector body: two 4-wide minimums per iteration. */
  while (n >= 8) {
    const __m128 first = _mm_min_ps(_mm_loadu_ps(src1), _mm_loadu_ps(src2));
    _mm_store_ps(dest, first);

    const __m128 second = _mm_min_ps(_mm_loadu_ps(src1 + 4),
                                     _mm_loadu_ps(src2 + 4));
    _mm_store_ps(dest + 4, second);

    dest += 8;
    src1 += 8;
    src2 += 8;
    n -= 8;
  }

  /* Scalar epilogue for the remaining (fewer than eight) elements. */
  while (n > 0) {
    *dest++ = *src1 < *src2 ? *src1 : *src2;
    ++src1;
    ++src2;
    --n;
  }
}

/*---------------------------------------------------------------------------*/ __m128 TTriangle::NearestTest4(__m128 mask, const TPoint4& orig, __m128 radius, HitResult4* result) const { TPoint4 p0(pos0); TPoint4 p1(pos1); TPoint4 p2(pos2); __m128 u0, u1, u2; __m128 d0 = ::CalcDistance4(p0, p1, orig, &u0); __m128 d1 = ::CalcDistance4(p1, p2, orig, &u1); __m128 d2 = ::CalcDistance4(p2, p0, orig, &u2); __m128 minimum = _mm_min_ps(d0, _mm_min_ps(d1, d2)); __m128 minMask; result->u = u0; result->v = g_zero4; result->t = d0; minMask = (minimum == d1); result->u = _mm_merge_ps(minMask, result->u, (g_one4 - u1)); result->v = _mm_merge_ps(minMask, result->v, u1); result->t = _mm_merge_ps(minMask, result->t, d1); minMask = (minimum == d2); result->u = _mm_merge_ps(minMask, result->u, g_zero4); result->v = _mm_merge_ps(minMask, result->v, g_one4 - u2); result->t = _mm_merge_ps(minMask, result->t, d2); return mask & (result->t < radius); }

/* ==================== idMD5Mesh::CalculateBounds ==================== */ void idMD5Mesh::CalculateBounds( const idJointMat * entJoints, idBounds & bounds ) const { __m128 minX = vector_float_posInfinity; __m128 minY = vector_float_posInfinity; __m128 minZ = vector_float_posInfinity; __m128 maxX = vector_float_negInfinity; __m128 maxY = vector_float_negInfinity; __m128 maxZ = vector_float_negInfinity; for ( int i = 0; i < numMeshJoints; i++ ) { const idJointMat & joint = entJoints[meshJoints[i]]; __m128 x = _mm_load_ps( joint.ToFloatPtr() + 0 * 4 ); __m128 y = _mm_load_ps( joint.ToFloatPtr() + 1 * 4 ); __m128 z = _mm_load_ps( joint.ToFloatPtr() + 2 * 4 ); minX = _mm_min_ps( minX, x ); minY = _mm_min_ps( minY, y ); minZ = _mm_min_ps( minZ, z ); maxX = _mm_max_ps( maxX, x ); maxY = _mm_max_ps( maxY, y ); maxZ = _mm_max_ps( maxZ, z ); } __m128 expand = _mm_splat_ps( _mm_load_ss( & maxJointVertDist ), 0 ); minX = _mm_sub_ps( minX, expand ); minY = _mm_sub_ps( minY, expand ); minZ = _mm_sub_ps( minZ, expand ); maxX = _mm_add_ps( maxX, expand ); maxY = _mm_add_ps( maxY, expand ); maxZ = _mm_add_ps( maxZ, expand ); _mm_store_ss( bounds.ToFloatPtr() + 0, _mm_splat_ps( minX, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 1, _mm_splat_ps( minY, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 2, _mm_splat_ps( minZ, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 3, _mm_splat_ps( maxX, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 4, _mm_splat_ps( maxY, 3 ) ); _mm_store_ss( bounds.ToFloatPtr() + 5, _mm_splat_ps( maxZ, 3 ) ); }

// data length must be 32, this function will produce 8 sorted sequence of // length 4 void simdOddEvenSort(__m128 *rData) { //odd even sort lanes, then transpose them const int pairSize = rArrayLen >> 1; __m128 temp[pairSize]; for (int i = 0; i < pairSize; ++i) temp[i] = rData[2 * i]; for (int i = 0; i < rArrayLen; i += 2) rData[i] = _mm_min_ps(rData[i], rData[i + 1]); for (int i = 1; i < rArrayLen; i += 2) rData[i] = _mm_max_ps(rData[i], temp[i >> 1]); for (int i = 0; i < pairSize; i += 2) { temp[i] = rData[i * 2]; temp[i + 1] = rData[i * 2 + 1]; } for (int i = 0; i < rArrayLen; i += 4) { rData[i] = _mm_min_ps(rData[i], rData[i + 2]); rData[i + 1] = _mm_min_ps(rData[i + 1], rData[i + 3]); } for (int i = 2; i < rArrayLen; i += 4) { rData[i] = _mm_max_ps(rData[i], temp[(i >> 1) - 1]); rData[i + 1] = _mm_max_ps(rData[i + 1], temp[i >> 1]); } //TODO:portability? for (int i = 0; i < rArrayLen >> 2; ++i) temp[i] = rData[i * 4 + 1]; for (int i = 1; i < rArrayLen; i += 4) rData[i] = _mm_min_ps(rData[i], rData[i + 1]); for (int i = 2; i < rArrayLen; i += 4) rData[i] = _mm_max_ps(rData[i], temp[i >> 2]); //temp,0,1,2,3 for (int i = 0; i < rArrayLen; i += 2) temp[i >> 1] = _mm_shuffle_ps(rData[i], rData[i + 1], 0x44); //rdata,1,3,5,7 for (int i = 1; i < rArrayLen; i += 2) rData[i] = _mm_shuffle_ps(rData[i], rData[i - 1], 0xee); //rdata,0,4 for (int i = 0; i < pairSize; i += 2) rData[i * 2] = _mm_shuffle_ps(temp[i], temp[i + 1], 0x88); //rdata,2,6,depend,1,3,5,7 for (int i = 2; i < rArrayLen; i += 4) rData[i] = _mm_shuffle_ps(rData[i - 1], rData[i + 1], 0x22); //rdata,3,7,depend,1,5 for (int i = 3; i < rArrayLen; i += 4) rData[i] = _mm_shuffle_ps(rData[i - 2], rData[i], 0x77); //rdata,1,5 for (int i = 0; i < pairSize; i += 2) rData[i * 2 + 1] = _mm_shuffle_ps(temp[i], temp[i + 1], 0xdd); }

// Lane-wise "minNum": like a minimum, but a NaN in one operand yields the
// other operand, and -0.0 in `aValue` wins over +0.0 in `bValue`.
SIMDValue SIMDFloat32x4Operation::OpMinNum(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue lhs = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue rhs = X86SIMDValue::ToX86SIMDValue(bValue);
    X86SIMDValue bIsNaN, aNegZeroSign, fromA, fromMin;
    X86SIMDValue x86Result;

    // minps already gives the right answer, except that it returns b when
    // either operand is NaN or when both operands are +/-0.0.
    x86Result.m128_value = _mm_min_ps(lhs.m128_value, rhs.m128_value);

    // Lanes where b is NaN (a value is unordered with itself only if NaN).
    bIsNaN.m128_value = _mm_cmpunord_ps(rhs.m128_value, rhs.m128_value);

    // Lanes where a is exactly -0.0, reduced to just the sign bit.
    aNegZeroSign.m128i_value = _mm_cmpeq_epi32(lhs.m128i_value, X86_TWO_31_I4.m128i_value);
    aNegZeroSign.m128_value = _mm_and_ps(aNegZeroSign.m128_value, X86_TWO_31_I4.m128_value);

    // Where a is -0.0 the result is either already negative or it is b, which
    // may be +0.0; forcing the sign bit turns that +0.0 into -0.0.
    x86Result.m128_value = _mm_or_ps(x86Result.m128_value, aNegZeroSign.m128_value);

    // Take a where b is NaN, otherwise keep what has been computed so far.
    fromA.m128_value = _mm_and_ps(lhs.m128_value, bIsNaN.m128_value);
    fromMin.m128_value = _mm_andnot_ps(bIsNaN.m128_value, x86Result.m128_value);
    x86Result.m128_value = _mm_or_ps(fromA.m128_value, fromMin.m128_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}

// Lane-wise minimum that is symmetric in its operands.
SIMDValue SIMDFloat32x4Operation::OpMin(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue lhs = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue rhs = X86SIMDValue::ToX86SIMDValue(bValue);

    // minps is evaluated with both operand orders. The two results differ
    // only when
    //   1) at least one input is NaN -- the OR then makes the lane NaN, or
    //   2) one input is 0.0 and the other -0.0 -- the OR then sets the sign
    //      bit, giving -0.0.
    X86SIMDValue minAB, minBA;
    minAB.m128_value = _mm_min_ps(lhs.m128_value, rhs.m128_value);
    minBA.m128_value = _mm_min_ps(rhs.m128_value, lhs.m128_value);

    X86SIMDValue x86Result;
    x86Result.m128_value = _mm_or_ps(minAB.m128_value, minBA.m128_value);
    return X86SIMDValue::ToSIMDValue(x86Result);
}

static inline __m128 sigmoid_positive_ps( __m128 xin ) { union { __m128i i; int32_t i32[4]; } i; __m128 ex; float *ex_elem = (float*) &ex; __m128 x1 = _mm_min_ps( xin, tens.ps ); x1 = _mm_mul_ps( x1, tens.ps ); i.i = _mm_cvttps_epi32( x1 ); ex_elem[0] = e[i.i32[0]]; ex_elem[1] = e[i.i32[1]]; ex_elem[2] = e[i.i32[2]]; ex_elem[3] = e[i.i32[3]]; x1 = _mm_sub_ps( x1, _mm_cvtepi32_ps( i.i ) ); x1 = _mm_add_ps( x1, tens.ps ); x1 = _mm_mul_ps( x1, ex ); x1 = _mm_add_ps( x1, ones.ps ); #ifdef __FAST_MATH__ return _mm_rcp_ps( x1 ); #else return _mm_div_ps( ones.ps, x1 ); #endif }

void auryn_vector_float_clip( auryn_vector_float * v, const float a, const float b ) { #ifdef CODE_USE_SIMD_INSTRUCTIONS_EXPLICITLY #ifdef CODE_ACTIVATE_CILK_INSTRUCTIONS for ( NeuronID i = 0 ; i < v->size ; ++i ) { if ( v->data[i] < a ) { v->data[i] = a; } else if ( v->data[i] > b ) v->data[i] = b; } #else const __m128 lo = _mm_set1_ps(a); const __m128 hi = _mm_set1_ps(b); for ( float * i = v->data ; i != v->data+v->size ; i += SIMD_NUM_OF_PARALLEL_FLOAT_OPERATIONS ) { __m128 chunk = sse_load( i ); __m128 result = _mm_min_ps(chunk, hi); result = _mm_max_ps(result, lo); sse_store( i, result ); } #endif /* CODE_ACTIVATE_CILK_INSTRUCTIONS */ #else for ( NeuronID i = 0 ; i < v->size ; ++i ) { if ( v->data[i] < a ) { v->data[i] = a; } else if ( v->data[i] > b ) v->data[i] = b; } #endif }

/** process, all real work is done here. */ void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *i, void *o, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { // this is called for preview and full pipe separately, each with its own pixelpipe piece. assert(dt_iop_module_colorspace(self) == iop_cs_Lab); // get our data struct: dt_iop_colorcontrast_params_t *d = (dt_iop_colorcontrast_params_t *)piece->data; // how many colors in our buffer? const int ch = piece->colors; // iterate over all output pixels (same coordinates as input) #ifdef _OPENMP // optional: parallelize it! #pragma omp parallel for default(none) schedule(static) shared(i,o,roi_in,roi_out,d) #endif for(int j=0; j<roi_out->height; j++) { float *in = ((float *)i) + ch*roi_in->width *j; float *out = ((float *)o) + ch*roi_out->width*j; const __m128 scale = _mm_set_ps(0.0f,d->b_steepness,d->a_steepness,1.0f); const __m128 offset = _mm_set_ps(0.0f,d->b_offset,d->a_offset,0.0f); const __m128 min = _mm_set_ps(0.0f,-128.0f,-128.0f, -INFINITY); const __m128 max = _mm_set_ps(0.0f, 128.0f, 128.0f, INFINITY); for(int i=0; i<roi_out->width; i++) { _mm_stream_ps(out,_mm_min_ps(max,_mm_max_ps(min,_mm_add_ps(offset,_mm_mul_ps(scale,_mm_load_ps(in)))))); in+=ch; out+=ch; } } _mm_sfence(); }

/*
 * Upper clamp: dest[i] = (src1[i] > *src2_1) ? *src2_1 : src1[i]
 * for i in [0, n).
 *
 * The source is read with unaligned loads; the destination is aligned to 16
 * bytes by a scalar prologue so the vector loop can use aligned stores.
 */
static void
clamphigh_f32_sse (float *dest, const float *src1, int n, const float *src2_1)
{
  const float max = *src2_1;

  /* Scalar prologue until dest sits on a 16-byte boundary. */
  while (((long)dest & 15) && (n > 0)) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
    --n;
  }

  /* Vector body: four floats at a time. */
  const __m128 limit = _mm_set_ps1(max);
  while (n >= 4) {
    _mm_store_ps(dest, _mm_min_ps(_mm_loadu_ps(src1), limit));
    dest += 4;
    src1 += 4;
    n -= 4;
  }

  /* Scalar epilogue for the last (fewer than four) elements. */
  while (n > 0) {
    float x = *src1++;
    if (x > max)
      x = max;
    *dest++ = x;
    --n;
  }
}

__m128 exp_128( const __m128& x) { //! Clip the value __m128 y = _mm_max_ps(_mm_min_ps(x, _mm_set1_ps(88.3762626647949f)), _mm_set1_ps(-88.3762626647949f)); //! Express exp(x) as exp(g + n * log(2)) __m128 fx = y * _mm_set1_ps(1.44269504088896341) + _mm_set1_ps(0.5f); //! Floor const __m128 tmp = _mm_round_ps(fx, _MM_FROUND_TO_ZERO); //! If greater, substract 1 const __m128 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), _mm_set1_ps(1.f)); fx = tmp - mask; y -= fx * _mm_set1_ps(0.693359375 - 2.12194440e-4); const __m128 z = y * y; const __m128 t = (((((_mm_set1_ps(1.9875691500E-4) * y + _mm_set1_ps(1.3981999507E-3)) * y + _mm_set1_ps(8.3334519073E-3)) * y + _mm_set1_ps(4.1665795894E-2)) * y + _mm_set1_ps(1.6666665459E-1)) * y + _mm_set1_ps(5.0000001201E-1)) * z + y + _mm_set1_ps(1.f); //! Build 2^n const __m128i emm0 = _mm_add_epi32(_mm_cvttps_epi32(fx), _mm_set1_epi32(0x7f)); //! Return the result return t * _mm_castsi128_ps(_mm_slli_epi32(emm0, 23)); }

/*
 * Adds one pixel to an interleaved histogram.
 *
 * The four floats at `pixel` (16-byte aligned) are multiplied by bins_count,
 * clamped to [0, bins_count - 1] and converted to integer bin indexes. The
 * counters for the first three channels are then incremented; channel c of
 * bin b lives at histogram[4 * b + c].
 */
static void inline histogram_helper_cs_rgb_helper_process_pixel_m128(
    const dt_dev_histogram_collection_params_t *const histogram_params, const float *pixel, uint32_t *histogram)
{
  assert(dt_is_aligned(pixel, 16));

  const float fscale = (float)(histogram_params->bins_count);
  const __m128 upper = _mm_set1_ps(histogram_params->bins_count - 1);

  /* Scale to bin units and clamp into the valid index range. */
  const __m128 scaled = _mm_mul_ps(_mm_load_ps(pixel), _mm_set1_ps(fscale));
  const __m128 clamped = _mm_max_ps(_mm_min_ps(scaled, upper), _mm_setzero_ps());

  /* Spill the converted indexes so they can be read one by one. */
  __m128i bins __attribute__((aligned(16)));
  _mm_store_si128(&bins, _mm_cvtps_epi32(clamped));
  const uint32_t *bin = (uint32_t *)(&bins);

  for(int channel = 0; channel < 3; channel++)
    histogram[4 * bin[channel] + channel]++;
}

// Computes the member-wise minimum of two arrays of SSE packed singles
// (4 floats each) into a third array.
// `taille` is the number of vectors, not the number of floats.
// The arrays must be aligned on 16-byte boundaries.
void minimumVecteur_Et_Dans_DeTaille(__m128 *source1, __m128 *source2, __m128 *destination, int taille)
{
    for (int indice = 0; indice < taille; ++indice)
    {
        destination[indice] = _mm_min_ps(source1[indice], source2[indice]);
    }
}

static inline __m128 exp2f4(__m128 x) { __m128i ipart; __m128 fpart, expipart, expfpart; x = _mm_min_ps(x, _mm_load_ps(_one29_ps)); x = _mm_max_ps(x, _mm_load_ps(_minusone27_ps)); /* ipart = int(x - 0.5) */ ipart = _mm_cvtps_epi32(_mm_sub_ps(x, _mm_load_ps(_half_ps))); /* fpart = x - ipart */ fpart = _mm_sub_ps(x, _mm_cvtepi32_ps(ipart)); /* expipart = (float) (1 << ipart) */ expipart = _mm_castsi128_ps(_mm_slli_epi32(_mm_add_epi32(ipart, _mm_load_si128((__m128i*)_one27)), 23)); /* minimax polynomial fit of 2**x, in range [-0.5, 0.5[ */ #if EXP_POLY_DEGREE == 5 expfpart = POLY5(fpart, exp_p5_0, exp_p5_1, exp_p5_2, exp_p5_3, exp_p5_4, exp_p5_5); #elif EXP_POLY_DEGREE == 4 expfpart = POLY4(fpart, exp_p4_0, exp_p4_1, exp_p4_2, exp_p4_3, exp_p4_4); #elif EXP_POLY_DEGREE == 3 expfpart = POLY3(fpart, exp_p3_0, exp_p3_1, exp_p3_2, exp_p3_3); #elif EXP_POLY_DEGREE == 2 expfpart = POLY2(fpart, exp_p2_0, exp_p2_1, exp_p2_2); #else #error #endif return _mm_mul_ps(expipart, expfpart); }

// ---------------------------------------------------------- // Name: matrix::MinValue // Desc: Returns the asbolute minimum element of the // matrix. // ---------------------------------------------------------- float matrix::MinValue() { #ifdef _M_IX86 F32vec4 min1 = _mm_min_ps(_mm_abs_ps(_L1), _mm_abs_ps(_L2)); F32vec4 min2 = _mm_min_ps(_mm_abs_ps(_L3), _mm_abs_ps(_L4)); F32vec4 min = _mm_min_ps(min1, min2); min = _mm_min_ps(min, _mm_movehl_ps(min,min)); min = _mm_min_ss(min, _mm_shuffle_ps(min,min,0x01)); return min[0]; #else float min = this->operator()(0, 0); for (int i = 0; i < 4; ++i) for (int j = 0; j < 4; ++j) if (this->operator()(i, j) < min) min = this->operator()(i, j); return min; #endif // _M_IX86 }

// Inverse display function for four values at once: clamps L to
// [L_offset, L_offset + L_max], normalises it as
// (L - L_offset) / (L_max - L_black) and raises the result to 1/gamma.
v4sf DisplayFunctionGGBA::inv_display( v4sf L )
{
    const v4sf voffset = _mm_set1_ps(L_offset);
    const v4sf vmax = _mm_set1_ps(L_max);

    const v4sf clamped = _mm_min_ps(_mm_max_ps(L, voffset), voffset+vmax);
    const v4sf normalised = (clamped - voffset)/(vmax-_mm_set1_ps(L_black));

    return _mm_pow_ps(normalised, _mm_set1_ps(1.0f/gamma));
}

// "Min" layer mode: takes the channel-wise minimum of front and back, hands
// it together with the back pixel to lerp(), and returns the four resulting
// channels as a pixel.
// NOTE(review): both pixels are read with aligned loads, so pixel_t is
// assumed to be 16-byte aligned -- confirm.
pixel_t min_layer_mode_fun::operator()( const pixel_t& back, const pixel_t& front) const
{
    __m128 backChannels  = _mm_load_ps( reinterpret_cast<const float*>( &back));
    __m128 frontChannels = _mm_load_ps( reinterpret_cast<const float*>( &front));

    __m128 blended = lerp( _mm_min_ps( frontChannels, backChannels), backChannels);

    float *channel = reinterpret_cast<float*>( &blended);
    return pixel_t( channel[0], channel[1], channel[2], channel[3]);
}

static float Atan(float y, float x) // #include <emmintrin.h>, #include <xmmintrin.h> { const __m128 _ps_atan_p0 = _mm_set1_ps(-0.0464964749f); const __m128 _ps_atan_p1 = _mm_set1_ps(0.15931422f); const __m128 _ps_atan_p2 = _mm_set1_ps(0.327622764f); const __m128 _ps_pi = _mm_set1_ps(pi); const __m128 _ps_pi0p5 = _mm_set1_ps(pi0p5); const __m128 _mask_sign_raw = _mm_castsi128_ps(_mm_set1_epi32( 0x80000000)); const __m128 _mask_sign_inv = _mm_castsi128_ps(_mm_set1_epi32(~0x80000000)); __m128 mm1, mm2, mm3; __m128 axm, aym; __m128 xm = _mm_set1_ps(x); __m128 ym = _mm_set1_ps(y); axm = _mm_and_ps(xm, _mask_sign_inv); aym = _mm_and_ps(ym, _mask_sign_inv); mm1 = _mm_min_ps(axm, aym); mm2 = _mm_max_ps(axm, aym); mm1 = _mm_div_ps(mm1, mm2); mm2 = _mm_mul_ps(mm1, mm1); mm3 = _mm_mul_ps(mm2, _ps_atan_p0); mm3 = _mm_add_ps(mm3, _ps_atan_p1); mm3 = _mm_mul_ps(mm3, mm2); mm3 = _mm_sub_ps(mm3, _ps_atan_p2); mm3 = _mm_mul_ps(mm3, mm2); mm3 = _mm_mul_ps(mm3, mm1); mm3 = _mm_add_ps(mm3, mm1); __m128 mask; /* |y| > |x| */ mask = _mm_cmpgt_ss(aym, axm); mm2 = _mm_and_ps(_ps_pi0p5, mask); mm1 = _mm_and_ps(_mask_sign_raw, mask); mm3 = _mm_xor_ps(mm3, mm1); mm3 = _mm_add_ps(mm3, mm2); /* x < 0 */ mask = _mm_and_ps(xm, _mask_sign_raw); mm3 = _mm_xor_ps(mm3, mask); mm1 = _mm_castsi128_ps(_mm_srai_epi32(_mm_castps_si128(mm3), 30)); mm1 = _mm_and_ps(_ps_pi, mm1); mm3 = _mm_add_ps(mm3, mm1); /* y < 0 */ mm1 = _mm_and_ps(ym, _mask_sign_raw); mm3 = _mm_xor_ps(mm3, mm1); return _mm_cvtss_f32(mm3); }

// SIMD min __SIMD _SIMD_min_ps(__SIMD a, __SIMD b) { #ifdef USE_SSE return _mm_min_ps(a,b); #elif defined USE_AVX return _mm256_min_ps(a,b); #elif defined USE_IBM return vec_min(a,b); #endif }

/*
 * Grows `box` so that it also contains the cell that starts at `dot` and
 * extends by vox_voxel along every lane: box->min becomes
 * min(box->min, dot) and box->max becomes max(box->max, dot + vox_voxel).
 * All operands are accessed with aligned loads/stores.
 */
static void update_bounding_box (struct vox_box *box, const vox_dot dot)
{
    const __v4sf cell_min = _mm_load_ps (dot);
    const __v4sf cell_max = cell_min + _mm_load_ps (vox_voxel);

    const __v4sf grown_min = _mm_min_ps (cell_min, _mm_load_ps (box->min));
    const __v4sf grown_max = _mm_max_ps (cell_max, _mm_load_ps (box->max));

    _mm_store_ps (box->min, grown_min);
    _mm_store_ps (box->max, grown_max);
}

// Replaces every component of this vector with the smaller of itself and the
// matching component of `other`.
INLINE void SVec4::Min(const SVec4 &other)
{
#if defined(USE_SSE)
    // One instruction covers all four components.
    m_128 = _mm_min_ps( m_128, other.m_128 );
#else
    m_x = math::Min(m_x, other.X());
    m_y = math::Min(m_y, other.Y());
    m_z = math::Min(m_z, other.Z());
    m_w = math::Min(m_w, other.W());
#endif
}

// Converts two consecutive floats to Tout: each value is clamped to the
// limits reported by GDALGetDataLimits<float, Tout>, converted to a 32-bit
// integer with _mm_cvtss_si32 and stored in pValueOut[0] / pValueOut[1].
// NOTE(review): the template header declaring Tout is not part of this
// excerpt.
inline void GDALCopy2WordsSSE(const float* pValueIn, Tout* const &pValueOut)
{
    float fMaxVal, fMinVal;
    GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);

    // Only the two low lanes carry data; the upper lanes are zero.
    const __m128 input = _mm_set_ps(0, 0, pValueIn[1], pValueIn[0]);
    const __m128 lowerBound = _mm_set_ps(0, 0, fMinVal, fMinVal);
    const __m128 upperBound = _mm_set_ps(0, 0, fMaxVal, fMaxVal);

    const __m128 clamped = _mm_min_ps(_mm_max_ps(input, lowerBound), upperBound);

    pValueOut[0] = _mm_cvtss_si32(clamped);
    // Bring lane 1 down to lane 0 for the second conversion.
    const __m128 second = _mm_shuffle_ps(clamped, clamped, _MM_SHUFFLE(0, 0, 0, 1));
    pValueOut[1] = _mm_cvtss_si32(second);
}

static inline __m128 curve_vec4( const __m128 x, const __m128 g, const __m128 sigma, const __m128 shadows, const __m128 highlights, const __m128 clarity) { // TODO: pull these non-data depedent constants out of the loop to see // whether the compiler fail to do so const __m128 const0 = _mm_set_ps1(0x3f800000u); const __m128 const1 = _mm_set_ps1(0x402DF854u); // for e^x const __m128 sign_mask = _mm_set1_ps(-0.f); // -0.f = 1 << 31 const __m128 one = _mm_set1_ps(1.0f); const __m128 two = _mm_set1_ps(2.0f); const __m128 twothirds = _mm_set1_ps(2.0f/3.0f); const __m128 twosig = _mm_mul_ps(two, sigma); const __m128 sigma2 = _mm_mul_ps(sigma, sigma); const __m128 s22 = _mm_mul_ps(twothirds, sigma2); const __m128 c = _mm_sub_ps(x, g); const __m128 select = _mm_cmplt_ps(c, _mm_setzero_ps()); // select shadows or highlights as multiplier for linear part, based on c < 0 const __m128 shadhi = _mm_or_ps(_mm_andnot_ps(select, shadows), _mm_and_ps(select, highlights)); // flip sign bit of sigma based on c < 0 (c < 0 ? - sigma : sigma) const __m128 ssigma = _mm_xor_ps(sigma, _mm_and_ps(select, sign_mask)); // this contains the linear parts valid for c > 2*sigma or c < - 2*sigma const __m128 vlin = _mm_add_ps(g, _mm_add_ps(ssigma, _mm_mul_ps(shadhi, _mm_sub_ps(c, ssigma)))); const __m128 t = _mm_min_ps(one, _mm_max_ps(_mm_setzero_ps(), _mm_div_ps(c, _mm_mul_ps(two, ssigma)))); const __m128 t2 = _mm_mul_ps(t, t); const __m128 mt = _mm_sub_ps(one, t); // midtone value fading over to linear part, without local contrast: const __m128 vmid = _mm_add_ps(g, _mm_add_ps(_mm_mul_ps(_mm_mul_ps(ssigma, two), _mm_mul_ps(mt, t)), _mm_mul_ps(t2, _mm_add_ps(ssigma, _mm_mul_ps(ssigma, shadhi))))); // c > 2*sigma? 
const __m128 linselect = _mm_cmpgt_ps(_mm_andnot_ps(sign_mask, c), twosig); const __m128 val = _mm_or_ps(_mm_and_ps(linselect, vlin), _mm_andnot_ps(linselect, vmid)); // midtone local contrast // dt_fast_expf in sse: const __m128 arg = _mm_xor_ps(sign_mask, _mm_div_ps(_mm_mul_ps(c, c), s22)); const __m128 k0 = _mm_add_ps(const0, _mm_mul_ps(arg, _mm_sub_ps(const1, const0))); const __m128 k = _mm_max_ps(k0, _mm_setzero_ps()); const __m128i ki = _mm_cvtps_epi32(k); const __m128 gauss = _mm_load_ps((float*)&ki); const __m128 vcon = _mm_mul_ps(clarity, _mm_mul_ps(c, gauss)); return _mm_add_ps(val, vcon); }

/*
 * Lane-wise single-precision exponential.
 *
 * The argument is clamped to [exp_lo, exp_hi], written as g + n*log(2) with
 * integer n, exp(g) is evaluated with a degree-5 polynomial and the result is
 * multiplied by 2^n assembled directly in the exponent field. All numeric
 * constants come from the `constants` namespace.
 */
__m128 exp_ps(__m128 x)
{
    const __m128 one = constants::ps_1.ps;

    x = _mm_max_ps(_mm_min_ps(x, constants::exp_hi.ps), constants::exp_lo.ps);

    /* express exp(x) as exp(g + n*log(2)) */
    const __m128 scaled = _mm_add_ps(_mm_mul_ps(x, constants::cephes_LOG2EF.ps),
                                     constants::ps_0p5.ps);

    /* floorf with SSE: truncate, then subtract 1 where that rounded upwards */
    const __m128 truncated = _mm_cvtepi32_ps(_mm_cvttps_epi32(scaled));
    const __m128 roundedUp = _mm_and_ps(_mm_cmpgt_ps(truncated, scaled), one);
    const __m128 n = _mm_sub_ps(truncated, roundedUp);

    /* g = x - n*log(2), with log(2) split in two parts */
    const __m128 nTimesC1 = _mm_mul_ps(n, constants::cephes_exp_C1.ps);
    const __m128 nTimesC2 = _mm_mul_ps(n, constants::cephes_exp_C2.ps);
    x = _mm_sub_ps(x, nTimesC1);
    x = _mm_sub_ps(x, nTimesC2);

    const __m128 xSquared = _mm_mul_ps(x,x);

    /* Horner evaluation of the polynomial */
    __m128 y = constants::cephes_exp_p0.ps;
    y = _mm_add_ps(_mm_mul_ps(y, x), constants::cephes_exp_p1.ps);
    y = _mm_add_ps(_mm_mul_ps(y, x), constants::cephes_exp_p2.ps);
    y = _mm_add_ps(_mm_mul_ps(y, x), constants::cephes_exp_p3.ps);
    y = _mm_add_ps(_mm_mul_ps(y, x), constants::cephes_exp_p4.ps);
    y = _mm_add_ps(_mm_mul_ps(y, x), constants::cephes_exp_p5.ps);
    y = _mm_add_ps(_mm_mul_ps(y, xSquared), x);
    y = _mm_add_ps(y, one);

    /* build 2^n */
    __m128i exponent = _mm_cvttps_epi32(n);
    exponent = _mm_add_epi32(exponent, constants::pi32_0x7f.pi);
    exponent = _mm_slli_epi32(exponent, 23);

    return _mm_mul_ps(y, _mm_castsi128_ps(exponent));
}

// Returns a new vector holding, per component, the smaller of the two
// arguments.
INLINE SVec4 SVec4::Min(const SVec4 &vec1, const SVec4 &vec2)
{
#if defined(USE_SSE)
    // One instruction covers all four components.
    return SVec4( _mm_min_ps( vec1.m_128, vec2.m_128 ) );
#else
    return{
        math::Min(vec1.X(), vec2.X()),
        math::Min(vec1.Y(), vec2.Y()),
        math::Min(vec1.Z(), vec2.Z()),
        math::Min(vec1.W(), vec2.W())
    };
#endif
}

/*
 * Clips every sample of the input at `clip` (out = min(clip, in)).
 *
 * Raw mosaic buffers (piece->pipe->dsc.filters != 0) are treated as one flat
 * array of height * width floats, four at a time, with a scalar tail. Other
 * buffers are processed pixel by pixel with piece->colors floats per pixel;
 * four floats are read and written per pixel.
 *
 * Fix: the vector/tail split used `n & ~3u`. `~3u` is a 32-bit value that is
 * zero-extended to size_t, so the mask also cleared the upper 32 bits of n.
 * The mask is now built at size_t width.
 */
static void process_clip_sse2(dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *const ovoid,
                              const dt_iop_roi_t *const roi_in, const dt_iop_roi_t *const roi_out,
                              const float clip)
{
  if(piece->pipe->dsc.filters)
  { // raw mosaic
    const __m128 clipm = _mm_set1_ps(clip);
    const size_t n = (size_t)roi_out->height * roi_out->width;
    float *const out = (float *)ovoid;
    float *const in = (float *)ivoid;
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none)
#endif
    for(size_t j = 0; j < (n & ~(size_t)3); j += 4)
      _mm_stream_ps(out + j, _mm_min_ps(clipm, _mm_load_ps(in + j)));
    // make the non-temporal stores visible
    _mm_sfence();
    // let's see if there's a non-multiple of four rest to process:
    if(n & 3)
      for(size_t j = n & ~(size_t)3; j < n; j++) out[j] = MIN(clip, in[j]);
  }
  else
  {
    const __m128 clipm = _mm_set1_ps(clip);
    const int ch = piece->colors;

#ifdef _OPENMP
#pragma omp parallel for default(none) schedule(static)
#endif
    for(int j = 0; j < roi_out->height; j++)
    {
      float *out = (float *)ovoid + (size_t)ch * roi_out->width * j;
      float *in = (float *)ivoid + (size_t)ch * roi_in->width * j;
      for(int i = 0; i < roi_out->width; i++, in += ch, out += ch)
      {
        // gather the four floats explicitly; the store is a streaming one
        _mm_stream_ps(out, _mm_min_ps(clipm, _mm_set_ps(in[3], in[2], in[1], in[0])));
      }
    }
    _mm_sfence();
  }
}

void conv_filter_sse(int imgHeight, int imgWidth, int imgHeightF, int imgWidthF, int imgFOfssetH, int imgFOfssetW, float* filter, float *imgFloatSrc, float *imgFloatDst) { //1. const register __declspec(align(16)) auto const_0 = _mm_set_ps(0.0, 0.0, 0.0, 0.0); //2. const register __declspec(align(16)) auto const_255 = _mm_set_ps(255.0, 255.0, 255.0, 255.0); //3. __declspec(align(16)) __m128 filter_l[FILTER_SIZE]; #pragma omp parallel for for (auto i = 0; i < FILTER_SIZE; i++) { //mind a 4 floatba ugyanazt tölti // float -> m128 konverzió filter_l[i] = _mm_load_ps1(filter + i); } const auto rw_base = (imgFOfssetW + imgFOfssetH * imgWidthF) << 2; const auto imgWidthbyte = imgWidth << 2; const auto imgWidthFbyte = imgWidthF << 2; const auto imgLengthbyte = imgHeight * imgWidthbyte; //4. register __declspec(align(16)) __m128 a_sse; //8. reg register __declspec(align(16)) __m128 r_sse; #pragma omp parallel for for (auto row = 0; row < imgLengthbyte; row += 4) { // RGBA komponensek akkumulátora r_sse = _mm_setzero_ps(); // konvolúció minden komponensre for (auto y = 0; y < FILTER_H; y++ ) { r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (y * imgWidthFbyte)), filter_l[5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (4 + y * imgWidthFbyte)), filter_l[1 + 5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (8 + y * imgWidthFbyte)), filter_l[2 + 5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (12 + y * imgWidthFbyte)), filter_l[3 + 5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (16 + y * imgWidthFbyte)), filter_l[4 + 5 * y])); } a_sse = _mm_load_ps(imgFloatSrc + row + 8 + 2 * imgWidthFbyte); //számítás eredményének limitálása 0-255 közé // kimenetí pixel írása _mm_store_ps(imgFloatDst + rw_base + row, _mm_min_ps(const_255, _mm_add_ps(a_sse, _mm_max_ps(const_0, _mm_sub_ps(a_sse, _mm_min_ps(const_255, _mm_max_ps(const_0, r_sse))))))); } 
}

/* Adjust MBR to fit all child MBRs */ inline void adjustMbrArraySTRNode(ArraySTRNode nodes[], ulong_t cur) { ArraySTRNode *node, *child; ulong_t k; node = &nodes[cur]; child = &nodes[node->pos]; /* enlarge mbr to include all childlen's mbr */ #ifdef ENABLE_SSE_ADJUST { __m128 v_nlow = _mm_load_ps(child[0].mbr.low); __m128 v_nupp = _mm_load_ps(child[0].mbr.upp); for (k = 1; k < node->len; k++) { v_nlow = _mm_min_ps(v_nlow, _mm_load_ps(child[k].mbr.low)); v_nupp = _mm_max_ps(v_nupp, _mm_load_ps(child[k].mbr.upp)); } _mm_store_ps(node->mbr.low, v_nlow); _mm_store_ps(node->mbr.upp, v_nupp); } #else #ifdef ENABLE_AVX_TEST1 { __m256 v_nmbr = _mm256_loadu_ps((float *)&child[0].mbr); for (k = 1; k < node->len; k++) { __m256 v_cmbr = _mm256_loadu_ps((float *)&child[k].mbr); __m256 v_min = _mm256_min_ps(v_nmbr, v_cmbr); __m256 v_max = _mm256_max_ps(v_nmbr, v_cmbr); v_nmbr = _mm256_permute2f128_ps(v_min, v_max, 0x12); } _mm256_storeu_ps((float *)&node->mbr, v_nmbr); } #else /* copy first child's mbr */ node->mbr = child[0].mbr; for (k = 1; k < node->len; k++) { int i; for (i = 0; i < NDIMS; i++) { if (node->mbr.low[i] > child[k].mbr.low[i]) node->mbr.low[i] = child[k].mbr.low[i]; if (node->mbr.upp[i] < child[k].mbr.upp[i]) node->mbr.upp[i] = child[k].mbr.upp[i]; } } #endif #endif }

// Sorts the eight floats of `inout` in ascending order, in place.
//
// Three SIMD min/max rounds (each followed by a lane shuffle) perform the
// first comparator layers of a sorting network on two 4-float registers; the
// remaining seven compare-exchange steps are done in scalar code.
//
// Fix: the array used to be reinterpreted as `__m128&`, which requires
// 16-byte alignment that a `float[8]` does not guarantee. The data is now
// moved with unaligned loads/stores, so any float array is accepted.
static void foo( float(&inout)[8])
{
    __m128 a = _mm_loadu_ps(&inout[0]);
    __m128 b = _mm_loadu_ps(&inout[4]);

    __m128 min = _mm_min_ps(a, b);
    __m128 max = _mm_max_ps(a, b);
    a = _mm_shuffle_ps(min, max, _MM_SHUFFLE(0, 1, 0, 1));
    b = _mm_shuffle_ps(min, max, _MM_SHUFFLE(2, 3, 2, 3));

    min = _mm_min_ps(a, b);
    max = _mm_max_ps(a, b);
    a = _mm_shuffle_ps(min, max, _MM_SHUFFLE(0, 2, 0, 2));
    b = _mm_shuffle_ps(min, max, _MM_SHUFFLE(1, 3, 1, 3));

    min = _mm_min_ps(a, b);
    max = _mm_max_ps(a, b);
    a = _mm_shuffle_ps(min, max, _MM_SHUFFLE(0, 1, 0, 1));
    b = _mm_shuffle_ps(min, max, _MM_SHUFFLE(2, 3, 2, 3));

    // The scalar stage below works on the array, so write the registers back.
    _mm_storeu_ps(&inout[0], a);
    _mm_storeu_ps(&inout[4], b);

    // Remaining comparators; positions 0 and 7 already hold the overall
    // minimum and maximum.
    const size_t idx[][2] = {
        { 1, 2 },{ 5, 6 },
        { 2, 4 },{ 3, 5 },
        { 1, 2 },{ 3, 4 },{ 5, 6 }
    };
    for (size_t i = 0; i < sizeof(idx) / sizeof(idx[0]); ++i) {
        const float x = inout[idx[i][0]];
        const float y = inout[idx[i][1]];
        inout[idx[i][0]] = std::min(x, y);
        inout[idx[i][1]] = std::max(x, y);
    }
}