static void bar(float (&inout)[8])
{
    __m128 first, second, cmp1, cmp2, res1, res2, res3, res4;
    float result1[4], result2[4];
    // Sorting-network pairs; 24 = sizeof(idx) / sizeof(idx[0]).
    const size_t idx[][2] = {
        {0, 1}, {3, 2}, {4, 5}, {7, 6},
        {0, 2}, {1, 3}, {6, 4}, {7, 5},
        {0, 1}, {2, 3}, {5, 4}, {7, 6},
        {0, 4}, {1, 5}, {2, 6}, {3, 7},
        {0, 2}, {1, 3}, {4, 6}, {5, 7},
        {0, 1}, {2, 3}, {4, 5}, {6, 7}
    };
    for (int i = 0; i < 24; i += 4) {
        // first and second pack the compare-exchange pairs i .. i+3.
        // The operand order is reversed because _mm_set_ps stores its
        // arguments from the highest lane down to the lowest.
        first  = _mm_set_ps(inout[idx[i+3][0]], inout[idx[i+2][0]], inout[idx[i+1][0]], inout[idx[i][0]]);
        second = _mm_set_ps(inout[idx[i+3][1]], inout[idx[i+2][1]], inout[idx[i+1][1]], inout[idx[i][1]]);
        // cmpge rather than cmpgt: with cmpgt, an array containing equal
        // elements (e.g. [1, 1, 1, 1, 1]) would produce all-zero masks in
        // both directions and the values would be zeroed out.
        cmp1 = _mm_cmpge_ps(first, second);
        cmp2 = _mm_cmpge_ps(second, first);
        // Branchless compare-exchange:
        //   x = (c & y) | (!c & x)
        //   y = (c & x) | (!c & y)
        // where x and y are the paired elements and c = (x >= y).
        res1 = _mm_and_ps(second, cmp1);
        res2 = _mm_and_ps(first, cmp2);
        res3 = _mm_and_ps(first, cmp1);
        res4 = _mm_and_ps(second, cmp2);
        first  = _mm_or_ps(res1, res2);
        second = _mm_or_ps(res3, res4);
        // Scatter the (possibly swapped) pairs back to their positions.
        _mm_storeu_ps(result1, first);
        _mm_storeu_ps(result2, second);
        for (int j = 0; j < 4; ++j) {
            inout[idx[i+j][0]] = result1[j];
            inout[idx[i+j][1]] = result2[j];
        }
    }
}
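// A minimal standalone sketch (not from the original source) of the branchless
// select the formula above implements: select(c, x, y) = (c & x) | (!c & y)
// returns x in the lanes where the mask c is all-1s and y where it is all-0s.
#include <xmmintrin.h>

static inline __m128 select_ps(__m128 c, __m128 x, __m128 y)
{
    return _mm_or_ps(_mm_and_ps(c, x), _mm_andnot_ps(c, y));
}

// With it, one compare-exchange step of the network collapses to:
//   __m128 c  = _mm_cmpge_ps(first, second);
//   __m128 lo = select_ps(c, second, first); // min of each pair
//   __m128 hi = select_ps(c, first, second); // max of each pair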
static inline void boxVerticesToScreenVertices(vec4f vertices[8], const vec4f& screenCenterMul, vec2i screenCenter) {
#ifdef ARPHEG_ARCH_X86
    __m128 screenSpaceMul = _mm_load_ps((float*)&screenCenterMul.x);
    __m128 screenCenterOffset = _mm_setr_ps(float(screenCenter.x), float(screenCenter.y), 0, 0);
    __m128 nearClip = _mm_setzero_ps();
    for (uint32 i = 0; i < 8; ++i) {
        __m128 hv = _mm_load_ps((float*)(vertices + i));
        __m128 w = _mm_shuffle_ps(hv, hv, _MM_SHUFFLE(3, 3, 3, 3)); // broadcast the w component
        __m128 z = _mm_shuffle_ps(hv, hv, _MM_SHUFFLE(2, 2, 2, 2)); // broadcast the z component
        hv = _mm_div_ps(hv, w);                  // perspective divide by w (clip space -> NDC)
        hv = _mm_mul_ps(hv, screenSpaceMul);     // XY to screen space [-width/2,-height/2 .. width/2,height/2]
        hv = _mm_add_ps(hv, screenCenterOffset); // XY to screen space [0,0 .. width,height]
        __m128 mNoNearClip = _mm_cmpge_ps(z, nearClip); // set to all-0 if near-clipped
        hv = _mm_and_ps(hv, mNoNearClip);
        _mm_store_ps((float*)(vertices + i), hv);
    }
#else
    //TODO
    ScreenSpaceVertex* screenVerts = (ScreenSpaceVertex*)vertices;
    for (uint32 i = 0; i < 8; ++i) {
        vertices[i] = vertices[i] * (1.0f / vertices[i].w);
        auto v = vertices[i] * screenCenterMul;
        screenVerts[i].pos = vec2i(int32(v.x), int32(v.y)) + screenCenter;
    }
#endif
}
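// A minimal sketch (not from the original source) of the lane broadcast used
// above: shuffling a register with itself and _MM_SHUFFLE(3,3,3,3) replicates
// lane 3 (the w component) into all four lanes.
#include <xmmintrin.h>

static inline __m128 broadcast_w(__m128 v)
{
    return _mm_shuffle_ps(v, v, _MM_SHUFFLE(3, 3, 3, 3));
}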
SIMDValue SIMDFloat32x4Operation::OpGreaterThanOrEqual(const SIMDValue& aValue, const SIMDValue& bValue)
{
    X86SIMDValue x86Result;
    X86SIMDValue tmpaValue = X86SIMDValue::ToX86SIMDValue(aValue);
    X86SIMDValue tmpbValue = X86SIMDValue::ToX86SIMDValue(bValue);

    x86Result.m128_value = _mm_cmpge_ps(tmpaValue.m128_value, tmpbValue.m128_value); // a >= b?

    return X86SIMDValue::ToSIMDValue(x86Result);
}
F32 Aabb::testPlane(const Plane& p) const
{
    const Aabb& aabb = *this;
#if ANKI_SIMD == ANKI_SIMD_SSE
    __m128 gezero = _mm_cmpge_ps(p.getNormal().getSimd(), _mm_setzero_ps());

    Vec4 diagMin;
    diagMin.getSimd() = _mm_or_ps(_mm_and_ps(gezero, aabb.getMin().getSimd()),
                                  _mm_andnot_ps(gezero, aabb.getMax().getSimd()));
#else
    Vec4 diagMin(0.0), diagMax(0.0);
    // Set min/max values for the x, y, z directions
    for(U i = 0; i < 3; i++)
    {
        if(p.getNormal()[i] >= 0.0)
        {
            diagMin[i] = aabb.getMin()[i];
            diagMax[i] = aabb.getMax()[i];
        }
        else
        {
            diagMin[i] = aabb.getMax()[i];
            diagMax[i] = aabb.getMin()[i];
        }
    }
#endif

    // Minimum on positive side of plane, box on positive side
    ANKI_ASSERT(diagMin.w() == 0.0);
    F32 test = p.test(diagMin);
    if(test > 0.0)
    {
        return test;
    }

#if ANKI_SIMD == ANKI_SIMD_SSE
    Vec4 diagMax;
    diagMax.getSimd() = _mm_or_ps(_mm_and_ps(gezero, aabb.getMax().getSimd()),
                                  _mm_andnot_ps(gezero, aabb.getMin().getSimd()));
#endif

    ANKI_ASSERT(diagMax.w() == 0.0);
    test = p.test(diagMax);
    if(test >= 0.0)
    {
        // min on non-positive side, max on non-negative side: intersection
        return 0.0;
    }
    else
    {
        // max on negative side, box on negative side
        return test;
    }
}
void process(struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void *const ivoid, void *ovoid,
             const dt_iop_roi_t *roi_in, const dt_iop_roi_t *const roi_out)
{
  dt_develop_t *dev = self->dev;

  const int ch = piece->colors;

  const __m128 upper = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f,
                                  dev->overexposed.upper / 100.0f);
  const __m128 lower = _mm_set_ps(FLT_MAX,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f,
                                  dev->overexposed.lower / 100.0f);

  const int colorscheme = dev->overexposed.colorscheme;
  const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]);
  const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]);

#ifdef _OPENMP
#pragma omp parallel for default(none) shared(ovoid) schedule(static)
#endif
  for(int k = 0; k < roi_out->height; k++)
  {
    const float *in = ((float *)ivoid) + (size_t)ch * k * roi_out->width;
    float *out = ((float *)ovoid) + (size_t)ch * k * roi_out->width;

    for(int j = 0; j < roi_out->width; j++, in += 4, out += 4)
    {
      const __m128 pixel = _mm_load_ps(in);

      // Overexposed if *any* channel exceeds the upper threshold:
      // the unpack/OR pairs horizontally OR the mask across all four lanes.
      __m128 isoe = _mm_cmpge_ps(pixel, upper);
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));
      isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe));

      // Underexposed if *all* channels fall below the lower threshold:
      // same reduction, but with AND.
      __m128 isue = _mm_cmple_ps(pixel, lower);
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));
      isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue));

      __m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel), _mm_and_ps(isoe, upper_color));
      result = _mm_or_ps(_mm_andnot_ps(isue, result), _mm_and_ps(isue, lower_color));

      _mm_stream_ps(out, result);
    }
  }
  _mm_sfence();

  if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
// a >= b
void _SIMD_cmpge_ps(__SIMD a, __SIMD b, void** resultPtr)
{
  // The caller takes ownership of the malloc'd result.
  __SIMD* result = (__SIMD*)malloc(sizeof(__SIMD));
  *resultPtr = result;
#ifdef USE_SSE
  *result = _mm_cmpge_ps(a, b);
#elif defined USE_AVX
  *result = _mm256_cmp_ps(a, b, _CMP_GE_OQ); // predicate 29: greater-or-equal, ordered, quiet
#elif defined USE_IBM
  *result = vec_cmpge(a, b);
#endif
}
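// A hedged usage sketch (not from the original source): the wrapper above
// heap-allocates its result, so the caller owns the pointer and must free()
// it. This assumes the USE_SSE configuration, where __SIMD is __m128.
#include <stdlib.h>
#include <xmmintrin.h>

void cmpge_example(void)
{
    __SIMD a = _mm_set_ps(4.0f, 3.0f, 2.0f, 1.0f);
    __SIMD b = _mm_set1_ps(2.5f);
    void* raw = NULL;
    _SIMD_cmpge_ps(a, b, &raw);
    __SIMD* mask = (__SIMD*)raw;       // lanes are all-1s where a >= b, all-0s otherwise
    int bits = _mm_movemask_ps(*mask); // here: 0b1100
    (void)bits;
    free(raw);                         // caller releases the malloc'd result
}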
SIMDValue SIMDUint32x4Operation::OpFromFloat32x4(const SIMDValue& value, bool& throws)
{
    X86SIMDValue x86Result = { 0 };
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);
    X86SIMDValue temp, temp2;
    X86SIMDValue two_31_f4, two_31_i4;
    int mask = 0;

    // any lanes < 0 ?
    temp.m128_value = _mm_cmplt_ps(v.m128_value, X86_ALL_ZEROS.m128_value);
    mask = _mm_movemask_ps(temp.m128_value);
    // negative values are out of range; the caller should throw a Range Error
    if (mask)
    {
        throws = true;
        return X86SIMDValue::ToSIMDValue(x86Result);
    }

    // CVTTPS2DQ does a range check over the signed range [-2^31, 2^31-1], so it
    // fails to convert values >= 2^31. To fix this, subtract 2^31 from values
    // >= 2^31, do CVTTPS2DQ, then add 2^31 back.
    _mm_store_ps(two_31_f4.simdValue.f32, X86_TWO_31_F4.m128_value);
    // any lanes >= 2^31 ?
    temp.m128_value = _mm_cmpge_ps(v.m128_value, two_31_f4.m128_value);
    // two_31_f4 has f32(2^31) for lanes >= 2^31, 0 otherwise
    two_31_f4.m128_value = _mm_and_ps(two_31_f4.m128_value, temp.m128_value);
    // subtract 2^31 from lanes >= 2^31, leave the others unchanged
    v.m128_value = _mm_sub_ps(v.m128_value, two_31_f4.m128_value);

    // CVTTPS2DQ
    x86Result.m128i_value = _mm_cvttps_epi32(v.m128_value);

    // check if any value is out of range (i.e. >= 2^31, meaning originally >= 2^32 before the adjustment)
    temp2.m128i_value = _mm_cmpeq_epi32(x86Result.m128i_value, X86_NEG_MASK_F4.m128i_value); // any value == 0x80000000 ?
    mask = _mm_movemask_ps(temp2.m128_value);
    if (mask)
    {
        throws = true;
        return X86SIMDValue::ToSIMDValue(x86Result);
    }

    // range check passed: add 2^31 back to the adjusted values.
    // Shift the 2^31 float bit pattern (0x4F000000) left by one and AND it with
    // the 2^31 int mask (0x80000000); this zeroes the bit in lanes that were
    // not adjusted.
    _mm_store_ps(two_31_i4.simdValue.f32, X86_TWO_31_I4.m128_value);
    two_31_f4.m128i_value = _mm_slli_epi32(two_31_f4.m128i_value, 1);
    two_31_i4.m128i_value = _mm_and_si128(two_31_i4.m128i_value, two_31_f4.m128i_value);
    // Note: at this point all values are in [0, 2^31-1]; adding 2^31 cannot overflow.
    x86Result.m128i_value = _mm_add_epi32(x86Result.m128i_value, two_31_i4.m128i_value);

    return X86SIMDValue::ToSIMDValue(x86Result);
}
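// A scalar sketch (not from the original source) of the same 2^31 bias trick:
// CVTTPS2DQ only covers the signed int32 range, so floats in [2^31, 2^32) are
// biased down before conversion and re-biased afterwards.
#include <stdint.h>

static uint32_t float_to_u32_biased(float f) // caller guarantees 0 <= f < 2^32
{
    const float two31 = 2147483648.0f; // 2^31
    if (f >= two31)
        return (uint32_t)(int32_t)(f - two31) + 0x80000000u; // convert the low half, add the bias back
    return (uint32_t)(int32_t)f; // already fits in the signed range
}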
static void GF_FUNC_ALIGN VS_CC
float_to_dst_16bit(const float *srcp, uint8_t *d, int width, int height,
                   int src_stride, int dst_stride, float th, int bits)
{
    uint16_t *dstp = (uint16_t *)d;
    dst_stride /= 2;
    __m128 tmax = _mm_set1_ps(th);
    int rshift = 32 - bits;

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 8) {
            // Compare against the threshold: lanes become 0xFFFFFFFF or 0.
            __m128 xmf0 = _mm_cmpge_ps(_mm_load_ps(srcp + x), tmax);
            __m128 xmf1 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 4), tmax);
            // Shift the all-1s masks down to the bit depth's maximum value.
            __m128i xmi0 = _mm_srli_epi32(_mm_castps_si128(xmf0), rshift);
            __m128i xmi1 = _mm_srli_epi32(_mm_castps_si128(xmf1), rshift);
            xmi0 = mm_cast_epi32(xmi0, xmi1);
            _mm_store_si128((__m128i *)(dstp + x), xmi0);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
static void GF_FUNC_ALIGN VS_CC
float_to_dst_8bit(const float *srcp, uint8_t *dstp, int width, int height,
                  int src_stride, int dst_stride, float th, int bits)
{
    __m128 tmax = _mm_set1_ps(th);

    for (int y = 0; y < height; y++) {
        for (int x = 0; x < width; x += 16) {
            __m128 xmf0 = _mm_cmpge_ps(_mm_load_ps(srcp + x), tmax);
            __m128 xmf1 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 4), tmax);
            __m128 xmf2 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 8), tmax);
            __m128 xmf3 = _mm_cmpge_ps(_mm_load_ps(srcp + x + 12), tmax);
            // Signed saturating packs turn the 0 / 0xFFFFFFFF lane masks into 0x00 / 0xFF bytes.
            __m128i xmi0 = _mm_packs_epi32(_mm_castps_si128(xmf0), _mm_castps_si128(xmf1));
            __m128i xmi1 = _mm_packs_epi32(_mm_castps_si128(xmf2), _mm_castps_si128(xmf3));
            xmi0 = _mm_packs_epi16(xmi0, xmi1);
            _mm_store_si128((__m128i *)(dstp + x), xmi0);
        }
        srcp += src_stride;
        dstp += dst_stride;
    }
}
template <class Tout>
inline void GDALCopy4WordsSSE(const float* pValueIn, Tout* const &pValueOut)
{
    float fMaxVal, fMinVal;
    GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal);
    __m128 xmm = _mm_loadu_ps(pValueIn);
    __m128 xmm_min = _mm_set1_ps(fMinVal);
    __m128 xmm_max = _mm_set1_ps(fMaxVal);
    xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max);
#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    __m128 p0d5 = _mm_set1_ps(0.5f);
    if (std::numeric_limits<Tout>::is_signed)
    {
        __m128 m0d5 = _mm_set1_ps(-0.5f);
        //__m128 mask = _mm_cmpge_ps(xmm, _mm_set1_ps(0.f));
        __m128 mask = _mm_cmpge_ps(xmm, p0d5);
        xmm = _mm_add_ps(xmm, _mm_or_ps(_mm_and_ps(mask, p0d5),
                                        _mm_andnot_ps(mask, m0d5))); /* f >= 0.5f ? f + 0.5f : f - 0.5f */
    }
    else
    {
        xmm = _mm_add_ps(xmm, p0d5);
    }
#endif

#ifdef SSE_USE_SAME_ROUNDING_AS_NON_SSE
    __m128i xmm_i = _mm_cvttps_epi32(xmm);
#else
    __m128i xmm_i = _mm_cvtps_epi32(xmm);
#endif
#if 0
    int aTemp[4];
    _mm_storeu_si128((__m128i *)aTemp, xmm_i);
    pValueOut[0] = (Tout)aTemp[0];
    pValueOut[1] = (Tout)aTemp[1];
    pValueOut[2] = (Tout)aTemp[2];
    pValueOut[3] = (Tout)aTemp[3];
#else
    pValueOut[0] = (Tout)_mm_extract_epi16(xmm_i, 0);
    pValueOut[1] = (Tout)_mm_extract_epi16(xmm_i, 2);
    pValueOut[2] = (Tout)_mm_extract_epi16(xmm_i, 4);
    pValueOut[3] = (Tout)_mm_extract_epi16(xmm_i, 6);
#endif
}
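// A scalar sketch (not from the original source) of the rounding that the
// masked +/-0.5 path above reproduces when SSE_USE_SAME_ROUNDING_AS_NON_SSE
// is defined: bias by half away from zero, then truncate, instead of relying
// on _mm_cvtps_epi32's round-to-nearest-even.
static int round_like_non_sse(float f)
{
    return (int)(f >= 0.5f ? f + 0.5f : f - 0.5f); // truncation after biasing
}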
// Returns { f, g, f, g }, where f = bump0 (t), g = bump1 (t).
v4f bumps_t::operator () (float t) const
{
  // Compute all four polynomials by Estrin's method, and mask and combine the
  // values according to the region of the graph to which t belongs.
  v4f s = _mm_set1_ps (t);
  v4f S = load4f (S0);
  v4f T = load4f (T0);
  v4f U = load4f (U0);
  v4f V = load4f (V0);
  v4f f01 = load4f (c [0]) + load4f (c [1]) * s;
  v4f f12 = load4f (c [2]) + load4f (c [3]) * s;
  v4f f = f01 + f12 * s * s;
  v4f ltS = _mm_cmplt_ps (s, S);
  v4f geT = _mm_cmpge_ps (s, T);
  v4f x1 = _mm_andnot_ps (_mm_or_ps (ltS, geT), f);
  v4f x2 = _mm_and_ps (ltS, U);
  v4f x3 = _mm_and_ps (geT, V);
  v4f val = _mm_or_ps (_mm_or_ps (x1, x2), x3);
  return _mm_hadd_ps (val, val);
}
v4f step_t::operator () (float t) const
{
  // Evaluate the polynomial f by Estrin's method. Return
  //   (0 0 0 0) if t < t0,
  //   (f f f f) if t0 <= t < t1,
  //   (1 1 1 1) if t >= t1.
  v4f c4 = load4f (c);
  v4f one = { 1.0f, 1.0f, 1.0f, 1.0f };
  v4f tttt = _mm_set1_ps (t);           // t t t t
  v4f tt = _mm_unpacklo_ps (one, tttt); // 1 t 1 t
  v4f f0 = c4 * tt;                     // c0 c1*t c2 c3*t
  v4f ha = _mm_hadd_ps (f0, f0) * tt * tt;
  v4f f = _mm_hadd_ps (ha, ha);         // f f f f
  v4f f1 = _mm_unpacklo_ps (f, one);    // f 1 f 1
  v4f tx = load4f (T);                  // t0 t1 t1 inf
  v4f lo = _mm_movelh_ps (tx, tx);      // t0 t1 t0 t1
  v4f hi = _mm_movehl_ps (tx, tx);      // t1 inf t1 inf
  v4f sel = _mm_and_ps (_mm_cmpge_ps (tttt, lo), _mm_cmplt_ps (tttt, hi));
  v4f val = _mm_and_ps (sel, f1);       // f? 1? f? 1?
  return _mm_hadd_ps (val, val);
}
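// A minimal scalar sketch (not from the original source) of Estrin's scheme
// for the cubic evaluated above: pairing terms shortens the dependency chain
// relative to Horner's method, which the SIMD hadd sequence exploits.
static float estrin_cubic(const float c[4], float t)
{
    float t2 = t * t;
    return (c[0] + c[1] * t) + (c[2] + c[3] * t) * t2;
}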
inline vec4 operator>=(vec4 a, vec4 b) { return _mm_cmpge_ps(a, b); }
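// A hedged usage sketch (not from the original source); it assumes vec4
// converts implicitly to and from __m128, as the operator above implies.
#include <xmmintrin.h>

static int ge_mask_example()
{
    vec4 a = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
    vec4 b = _mm_set1_ps(2.f);
    vec4 m = (a >= b);         // lane masks: 0, -1, -1, -1
    return _mm_movemask_ps(m); // 0b1110
}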
void sse_matrix(int num_seqs, char **q, int *q_len, int max_q_len,
                char **r, int *r_len, int max_r_len,
                float profile[128][128], float gap_open, float gap_extend,
                float *H, float *F, int *C, float *max_score)
{
    const int depth = 4;

    __m128 h_simd, e_simd, f_simd, diagonal_simd;
    __m128 temp_simd, subst_simd;

    __m128i zeroi = _mm_set_epi32(0, 0, 0, 0);

    __m128 score_simd = _mm_setzero_ps();
    __m128 zero_simd = _mm_setzero_ps();
    __m128 one_simd = _mm_set1_ps(1);
    __m128 gap_open_simd = _mm_set1_ps(gap_open);
    __m128 gap_extend_simd = _mm_set1_ps(gap_extend);

    __m128 max_de, max_fz;
    __m128 cmp_de, cmp_fz, cmp_de_fz;
    __m128i c;

    int offset, idx, j_depth;
    int q_len_depth = depth * max_q_len;

    h_simd = zero_simd;
    e_simd = zero_simd;

    // First row of the dynamic-programming matrix.
    for (int j = 0; j < max_q_len; j++) {
        j_depth = depth * j;

        // left value: gap in reference
        e_simd = _mm_max_ps(_mm_sub_ps(e_simd, gap_extend_simd),
                            _mm_sub_ps(h_simd, gap_open_simd));

        // diagonal value: match or mismatch
        subst_simd = _mm_set_ps((q_len[3] > j) ? profile[q[3][j]][r[3][0]] : -1000.0f,
                                (q_len[2] > j) ? profile[q[2][j]][r[2][0]] : -1000.0f,
                                (q_len[1] > j) ? profile[q[1][j]][r[1][0]] : -1000.0f,
                                (q_len[0] > j) ? profile[q[0][j]][r[0][0]] : -1000.0f);
        diagonal_simd = _mm_add_ps(zero_simd, subst_simd);

        // cmp_* lanes hold 1.0f where the first operand wins, 0.0f otherwise
        cmp_de = _mm_min_ps(_mm_cmpge_ps(diagonal_simd, e_simd), one_simd);
        max_de = _mm_max_ps(diagonal_simd, e_simd);

        // up value: gap in query
        f_simd = _mm_max_ps(_mm_sub_ps(zero_simd, gap_extend_simd),
                            _mm_sub_ps(zero_simd, gap_open_simd));
        cmp_fz = _mm_min_ps(_mm_cmpge_ps(f_simd, zero_simd), one_simd);
        max_fz = _mm_max_ps(f_simd, zero_simd);

        // get the max. value and save it
        cmp_de_fz = _mm_min_ps(_mm_cmpge_ps(max_de, max_fz), one_simd);
        h_simd = _mm_max_ps(max_de, max_fz);
        score_simd = _mm_max_ps(score_simd, h_simd);

        // compass (save left, diagonal, up or zero?)
        c = _mm_slli_epi32(_mm_or_si128(zeroi, _mm_cvtps_epi32(cmp_de)), 1);
        c = _mm_slli_epi32(_mm_or_si128(c, _mm_cvtps_epi32(cmp_fz)), 1);
        c = _mm_or_si128(c, _mm_cvtps_epi32(cmp_de_fz));

        // update matrices
        _mm_store_ps(&H[j_depth], h_simd);
        _mm_store_ps(&F[j_depth], f_simd);
        _mm_store_si128((__m128i *)&C[j_depth], c);
    }

    // Remaining rows.
    for (int i = 1; i < max_r_len; i++) {
        h_simd = zero_simd;
        e_simd = zero_simd;
        temp_simd = zero_simd;

        idx = i * q_len_depth;
        for (int j = 0; j < max_q_len; j++) {
            j_depth = depth * j;
            offset = idx + j_depth;

            // left value: gap in reference
            e_simd = _mm_max_ps(_mm_sub_ps(e_simd, gap_extend_simd),
                                _mm_sub_ps(h_simd, gap_open_simd));

            // diagonal value: match or mismatch
            diagonal_simd = _mm_add_ps(temp_simd,
                _mm_set_ps((q_len[3] > j && r_len[3] > i) ? profile[q[3][j]][r[3][i]] : -1000.0f,
                           (q_len[2] > j && r_len[2] > i) ? profile[q[2][j]][r[2][i]] : -1000.0f,
                           (q_len[1] > j && r_len[1] > i) ? profile[q[1][j]][r[1][i]] : -1000.0f,
                           (q_len[0] > j && r_len[0] > i) ? profile[q[0][j]][r[0][i]] : -1000.0f));
            cmp_de = _mm_min_ps(_mm_cmpge_ps(diagonal_simd, e_simd), one_simd);
            max_de = _mm_max_ps(diagonal_simd, e_simd);

            // up value: gap in query
            temp_simd = _mm_load_ps(&H[offset - q_len_depth]);
            f_simd = _mm_load_ps(&F[j_depth]);
            f_simd = _mm_max_ps(_mm_sub_ps(f_simd, gap_extend_simd),
                                _mm_sub_ps(temp_simd, gap_open_simd));
            cmp_fz = _mm_min_ps(_mm_cmpge_ps(f_simd, zero_simd), one_simd);
            max_fz = _mm_max_ps(f_simd, zero_simd);

            // get the max. value
            cmp_de_fz = _mm_min_ps(_mm_cmpge_ps(max_de, max_fz), one_simd);
            h_simd = _mm_max_ps(max_de, max_fz);
            score_simd = _mm_max_ps(score_simd, h_simd);

            // compass (save left, diagonal, up or zero?)
            c = _mm_slli_epi32(_mm_or_si128(zeroi, _mm_cvtps_epi32(cmp_de)), 1);
            c = _mm_slli_epi32(_mm_or_si128(c, _mm_cvtps_epi32(cmp_fz)), 1);
            c = _mm_or_si128(c, _mm_cvtps_epi32(cmp_de_fz));

            // update matrices
            _mm_store_ps(&H[offset], h_simd);
            _mm_store_ps(&F[j_depth], f_simd);
            _mm_store_si128((__m128i *)&C[offset], c);
        }
    }

    _mm_store_ps(max_score, score_simd);
}
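// A hedged sketch (not from the original source) of how a traceback could
// decode one compass entry stored above: bit 2 = (diagonal >= left),
// bit 1 = (up >= 0), bit 0 = (max(diagonal, left) >= max(up, 0)).
typedef enum { MOVE_ZERO, MOVE_UP, MOVE_LEFT, MOVE_DIAGONAL } move_t;

static move_t decode_compass(int compass)
{
    int de   = (compass >> 2) & 1; // the diagonal score beat the gap-in-reference score
    int fz   = (compass >> 1) & 1; // the gap-in-query score was non-negative
    int defz = compass & 1;        // the diagonal/left winner beat the up/zero winner
    if (defz)
        return de ? MOVE_DIAGONAL : MOVE_LEFT;
    return fz ? MOVE_UP : MOVE_ZERO;
}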
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer.
// If any of the rasterized AABB pixels passes the depth test, exit early and mark the occludee
// as visible. If all rasterized AABB pixels are occluded, then the occludee is culled.
//-----------------------------------------------------------------------------------------
void TransformedAABBoxSSE::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels)
{
	// Set the DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster).
	// Denormals-are-zero (DAZ) is bit 6 and flush-to-zero (FZ) is bit 15, so enabling
	// both means setting bits 6 and 15: 1000 0000 0100 0000 = 0x8040.
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m128i colOffset = _mm_set_epi32(0, 1, 0, 1);
	__m128i rowOffset = _mm_set_epi32(0, 0, 1, 1);

	__m128i fxptZero = _mm_setzero_si128();
	float* pDepthBuffer = (float*)pRenderTargetPixels;

	// Rasterize the AABB triangles 4 at a time
	for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
	{
		vFloat4 xformedPos[3];
		Gather(xformedPos, i);

		// use fixed-point only for X and Y. Avoid work for Z and W.
		vFxPt4 xFormedFxPtPos[3];
		for(int m = 0; m < 3; m++)
		{
			xFormedFxPtPos[m].X = _mm_cvtps_epi32(xformedPos[m].X);
			xFormedFxPtPos[m].Y = _mm_cvtps_epi32(xformedPos[m].Y);
			xFormedFxPtPos[m].Z = _mm_cvtps_epi32(xformedPos[m].Z);
			xFormedFxPtPos[m].W = _mm_cvtps_epi32(xformedPos[m].W);
		}

		// Fab(x, y) = Ax + By + C = 0
		// Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(xFormedFxPtPos[1].Y, xFormedFxPtPos[2].Y);
		__m128i A1 = _mm_sub_epi32(xFormedFxPtPos[2].Y, xFormedFxPtPos[0].Y);
		__m128i A2 = _mm_sub_epi32(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].X);
		__m128i B1 = _mm_sub_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].X);
		__m128i B2 = _mm_sub_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].X);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[2].Y), _mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[1].Y));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[2].X, xFormedFxPtPos[0].Y), _mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[2].Y));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(xFormedFxPtPos[0].X, xFormedFxPtPos[1].Y), _mm_mullo_epi32(xFormedFxPtPos[1].X, xFormedFxPtPos[0].Y));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(A0, xFormedFxPtPos[0].X);
		triArea = _mm_add_epi32(triArea, _mm_mullo_epi32(B0, xFormedFxPtPos[0].Y));
		triArea = _mm_add_epi32(triArea, C0);
		__m128 oneOverTriArea = _mm_div_ps(_mm_set1_ps(1.0f), _mm_cvtepi32_ps(triArea));

		// Use bounding box traversal strategy to determine which pixels to rasterize
		__m128i startX = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endX   = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].X, xFormedFxPtPos[1].X), xFormedFxPtPos[2].X), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENW));

		__m128i startY = _mm_and_si128(Max(Min(Min(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(0)), _mm_set1_epi32(0xFFFFFFFE));
		__m128i endY   = Min(_mm_add_epi32(Max(Max(xFormedFxPtPos[0].Y, xFormedFxPtPos[1].Y), xFormedFxPtPos[2].Y), _mm_set1_epi32(1)), _mm_set1_epi32(SCREENH));

		for(int vv = 0; vv < 3; vv++)
		{
			// W holds 1/w here. A vertex is in front of the near clip plane
			// (w == 1.0 in our case) only when 0 < W < 1: for w > 0 being
			// near-clipped means 1/w >= 1, and for w < 0 it means 1/w < 0,
			// so test W <= 0 or W >= 1.
			__m128 nearClipMask0 = _mm_cmple_ps(xformedPos[vv].W, _mm_set1_ps(0.0f));
			__m128 nearClipMask1 = _mm_cmpge_ps(xformedPos[vv].W, _mm_set1_ps(1.0f));
			__m128 nearClipMask  = _mm_or_ps(nearClipMask0, nearClipMask1);

			if(!_mm_test_all_zeros(*(__m128i*)&nearClipMask, *(__m128i*)&nearClipMask))
			{
				// At least one of the four triangles being processed (we rasterize
				// four at a time w/ SSE) has a near-clipped vertex: conservatively
				// mark the occludee as visible.
				*mVisible = true;
				return;
			}
		}

		// Now we have 4 triangles set up. Rasterize them each individually.
		for(int lane = 0; lane < SSE; lane++)
		{
			// Skip triangle if area is zero
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			// Extract this triangle's properties from the SIMD versions
			__m128 zz[3], oneOverW[3];
			for(int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm_set1_ps(xformedPos[vv].Z.m128_f32[lane]);
				oneOverW[vv] = _mm_set1_ps(xformedPos[vv].W.m128_f32[lane]);
			}

			__m128 oneOverTotalArea = _mm_set1_ps(oneOverTriArea.m128_f32[lane]);
			zz[0] *= oneOverTotalArea;
			zz[1] *= oneOverTotalArea;
			zz[2] *= oneOverTotalArea;

			int startXx = startX.m128i_i32[lane];
			int endXx   = endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy   = endY.m128i_i32[lane];

			__m128i aa0 = _mm_set1_epi32(A0.m128i_i32[lane]);
			__m128i aa1 = _mm_set1_epi32(A1.m128i_i32[lane]);
			__m128i aa2 = _mm_set1_epi32(A2.m128i_i32[lane]);

			__m128i bb0 = _mm_set1_epi32(B0.m128i_i32[lane]);
			__m128i bb1 = _mm_set1_epi32(B1.m128i_i32[lane]);
			__m128i bb2 = _mm_set1_epi32(B2.m128i_i32[lane]);

			__m128i cc0 = _mm_set1_epi32(C0.m128i_i32[lane]);
			__m128i cc1 = _mm_set1_epi32(C1.m128i_i32[lane]);
			__m128i cc2 = _mm_set1_epi32(C2.m128i_i32[lane]);

			__m128i aa0Inc = _mm_slli_epi32(aa0, 1);
			__m128i aa1Inc = _mm_slli_epi32(aa1, 1);
			__m128i aa2Inc = _mm_slli_epi32(aa2, 1);

			__m128i row, col;
			int rowIdx;

			// To avoid branching in the inner loop, choose one traversal and depth-storage method up front
			if(gVisualizeDepthBuffer)
			{
				// Sequentially traverse and store pixel depths contiguously
				rowIdx = (startYy * SCREENW + startXx);
			}
			else
			{
				// Traverse pixels in 2x2 blocks and store 2x2 pixel quad depths contiguously in memory ==> 2*X
				// This method provides better performance
				rowIdx = (startYy * SCREENW + 2 * startXx);
			}

			col = _mm_add_epi32(colOffset, _mm_set1_epi32(startXx));
			__m128i aa0Col = _mm_mullo_epi32(aa0, col);
			__m128i aa1Col = _mm_mullo_epi32(aa1, col);
			__m128i aa2Col = _mm_mullo_epi32(aa2, col);

			row = _mm_add_epi32(rowOffset, _mm_set1_epi32(startYy));
			__m128i bb0Row = _mm_add_epi32(_mm_mullo_epi32(bb0, row), cc0);
			__m128i bb1Row = _mm_add_epi32(_mm_mullo_epi32(bb1, row), cc1);
			__m128i bb2Row = _mm_add_epi32(_mm_mullo_epi32(bb2, row), cc2);

			__m128i bb0Inc = _mm_slli_epi32(bb0, 1);
			__m128i bb1Inc = _mm_slli_epi32(bb1, 1);
			__m128i bb2Inc = _mm_slli_epi32(bb2, 1);

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for(int r = startYy; r < endYy; r += 2,
				row = _mm_add_epi32(row, _mm_set1_epi32(2)),
				rowIdx = rowIdx + 2 * SCREENW,
				bb0Row = _mm_add_epi32(bb0Row, bb0Inc),
				bb1Row = _mm_add_epi32(bb1Row, bb1Inc),
				bb2Row = _mm_add_epi32(bb2Row, bb2Inc))
			{
				// Compute barycentric coordinates
				int idx = rowIdx;
				__m128i alpha = _mm_add_epi32(aa0Col, bb0Row);
				__m128i beta  = _mm_add_epi32(aa1Col, bb1Row);
				__m128i gama  = _mm_add_epi32(aa2Col, bb2Row);

				int idxIncr;
				if(gVisualizeDepthBuffer)
				{
					idxIncr = 2;
				}
				else
				{
					idxIncr = 4;
				}

				for(int c = startXx; c < endXx; c += 2,
					idx = idx + idxIncr,
					alpha = _mm_add_epi32(alpha, aa0Inc),
					beta  = _mm_add_epi32(beta, aa1Inc),
					gama  = _mm_add_epi32(gama, aa2Inc))
				{
					// Test pixel inside triangle
					__m128i mask = _mm_cmplt_epi32(fxptZero, _mm_or_si128(_mm_or_si128(alpha, beta), gama));

					// Early out if all of this quad's pixels are outside the triangle.
					if(_mm_test_all_zeros(mask, mask))
					{
						continue;
					}

					// Compute barycentric-interpolated depth
					__m128 depth = _mm_mul_ps(_mm_cvtepi32_ps(alpha), zz[0]);
					depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(beta), zz[1]));
					depth = _mm_add_ps(depth, _mm_mul_ps(_mm_cvtepi32_ps(gama), zz[2]));

					__m128 previousDepthValue;
					if(gVisualizeDepthBuffer)
					{
						previousDepthValue = _mm_set_ps(pDepthBuffer[idx], pDepthBuffer[idx + 1], pDepthBuffer[idx + SCREENW], pDepthBuffer[idx + SCREENW + 1]);
					}
					else
					{
						previousDepthValue = *(__m128*)&pDepthBuffer[idx];
					}

					__m128 depthMask  = _mm_cmpge_ps(depth, previousDepthValue);
					__m128i finalMask = _mm_and_si128(mask, _mm_castps_si128(depthMask));

					if(!_mm_test_all_zeros(finalMask, finalMask))
					{
						// A pixel passed the depth test: the occludee is visible, early exit.
						*mVisible = true;
						return;
					}
				}// for each column
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles
}
int main()
{
	float *arr = get_arr();     // [4, 3, 2, 1]
	float *uarr = get_uarr();   // [5, 4, 3, 2]
	float *arr2 = get_arr2();   // [4, 3, 2, 1]
	float *uarr2 = get_uarr2(); // [5, 4, 3, 2]
	__m128 a = get_a(); // [8, 6, 4, 2]
	__m128 b = get_b(); // [1, 2, 3, 4]

	// Check that test data is like expected.
	Assert(((uintptr_t)arr & 0xF) == 0);   // arr must be aligned by 16.
	Assert(((uintptr_t)uarr & 0xF) != 0);  // uarr must be unaligned.
	Assert(((uintptr_t)arr2 & 0xF) == 0);  // arr2 must be aligned by 16.
	Assert(((uintptr_t)uarr2 & 0xF) != 0); // uarr2 must be unaligned.

	// Test that aeq itself works and does not trivially return true on everything.
	Assert(aeq_("", _mm_load_ps(arr), 4.f, 3.f, 2.f, 0.f, false) == false);
#ifdef TEST_M64
	Assert(aeq64(u64castm64(0x22446688AACCEEFFULL), 0xABABABABABABABABULL, false) == false);
#endif

	// SSE1 Load instructions:
	aeq(_mm_load_ps(arr), 4.f, 3.f, 2.f, 1.f); // 4-wide load from aligned address.
	aeq(_mm_load_ps1(uarr), 2.f, 2.f, 2.f, 2.f); // Load scalar from unaligned address and populate 4-wide.
	aeq(_mm_load_ss(uarr), 0.f, 0.f, 0.f, 2.f); // Load scalar from unaligned address to lowest, and zero all highest.
	aeq(_mm_load1_ps(uarr), 2.f, 2.f, 2.f, 2.f); // _mm_load1_ps == _mm_load_ps1
	aeq(_mm_loadh_pi(a, (__m64*)uarr), 3.f, 2.f, 4.f, 2.f); // Load two highest addresses, preserve two lowest.
	aeq(_mm_loadl_pi(a, (__m64*)uarr), 8.f, 6.f, 3.f, 2.f); // Load two lowest addresses, preserve two highest.
	aeq(_mm_loadr_ps(arr), 1.f, 2.f, 3.f, 4.f); // 4-wide load from an aligned address, but reverse order.
	aeq(_mm_loadu_ps(uarr), 5.f, 4.f, 3.f, 2.f); // 4-wide load from an unaligned address.

	// SSE1 Set instructions:
	aeq(_mm_set_ps(uarr[3], 2.f, 3.f, 4.f), 5.f, 2.f, 3.f, 4.f); // 4-wide set by specifying four immediate or memory operands.
	aeq(_mm_set_ps1(uarr[3]), 5.f, 5.f, 5.f, 5.f); // 4-wide set by specifying one scalar that is expanded.
	aeq(_mm_set_ss(uarr[3]), 0.f, 0.f, 0.f, 5.f); // Set scalar at lowest index, zero all higher.
	aeq(_mm_set1_ps(uarr[3]), 5.f, 5.f, 5.f, 5.f); // _mm_set1_ps == _mm_set_ps1
	aeq(_mm_setr_ps(uarr[3], 2.f, 3.f, 4.f), 4.f, 3.f, 2.f, 5.f); // 4-wide set by specifying four immediate or memory operands, but reverse order.
	aeq(_mm_setzero_ps(), 0.f, 0.f, 0.f, 0.f); // Returns a new zero register.

	// SSE1 Move instructions:
	aeq(_mm_move_ss(a, b), 8.f, 6.f, 4.f, 4.f); // Copy three highest elements from a, and lowest from b.
	aeq(_mm_movehl_ps(a, b), 8.f, 6.f, 1.f, 2.f); // Copy two highest elements from a, and take two highest from b and place them to the two lowest in output.
	aeq(_mm_movelh_ps(a, b), 3.f, 4.f, 4.f, 2.f); // Copy two lowest elements from a, and take two lowest from b and place them to the two highest in output.

	// SSE1 Store instructions:
#ifdef TEST_M64
	/*M64*/*(uint64_t*)uarr = 0xCDCDCDCDCDCDCDCDULL; _mm_maskmove_si64(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xCDEEDDCDCDAA99CDULL); // _mm_maskmove_si64: Conditionally store bytes of a 64-bit value.
	/*M64*/*(uint64_t*)uarr = 0xABABABABABABABABULL;       _m_maskmovq(u64castm64(0x00EEDDCCBBAA9988ULL), u64castm64(0x0080FF7F01FEFF40ULL), (char*)uarr); Assert(*(uint64_t*)uarr == 0xABEEDDABABAA99ABULL); // _m_maskmovq is an alias to _mm_maskmove_si64.
#endif
	_mm_store_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_store_ps: 4-wide store to aligned memory address.
	_mm_store_ps1(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store_ps1: Store lowest scalar to aligned address, duplicating the element 4 times.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_store_ss(uarr2, b); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 100.f, 4.f); // _mm_store_ss: Store lowest scalar to unaligned address. Don't adjust higher addresses in memory.
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_store1_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 2.f, 2.f, 2.f); // _mm_store1_ps == _mm_store_ps1
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storeh_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 8.f, 6.f); // _mm_storeh_pi: Store two highest elements to memory.
	_mm_storeu_ps(uarr2, _mm_set1_ps(100.f)); _mm_storel_pi((__m64*)uarr2, a); aeq(_mm_loadu_ps(uarr2), 100.f, 100.f, 4.f, 2.f); // _mm_storel_pi: Store two lowest elements to memory.
	_mm_storer_ps(arr2, a); aeq(_mm_load_ps(arr2), 2.f, 4.f, 6.f, 8.f); // _mm_storer_ps: 4-wide store to aligned memory address, but reverse the elements on output.
	_mm_storeu_ps(uarr2, a); aeq(_mm_loadu_ps(uarr2), 8.f, 6.f, 4.f, 2.f); // _mm_storeu_ps: 4-wide store to unaligned memory address.
#ifdef TEST_M64
	/*M64*/_mm_stream_pi((__m64*)uarr, u64castm64(0x0080FF7F01FEFF40ULL)); Assert(*(uint64_t*)uarr == 0x0080FF7F01FEFF40ULL); // _mm_stream_pi: 2-wide store, but with a non-temporal memory cache hint.
#endif
	_mm_store_ps(arr2, _mm_set1_ps(100.f)); _mm_stream_ps(arr2, a); aeq(_mm_load_ps(arr2), 8.f, 6.f, 4.f, 2.f); // _mm_stream_ps: 4-wide store, but with a non-temporal memory cache hint.

	// SSE1 Arithmetic instructions:
	aeq(_mm_add_ps(a, b), 9.f, 8.f, 7.f, 6.f); // 4-wide add.
	aeq(_mm_add_ss(a, b), 8.f, 6.f, 4.f, 6.f); // Add lowest element, preserve three highest unchanged from a.
	aeq(_mm_div_ps(a, _mm_set_ps(2.f, 3.f, 8.f, 2.f)), 4.f, 2.f, 0.5f, 1.f); // 4-wide div.
	aeq(_mm_div_ss(a, _mm_set_ps(2.f, 3.f, 8.f, 8.f)), 8.f, 6.f, 4.f, 0.25f); // Div lowest element, preserve three highest unchanged from a.
	aeq(_mm_mul_ps(a, b), 8.f, 12.f, 12.f, 8.f); // 4-wide mul.
	aeq(_mm_mul_ss(a, b), 8.f, 6.f, 4.f, 8.f); // Mul lowest element, preserve three highest unchanged from a.
#ifdef TEST_M64
	__m64 m1 = get_m1();
	/*M64*/aeq64(_mm_mulhi_pu16(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // Multiply u16 channels, and store high parts.
	/*M64*/aeq64(    _m_pmulhuw(m1, u64castm64(0x22446688AACCEEFFULL)), 0x002233440B4C33CFULL); // _m_pmulhuw is an alias to _mm_mulhi_pu16.
	__m64 m2 = get_m2();
	/*M64*/aeq64(_mm_sad_pu8(m1, m2), 0x368ULL); // Compute abs. differences of u8 channels, and sum those up to a single 16-bit scalar.
	/*M64*/aeq64(  _m_psadbw(m1, m2), 0x368ULL); // _m_psadbw is an alias to _mm_sad_pu8.
#endif
	aeq(_mm_sub_ps(a, b), 7.f, 4.f, 1.f, -2.f); // 4-wide sub.
	aeq(_mm_sub_ss(a, b), 8.f, 6.f, 4.f, -2.f); // Sub lowest element, preserve three highest unchanged from a.

	// SSE1 Elementary Math functions:
#ifndef __EMSCRIPTEN__ // TODO: Enable support for this to pass.
	aeq(_mm_rcp_ps(a), 0.124969f, 0.166626f, 0.249939f, 0.499878f); // Compute 4-wide 1/x.
	aeq(_mm_rcp_ss(a), 8.f, 6.f, 4.f, 0.499878f); // Compute 1/x of lowest element, pass higher elements unchanged.
	aeq(_mm_rsqrt_ps(a), 0.353455f, 0.408203f, 0.499878f, 0.706909f); // Compute 4-wide 1/sqrt(x).
	aeq(_mm_rsqrt_ss(a), 8.f, 6.f, 4.f, 0.706909f); // Compute 1/sqrt(x) of lowest element, pass higher elements unchanged.
#endif
	aeq(_mm_sqrt_ps(a), 2.82843f, 2.44949f, 2.f, 1.41421f); // Compute 4-wide sqrt(x).
	aeq(_mm_sqrt_ss(a), 8.f, 6.f, 4.f, 1.41421f); // Compute sqrt(x) of lowest element, pass higher elements unchanged.

	__m128 i1 = get_i1();
	__m128 i2 = get_i2();

	// SSE1 Logical instructions:
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_and_ps(i1, i2), 0x83200100, 0x0fecc988, 0x80244021, 0x13458a88); // 4-wide binary AND
	aeqi(_mm_andnot_ps(i1, i2), 0x388a9888, 0xf0021444, 0x7000289c, 0x00121046); // 4-wide binary (!i1) & i2
	aeqi(_mm_or_ps(i1, i2), 0xbfefdba9, 0xffefdfed, 0xf7656bbd, 0xffffdbef); // 4-wide binary OR
	aeqi(_mm_xor_ps(i1, i2), 0x3ccfdaa9, 0xf0031665, 0x77412b9c, 0xecba5167); // 4-wide binary XOR
#endif

	// SSE1 Compare instructions:
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeqi(_mm_cmpeq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp ==
	aeqi(_mm_cmpeq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp ==, pass three highest unchanged.
	aeqi(_mm_cmpge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp >=
	aeqi(_mm_cmpge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp >=, pass three highest unchanged.
	aeqi(_mm_cmpgt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp >
	aeqi(_mm_cmpgt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp >, pass three highest unchanged.
	aeqi(_mm_cmple_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <=
	aeqi(_mm_cmple_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <=, pass three highest unchanged.
	aeqi(_mm_cmplt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp <
	aeqi(_mm_cmplt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp <, pass three highest unchanged.
	aeqi(_mm_cmpneq_ps(a, _mm_set_ps(8.f, 0.f, 4.f, 0.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp !=
	aeqi(_mm_cmpneq_ss(a, _mm_set_ps(8.f, 0.f, 4.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp !=, pass three highest unchanged.
	aeqi(_mm_cmpnge_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >=
	aeqi(_mm_cmpnge_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0); // scalar cmp not >=, pass three highest unchanged.
	aeqi(_mm_cmpngt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide cmp not >
	aeqi(_mm_cmpngt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not >, pass three highest unchanged.
	aeqi(_mm_cmpnle_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <=
	aeqi(_mm_cmpnle_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 0.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <=, pass three highest unchanged.
	aeqi(_mm_cmpnlt_ps(a, _mm_set_ps(8.f, 7.f, 3.f, 5.f)), 0xFFFFFFFF, 0, 0xFFFFFFFF, 0); // 4-wide cmp not <
	aeqi(_mm_cmpnlt_ss(a, _mm_set_ps(8.f, 7.f, 3.f, 2.f)), fcastu(8.f), fcastu(6.f), fcastu(4.f), 0xFFFFFFFF); // scalar cmp not <, pass three highest unchanged.

	__m128 nan1 = get_nan1(); // [NAN, 0, 0, NAN]
	__m128 nan2 = get_nan2(); // [NAN, NAN, 0, 0]
	aeqi(_mm_cmpord_ps(nan1, nan2), 0, 0, 0xFFFFFFFF, 0); // 4-wide test if both operands are not nan.
	aeqi(_mm_cmpord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0); // scalar test if both operands are not nan, pass three highest unchanged.
	// Intel Intrinsics Guide documentation is wrong on _mm_cmpunord_ps and _mm_cmpunord_ss. MSDN is right: http://msdn.microsoft.com/en-us/library/khy6fk1t(v=vs.90).aspx
	aeqi(_mm_cmpunord_ps(nan1, nan2), 0xFFFFFFFF, 0xFFFFFFFF, 0, 0xFFFFFFFF); // 4-wide test if one of the operands is nan.
#ifndef __EMSCRIPTEN__ // TODO: The polyfill currently does NaN canonicalization and breaks these.
	aeqi(_mm_cmpunord_ss(nan1, nan2), fcastu(NAN), 0, 0, 0xFFFFFFFF); // scalar test if one of the operands is nan, pass three highest unchanged.
#endif

	Assert(_mm_comieq_ss(a, b) == 0); Assert(_mm_comieq_ss(a, a) == 1); // Scalar cmp == of lowest element, return int.
	Assert(_mm_comige_ss(a, b) == 0); Assert(_mm_comige_ss(a, a) == 1); // Scalar cmp >= of lowest element, return int.
	Assert(_mm_comigt_ss(b, a) == 1); Assert(_mm_comigt_ss(a, a) == 0); // Scalar cmp > of lowest element, return int.
	Assert(_mm_comile_ss(b, a) == 0); Assert(_mm_comile_ss(a, a) == 1); // Scalar cmp <= of lowest element, return int.
	Assert(_mm_comilt_ss(a, b) == 1); Assert(_mm_comilt_ss(a, a) == 0); // Scalar cmp < of lowest element, return int.
	Assert(_mm_comineq_ss(a, b) == 1); Assert(_mm_comineq_ss(a, a) == 0); // Scalar cmp != of lowest element, return int.

	// The ucomi versions are identical to comi, except that ucomi signals a FP exception only if one of the input operands is a SNaN,
	// whereas the comi versions signal a FP exception when one of the input operands is either a QNaN or a SNaN.
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomieq_ss(a, b) == 0); Assert(_mm_ucomieq_ss(a, a) == 1); Assert(_mm_ucomieq_ss(a, nan1) == 1);
#endif
	Assert(_mm_ucomige_ss(a, b) == 0); Assert(_mm_ucomige_ss(a, a) == 1); Assert(_mm_ucomige_ss(a, nan1) == 0);
	Assert(_mm_ucomigt_ss(b, a) == 1); Assert(_mm_ucomigt_ss(a, a) == 0); Assert(_mm_ucomigt_ss(a, nan1) == 0);
	Assert(_mm_ucomile_ss(b, a) == 0); Assert(_mm_ucomile_ss(a, a) == 1); Assert(_mm_ucomile_ss(a, nan1) == 1);
	Assert(_mm_ucomilt_ss(a, b) == 1); Assert(_mm_ucomilt_ss(a, a) == 0); Assert(_mm_ucomilt_ss(a, nan1) == 1);
#ifndef __EMSCRIPTEN__ // TODO: Fix ucomi support in SSE to treat NaNs properly.
	Assert(_mm_ucomineq_ss(a, b) == 1); Assert(_mm_ucomineq_ss(a, a) == 0); Assert(_mm_ucomineq_ss(a, nan1) == 0);
#endif

	// SSE1 Convert instructions:
	__m128 c = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 e = get_e(); // [INF, -INF, 2.5, 3.5]
	__m128 f = get_f(); // [-1.5, 1.5, -2.5, -9223372036854775808]
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvt_pi2ps(a, m2), 8.f, 6.f, -19088744.f, 1985229312.f); // 2-way int32 to float conversion to two lowest channels of m128.
	/*M64*/aeq64(_mm_cvt_ps2pi(c), 0x400000004ULL); // 2-way two lowest floats from m128 to integer, return as m64.
#endif
	aeq(_mm_cvtsi32_ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // Convert int to float, store in lowest channel of m128.
	aeq( _mm_cvt_si2ss(c, -16777215), 1.5f, 2.5f, 3.5f, -16777215.f); // _mm_cvt_si2ss is an alias to _mm_cvtsi32_ss.
#ifndef __EMSCRIPTEN__ // TODO: Fix banker's rounding in cvt functions.
	Assert(_mm_cvtss_si32(c) == 4); Assert(_mm_cvtss_si32(e) == 4); // Convert lowest channel of m128 from float to int.
	Assert( _mm_cvt_ss2si(c) == 4); Assert( _mm_cvt_ss2si(e) == 4); // _mm_cvt_ss2si is an alias to _mm_cvtss_si32.
#endif
#ifdef TEST_M64
	/*M64*/aeq(_mm_cvtpi16_ps(m1), 255.f, -32767.f, 4336.f, 14207.f); // 4-way convert int16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpi32_ps(a, m1), 8.f, 6.f, 16744449.f, 284178304.f); // 2-way convert int32s to floats, return in two lowest channels of m128, pass two highest unchanged.
	/*M64*/aeq(_mm_cvtpi32x2_ps(m1, m2), -19088744.f, 1985229312.f, 16744449.f, 284178304.f); // 4-way convert int32s from two different m64s to float.
	/*M64*/aeq(_mm_cvtpi8_ps(m1), 16.f, -16.f, 55.f, 127.f); // 4-way convert int8s from lowest end of m64 to float in a m128.
	/*M64*/aeq64(_mm_cvtps_pi16(c), 0x0002000200040004ULL); // 4-way convert floats to int16s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi32(c), 0x0000000400000004ULL); // 2-way convert two lowest floats to int32s in a m64.
	/*M64*/aeq64(_mm_cvtps_pi8(c),  0x0000000002020404ULL); // 4-way convert floats to int8s in a m64, zero higher half of the returned m64.
	/*M64*/aeq(_mm_cvtpu16_ps(m1), 255.f, 32769.f, 4336.f, 14207.f); // 4-way convert uint16s to floats, return in a m128.
	/*M64*/aeq(_mm_cvtpu8_ps(m1), 16.f, 240.f, 55.f, 127.f); // 4-way convert uint8s from lowest end of m64 to float in a m128.
#endif
	aeq(_mm_cvtsi64_ss(c, -9223372036854775808ULL), 1.5f, 2.5f, 3.5f, -9223372036854775808.f); // Convert single int64 to float, store in lowest channel of m128, and pass three higher channels unchanged.
	Assert(_mm_cvtss_f32(c) == 4.5f); // Extract lowest channel of m128 to a plain old float.
	Assert(_mm_cvtss_si64(f) == -9223372036854775808ULL); // Convert lowest channel of m128 from float to int64.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvtt_ps2pi(e), 0x0000000200000003ULL); aeq64(_mm_cvtt_ps2pi(f), 0xfffffffe80000000ULL); // Truncating conversion from two lowest floats of m128 to int32s, return in a m64.
#endif
	Assert(_mm_cvttss_si32(e) == 3); // Truncating conversion from the lowest float of a m128 to int32.
	Assert( _mm_cvtt_ss2si(e) == 3); // _mm_cvtt_ss2si is an alias to _mm_cvttss_si32.
#ifdef TEST_M64
	/*M64*/aeq64(_mm_cvttps_pi32(c), 0x0000000300000004ULL); // Truncating conversion from two lowest floats of m128 to m64.
#endif
	Assert(_mm_cvttss_si64(f) == -9223372036854775808ULL); // Truncating conversion from lowest channel of m128 from float to int64.

#ifndef __EMSCRIPTEN__ // TODO: Not implemented.
	// SSE1 General support:
	unsigned int mask = _MM_GET_EXCEPTION_MASK();
	_MM_SET_EXCEPTION_MASK(mask);
	unsigned int flushZeroMode = _MM_GET_FLUSH_ZERO_MODE();
	_MM_SET_FLUSH_ZERO_MODE(flushZeroMode);
	unsigned int roundingMode = _MM_GET_ROUNDING_MODE();
	_MM_SET_ROUNDING_MODE(roundingMode);
	unsigned int csr = _mm_getcsr();
	_mm_setcsr(csr);
	unsigned char dummyData[4096];
	_mm_prefetch(dummyData, _MM_HINT_T0);
	_mm_prefetch(dummyData, _MM_HINT_T1);
	_mm_prefetch(dummyData, _MM_HINT_T2);
	_mm_prefetch(dummyData, _MM_HINT_NTA);
	_mm_sfence();
#endif

	// SSE1 Misc instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_movemask_pi8(m1) == 100); // Return int with eight lowest bits set depending on the highest bits of the 8 uint8 input channels of the m64.
	/*M64*/Assert(     _m_pmovmskb(m1) == 100); // _m_pmovmskb is an alias to _mm_movemask_pi8.
#endif
	Assert(_mm_movemask_ps(_mm_set_ps(-1.f, 0.f, 1.f, NAN)) == 8); Assert(_mm_movemask_ps(_mm_set_ps(-INFINITY, -0.f, INFINITY, -INFINITY)) == 13); // Return int with four lowest bits set depending on the highest bits of the 4 m128 input channels.

	// SSE1 Probability/Statistics instructions:
#ifdef TEST_M64
	/*M64*/aeq64(_mm_avg_pu16(m1, m2), 0x7FEE9D4D43A234C8ULL); // 4-way average uint16s.
	/*M64*/aeq64(    _m_pavgw(m1, m2), 0x7FEE9D4D43A234C8ULL); // _m_pavgw is an alias to _mm_avg_pu16.
	/*M64*/aeq64(_mm_avg_pu8(m1, m2),  0x7FEE9D4D43A23548ULL); // 8-way average uint8s.
	/*M64*/aeq64(   _m_pavgb(m1, m2),  0x7FEE9D4D43A23548ULL); // _m_pavgb is an alias to _mm_avg_pu8.

	// SSE1 Special Math instructions:
	/*M64*/aeq64(_mm_max_pi16(m1, m2), 0xFFBA987654377FULL); // 4-way max int16s.
	/*M64*/aeq64(  _m_pmaxsw(m1, m2),  0xFFBA987654377FULL); // _m_pmaxsw is an alias to _mm_max_pi16.
	/*M64*/aeq64(_mm_max_pu8(m1, m2),  0xFEFFBA9876F0377FULL); // 8-way max uint8s.
	/*M64*/aeq64(  _m_pmaxub(m1, m2),  0xFEFFBA9876F0377FULL); // _m_pmaxub is an alias to _mm_max_pu8.
	/*M64*/aeq64(_mm_min_pi16(m1, m2), 0xFEDC800110F03210ULL); // 4-way min int16s.
	/*M64*/aeq64(  _m_pminsw(m1, m2),  0xFEDC800110F03210ULL); // _m_pminsw is an alias to _mm_min_pi16.
	/*M64*/aeq64(_mm_min_pu8(m1, m2),  0xDC800110543210ULL); // 8-way min uint8s.
	/*M64*/aeq64(  _m_pminub(m1, m2),  0xDC800110543210ULL); // _m_pminub is an alias to _mm_min_pu8.
#endif
	// a = [8, 6, 4, 2], b = [1, 2, 3, 4]
	aeq(_mm_max_ps(a, b), 8.f, 6.f, 4.f, 4.f); // 4-wide max.
	aeq(_mm_max_ss(a, _mm_set1_ps(100.f)), 8.f, 6.f, 4.f, 100.f); // Scalar max, pass three highest unchanged.
	aeq(_mm_min_ps(a, b), 1.f, 2.f, 3.f, 2.f); // 4-wide min.
	aeq(_mm_min_ss(a, _mm_set1_ps(-100.f)), 8.f, 6.f, 4.f, -100.f); // Scalar min, pass three highest unchanged.

	// SSE1 Swizzle instructions:
#ifdef TEST_M64
	/*M64*/Assert(_mm_extract_pi16(m1, 1) == 4336); // Extract the given int16 channel from a m64.
	/*M64*/Assert(       _m_pextrw(m1, 1) == 4336); // _m_pextrw is an alias to _mm_extract_pi16.
	/*M64*/aeq64(_mm_insert_pi16(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // Insert an int16 to a specific channel of a m64.
	/*M64*/aeq64(      _m_pinsrw(m1, 0xABCD, 1), 0xFF8001ABCD377FULL); // _m_pinsrw is an alias to _mm_insert_pi16.
	/*M64*/aeq64(_mm_shuffle_pi16(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // Shuffle int16s around in the 4 channels of the m64.
	/*M64*/aeq64(      _m_pshufw(m1, _MM_SHUFFLE(1, 0, 3, 2)), 0x10F0377F00FF8001ULL); // _m_pshufw is an alias to _mm_shuffle_pi16.
#endif
	aeq(_mm_shuffle_ps(a, b, _MM_SHUFFLE(1, 0, 3, 2)), 3.f, 4.f, 8.f, 6.f);
	aeq(_mm_unpackhi_ps(a, b), 1.f, 8.f, 2.f, 6.f);
	aeq(_mm_unpacklo_ps(a, b), 3.f, 4.f, 4.f, 2.f);

	// Transposing a matrix via the xmmintrin.h-provided intrinsic.
	__m128 c0 = a; // [8, 6, 4, 2]
	__m128 c1 = b; // [1, 2, 3, 4]
	__m128 c2 = get_c(); // [1.5, 2.5, 3.5, 4.5]
	__m128 c3 = get_d(); // [8.5, 6.5, 4.5, 2.5]
	_MM_TRANSPOSE4_PS(c0, c1, c2, c3);
	aeq(c0, 2.5f, 4.5f, 4.f, 2.f);
	aeq(c1, 4.5f, 3.5f, 3.f, 4.f);
	aeq(c2, 6.5f, 2.5f, 2.f, 6.f);
	aeq(c3, 8.5f, 1.5f, 1.f, 8.f);

	// All done!
	if (numFailures == 0)
		printf("Success!\n");
	else
		printf("%d tests failed!\n", numFailures);
}
inline static void bar(float (&inout)[8])
{
    __m128 leftSideElements[6], rightSideElements[6],
           leftGERight[6], leftLTRight[6],
           leftElementsGE[6],  // swapped elements on the left side of the comparison
           leftElementsLT[6],  // non-swapped elements on the left side of the comparison
           rightElementsGE[6], // swapped elements on the right side of the comparison
           rightElementsLT[6]; // non-swapped elements on the right side of the comparison
    float resultLeftElements[6][4], resultRightElements[6][4];
    const size_t idx[][2] = {
        { 0, 1 }, { 3, 2 }, { 4, 5 }, { 7, 6 },
        { 0, 2 }, { 1, 3 }, { 6, 4 }, { 7, 5 },
        { 0, 1 }, { 2, 3 }, { 5, 4 }, { 7, 6 },
        { 0, 4 }, { 1, 5 }, { 2, 6 }, { 3, 7 },
        { 0, 2 }, { 1, 3 }, { 4, 6 }, { 5, 7 },
        { 0, 1 }, { 2, 3 }, { 4, 5 }, { 6, 7 }
    };

    // First row
    leftSideElements[0] = _mm_set_ps(inout[idx[3][0]], inout[idx[2][0]], inout[idx[1][0]], inout[idx[0][0]]);
    rightSideElements[0] = _mm_set_ps(inout[idx[3][1]], inout[idx[2][1]], inout[idx[1][1]], inout[idx[0][1]]);
    leftGERight[0] = _mm_cmpge_ps(leftSideElements[0], rightSideElements[0]); // Something like 0 0 -1 -1.
    leftLTRight[0] = _mm_cmplt_ps(leftSideElements[0], rightSideElements[0]); // Something like -1 -1 0 0.
    // Calculate the values of the elements on the left.
    leftElementsGE[0] = _mm_and_ps(rightSideElements[0], leftGERight[0]); // If the left element is greater than or equal to the right element, swap: write the right element into the left slot.
    leftElementsLT[0] = _mm_and_ps(leftSideElements[0], leftLTRight[0]); // If the left element is less than the right element, don't swap: the left element stays in its place.
    // Calculate the values of the elements on the right.
    rightElementsGE[0] = _mm_and_ps(leftSideElements[0], leftGERight[0]); // If the left element is greater than or equal to the right element, swap: write the left element into the right slot.
    rightElementsLT[0] = _mm_and_ps(rightSideElements[0], leftLTRight[0]); // If the left element is less than the right element, don't swap: the right element stays in its place.
    // Now combine the elements: @leftGERight and @leftLTRight are complementary masks, so a single OR does it.
    // (@leftElementsGE holds something like [0, 0, element, element] and @leftElementsLT holds [element, element, 0, 0].)
    leftSideElements[0] = _mm_or_ps(leftElementsGE[0], leftElementsLT[0]);
    rightSideElements[0] = _mm_or_ps(rightElementsGE[0], rightElementsLT[0]);
    // Write them to arrays so we can put them back in their original places in @inout.
    _mm_storeu_ps(resultLeftElements[0], leftSideElements[0]);
    _mm_storeu_ps(resultRightElements[0], rightSideElements[0]);
    // Put the swapped (if needed) elements back in their places.
    inout[idx[0][0]] = resultLeftElements[0][0]; inout[idx[0][1]] = resultRightElements[0][0];
    inout[idx[1][0]] = resultLeftElements[0][1]; inout[idx[1][1]] = resultRightElements[0][1];
    inout[idx[2][0]] = resultLeftElements[0][2]; inout[idx[2][1]] = resultRightElements[0][2];
    inout[idx[3][0]] = resultLeftElements[0][3]; inout[idx[3][1]] = resultRightElements[0][3];

    // Second row
    leftSideElements[1] = _mm_set_ps(inout[idx[7][0]], inout[idx[6][0]], inout[idx[5][0]], inout[idx[4][0]]);
    rightSideElements[1] = _mm_set_ps(inout[idx[7][1]], inout[idx[6][1]], inout[idx[5][1]], inout[idx[4][1]]);
    leftGERight[1] = _mm_cmpge_ps(leftSideElements[1], rightSideElements[1]);
    leftLTRight[1] = _mm_cmplt_ps(leftSideElements[1], rightSideElements[1]);
    leftElementsGE[1] = _mm_and_ps(rightSideElements[1], leftGERight[1]);
    leftElementsLT[1] = _mm_and_ps(leftSideElements[1], leftLTRight[1]);
    rightElementsGE[1] = _mm_and_ps(leftSideElements[1], leftGERight[1]);
    rightElementsLT[1] = _mm_and_ps(rightSideElements[1], leftLTRight[1]);
    leftSideElements[1] = _mm_or_ps(leftElementsGE[1], leftElementsLT[1]);
    rightSideElements[1] = _mm_or_ps(rightElementsGE[1], rightElementsLT[1]);
    _mm_storeu_ps(resultLeftElements[1], leftSideElements[1]);
    _mm_storeu_ps(resultRightElements[1], rightSideElements[1]);
    inout[idx[4][0]] = resultLeftElements[1][0]; inout[idx[4][1]] = resultRightElements[1][0];
    inout[idx[5][0]] = resultLeftElements[1][1]; inout[idx[5][1]] = resultRightElements[1][1];
    inout[idx[6][0]] = resultLeftElements[1][2]; inout[idx[6][1]] = resultRightElements[1][2];
    inout[idx[7][0]] = resultLeftElements[1][3]; inout[idx[7][1]] = resultRightElements[1][3];

    // Third row
    leftSideElements[2] = _mm_set_ps(inout[idx[11][0]], inout[idx[10][0]], inout[idx[9][0]], inout[idx[8][0]]);
    rightSideElements[2] = _mm_set_ps(inout[idx[11][1]], inout[idx[10][1]], inout[idx[9][1]], inout[idx[8][1]]);
    leftGERight[2] = _mm_cmpge_ps(leftSideElements[2], rightSideElements[2]);
    leftLTRight[2] = _mm_cmplt_ps(leftSideElements[2], rightSideElements[2]);
    leftElementsGE[2] = _mm_and_ps(rightSideElements[2], leftGERight[2]);
    leftElementsLT[2] = _mm_and_ps(leftSideElements[2], leftLTRight[2]);
    rightElementsGE[2] = _mm_and_ps(leftSideElements[2], leftGERight[2]);
    rightElementsLT[2] = _mm_and_ps(rightSideElements[2], leftLTRight[2]);
    leftSideElements[2] = _mm_or_ps(leftElementsGE[2], leftElementsLT[2]);
    rightSideElements[2] = _mm_or_ps(rightElementsGE[2], rightElementsLT[2]);
    _mm_storeu_ps(resultLeftElements[2], leftSideElements[2]);
    _mm_storeu_ps(resultRightElements[2], rightSideElements[2]);
    inout[idx[8][0]] = resultLeftElements[2][0]; inout[idx[8][1]] = resultRightElements[2][0];
    inout[idx[9][0]] = resultLeftElements[2][1]; inout[idx[9][1]] = resultRightElements[2][1];
    inout[idx[10][0]] = resultLeftElements[2][2]; inout[idx[10][1]] = resultRightElements[2][2];
    inout[idx[11][0]] = resultLeftElements[2][3]; inout[idx[11][1]] = resultRightElements[2][3];

    // Fourth row
    leftSideElements[3] = _mm_set_ps(inout[idx[15][0]], inout[idx[14][0]], inout[idx[13][0]], inout[idx[12][0]]);
    rightSideElements[3] = _mm_set_ps(inout[idx[15][1]], inout[idx[14][1]], inout[idx[13][1]], inout[idx[12][1]]);
    leftGERight[3] = _mm_cmpge_ps(leftSideElements[3], rightSideElements[3]);
    leftLTRight[3] = _mm_cmplt_ps(leftSideElements[3], rightSideElements[3]);
    leftElementsGE[3] = _mm_and_ps(rightSideElements[3], leftGERight[3]);
    leftElementsLT[3] = _mm_and_ps(leftSideElements[3], leftLTRight[3]);
    rightElementsGE[3] = _mm_and_ps(leftSideElements[3], leftGERight[3]);
    rightElementsLT[3] = _mm_and_ps(rightSideElements[3], leftLTRight[3]);
    leftSideElements[3] = _mm_or_ps(leftElementsGE[3], leftElementsLT[3]);
    rightSideElements[3] = _mm_or_ps(rightElementsGE[3], rightElementsLT[3]);
    _mm_storeu_ps(resultLeftElements[3], leftSideElements[3]);
    _mm_storeu_ps(resultRightElements[3], rightSideElements[3]);
    inout[idx[12][0]] = resultLeftElements[3][0]; inout[idx[12][1]] = resultRightElements[3][0];
    inout[idx[13][0]] = resultLeftElements[3][1]; inout[idx[13][1]] = resultRightElements[3][1];
    inout[idx[14][0]] = resultLeftElements[3][2]; inout[idx[14][1]] = resultRightElements[3][2];
    inout[idx[15][0]] = resultLeftElements[3][3]; inout[idx[15][1]] = resultRightElements[3][3];

    // Fifth row
    leftSideElements[4] = _mm_set_ps(inout[idx[19][0]], inout[idx[18][0]], inout[idx[17][0]], inout[idx[16][0]]);
    rightSideElements[4] = _mm_set_ps(inout[idx[19][1]], inout[idx[18][1]], inout[idx[17][1]], inout[idx[16][1]]);
    leftGERight[4] = _mm_cmpge_ps(leftSideElements[4], rightSideElements[4]);
    leftLTRight[4] = _mm_cmplt_ps(leftSideElements[4], rightSideElements[4]);
    leftElementsGE[4] = _mm_and_ps(rightSideElements[4], leftGERight[4]);
    leftElementsLT[4] = _mm_and_ps(leftSideElements[4], leftLTRight[4]);
    rightElementsGE[4] = _mm_and_ps(leftSideElements[4], leftGERight[4]);
    rightElementsLT[4] = _mm_and_ps(rightSideElements[4], leftLTRight[4]);
    leftSideElements[4] = _mm_or_ps(leftElementsGE[4], leftElementsLT[4]);
    rightSideElements[4] = _mm_or_ps(rightElementsGE[4], rightElementsLT[4]);
    _mm_storeu_ps(resultLeftElements[4], leftSideElements[4]);
    _mm_storeu_ps(resultRightElements[4], rightSideElements[4]);
    inout[idx[16][0]] = resultLeftElements[4][0]; inout[idx[16][1]] = resultRightElements[4][0];
    inout[idx[17][0]] = resultLeftElements[4][1]; inout[idx[17][1]] = resultRightElements[4][1];
    inout[idx[18][0]] = resultLeftElements[4][2]; inout[idx[18][1]] = resultRightElements[4][2];
    inout[idx[19][0]] = resultLeftElements[4][3]; inout[idx[19][1]] = resultRightElements[4][3];

    // Sixth row
    leftSideElements[5] = _mm_set_ps(inout[idx[23][0]], inout[idx[22][0]], inout[idx[21][0]], inout[idx[20][0]]);
    rightSideElements[5] = _mm_set_ps(inout[idx[23][1]], inout[idx[22][1]], inout[idx[21][1]], inout[idx[20][1]]);
    leftGERight[5] = _mm_cmpge_ps(leftSideElements[5], rightSideElements[5]);
    leftLTRight[5] = _mm_cmplt_ps(leftSideElements[5], rightSideElements[5]);
    leftElementsGE[5] = _mm_and_ps(rightSideElements[5], leftGERight[5]);
    leftElementsLT[5] = _mm_and_ps(leftSideElements[5], leftLTRight[5]);
    rightElementsGE[5] = _mm_and_ps(leftSideElements[5], leftGERight[5]);
    rightElementsLT[5] = _mm_and_ps(rightSideElements[5], leftLTRight[5]);
    leftSideElements[5] = _mm_or_ps(leftElementsGE[5], leftElementsLT[5]);
    rightSideElements[5] = _mm_or_ps(rightElementsGE[5], rightElementsLT[5]);
    _mm_storeu_ps(resultLeftElements[5], leftSideElements[5]);
    _mm_storeu_ps(resultRightElements[5], rightSideElements[5]);
    inout[idx[20][0]] = resultLeftElements[5][0]; inout[idx[20][1]] = resultRightElements[5][0];
    inout[idx[21][0]] = resultLeftElements[5][1]; inout[idx[21][1]] = resultRightElements[5][1];
    inout[idx[22][0]] = resultLeftElements[5][2]; inout[idx[22][1]] = resultRightElements[5][2];
    inout[idx[23][0]] = resultLeftElements[5][3]; inout[idx[23][1]] = resultRightElements[5][3];
}
void cv::updateMotionHistory( InputArray _silhouette, InputOutputArray _mhi,
                              double timestamp, double duration )
{
    CV_Assert( _silhouette.type() == CV_8UC1 && _mhi.type() == CV_32FC1 );
    CV_Assert( _silhouette.sameSize(_mhi) );

    float ts = (float)timestamp;
    float delbound = (float)(timestamp - duration);

    CV_OCL_RUN(_mhi.isUMat() && _mhi.dims() <= 2,
               ocl_updateMotionHistory(_silhouette, _mhi, ts, delbound))

    Mat silh = _silhouette.getMat(), mhi = _mhi.getMat();
    Size size = silh.size();

    if( silh.isContinuous() && mhi.isContinuous() )
    {
        size.width *= size.height;
        size.height = 1;
    }

#if CV_SSE2
    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);
#endif

    for( int y = 0; y < size.height; y++ )
    {
        const uchar* silhData = silh.ptr<uchar>(y);
        float* mhiData = mhi.ptr<float>(y);
        int x = 0;

#if CV_SSE2
        if( useSIMD )
        {
            __m128 ts4 = _mm_set1_ps(ts), db4 = _mm_set1_ps(delbound);
            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i z = _mm_setzero_si128();
                __m128i s = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(silhData + x)), z);
                __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(s, z)),
                       s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s, z));
                __m128 v0 = _mm_loadu_ps(mhiData + x), v1 = _mm_loadu_ps(mhiData + x + 4);
                __m128 fz = _mm_setzero_ps();

                v0 = _mm_and_ps(v0, _mm_cmpge_ps(v0, db4)); // zero out entries older than delbound
                v1 = _mm_and_ps(v1, _mm_cmpge_ps(v1, db4));
                __m128 m0 = _mm_and_ps(_mm_xor_ps(v0, ts4), _mm_cmpneq_ps(s0, fz));
                __m128 m1 = _mm_and_ps(_mm_xor_ps(v1, ts4), _mm_cmpneq_ps(s1, fz));
                v0 = _mm_xor_ps(v0, m0); // where the silhouette is set, replace with ts
                v1 = _mm_xor_ps(v1, m1);
                _mm_storeu_ps(mhiData + x, v0);
                _mm_storeu_ps(mhiData + x + 4, v1);
            }
        }
#endif

        for( ; x < size.width; x++ )
        {
            float val = mhiData[x];
            val = silhData[x] ? ts : val < delbound ? 0 : val;
            mhiData[x] = val;
        }
    }
}
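// A minimal sketch of the branchless select identity the SSE2 loop above
// relies on: for an all-ones/all-zeros mask m, v ^ ((v ^ t) & m) yields t
// where m is set and v elsewhere, i.e. (m & t) | (~m & v) with one fewer
// vector operation. select_bits is a hypothetical helper for illustration.
#include <stdint.h>
#include <assert.h>

static inline uint32_t select_bits(uint32_t mask, uint32_t t, uint32_t v)
{
    return v ^ ((v ^ t) & mask);
}

int main(void)
{
    assert(select_bits(0xFFFFFFFFu, 7u, 42u) == 7u);  // mask set: take t
    assert(select_bits(0x00000000u, 7u, 42u) == 42u); // mask clear: keep v
    return 0;
}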
IntersectionData intersectRaySpheres(const Ray& ray, const vector<int>& spheresIndices, const Spheres& spheres)
{
    const int maxSpheresToCheck = 4;
    IntersectionData result;
    result.intersection = false;
    result.tIntersection = numeric_limits<float>::max();

    const int sphereCount = (int)spheresIndices.size();
    int remainder = sphereCount % maxSpheresToCheck;
    bool canUseSIMD = (remainder < sphereCount);
    int nonSIMDStartPos = 0;

    if(canUseSIMD)
    {
        const int spheresToSIMDCheck = sphereCount - remainder;
        nonSIMDStartPos = spheresToSIMDCheck;

        // Vec4Float a = _mm_set1_ps(1.f); when rayDir is normalized, a is 1
        Vec4Float centerCoords[3], radiuses;

        for(int i = 0; i < spheresToSIMDCheck; i += 4)
        {
            // Reset the accumulators for this pack of four spheres.
            Vec4Float b = _mm_set1_ps(0.f);
            Vec4Float c = b;

            // Gather the four radii once per pack (the last lane reads index i + 3).
            radiuses = _mm_set_ps(
                spheres.radiuses[spheresIndices[i]],
                spheres.radiuses[spheresIndices[i + 1]],
                spheres.radiuses[spheresIndices[i + 2]],
                spheres.radiuses[spheresIndices[i + 3]]
            );

            for(int j = 0; j < 3; ++j)
            {
                centerCoords[j] = _mm_set_ps(
                    spheres.centerCoords[j][spheresIndices[i]],
                    spheres.centerCoords[j][spheresIndices[i + 1]],
                    spheres.centerCoords[j][spheresIndices[i + 2]],
                    spheres.centerCoords[j][spheresIndices[i + 3]]
                );
                b += 2.f * ray.direction.coords[j] * (ray.origin.coords[j] - centerCoords[j]);
                c += (ray.origin.coords[j] - centerCoords[j]) * (ray.origin.coords[j] - centerCoords[j]);
            }
            // Complete c = |origin - center|^2 - r^2 so the radii enter the discriminant.
            c -= radiuses * radiuses;

            Vec4Float D = b * b - 4.f * c;
            Vec4Float mask = _mm_cmpge_ps(D, _mm_set_ps1(0.f)); // lanes with a real root
            Vec4Float squareRootD = _mm_sqrt_ps(D);
            // Force lanes without a real root to -1 so the t >= 0 tests below reject them.
            Vec4Float minusOne = _mm_set_ps1(-1.f);
            Vec4Float t1 = _mm_or_ps(_mm_and_ps(mask, (-b - squareRootD) * 0.5f), _mm_andnot_ps(mask, minusOne));
            Vec4Float t2 = _mm_or_ps(_mm_and_ps(mask, (-b + squareRootD) * 0.5f), _mm_andnot_ps(mask, minusOne));

            float tRes = result.tIntersection;
            for(int j = 0; j < 4; ++j)
            {
                if(t1[j] >= 0 && t1[j] < tRes) tRes = t1[j];
                if(t2[j] >= 0 && t2[j] < tRes) tRes = t2[j];
            }
            if(tRes < result.tIntersection)
                result.intersection = true;
            result.tIntersection = tRes;
        }
    }

    for(int i = nonSIMDStartPos; i < (int)spheresIndices.size(); ++i)
    {
        IntersectionData data;
        int idx = spheresIndices[i];
        Sphere sphere;
        sphere.center.x = spheres.centerCoords[0][idx];
        sphere.center.y = spheres.centerCoords[1][idx];
        sphere.center.z = spheres.centerCoords[2][idx];
        sphere.radius = spheres.radiuses[idx];
        data = intersectSingleSphere(ray, sphere);
        if(data.intersection && data.tIntersection < result.tIntersection)
            result = data;
    }
    return result;
}
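// A scalar reference for one lane of the packed test above (ray_sphere_t
// is a hypothetical helper, assuming a normalized ray direction so the
// quadratic's a == 1); handy for validating the SIMD lane math.
#include <math.h>
#include <stdbool.h>

static bool ray_sphere_t(const float ro[3], const float rd[3],
                         const float center[3], float radius, float* tOut)
{
    float b = 0.f, c = 0.f;
    for (int j = 0; j < 3; ++j) {
        float oc = ro[j] - center[j];
        b += 2.f * rd[j] * oc;  // b = 2 * dot(rd, origin - center)
        c += oc * oc;           // |origin - center|^2
    }
    c -= radius * radius;
    float D = b * b - 4.f * c;  // discriminant with a == 1
    if (D < 0.f) return false;  // ray misses the sphere
    float t = (-b - sqrtf(D)) * 0.5f;        // nearer root first
    if (t < 0.f) t = (-b + sqrtf(D)) * 0.5f; // behind the origin? try the far root
    if (t < 0.f) return false;
    *tOut = t;
    return true;
}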
inline float4 gte(const float4& a, const float4& b) { return float4(_mm_cmpge_ps(a.data, b.data)); }
RETf CMPGE(const __m128 x, const __m128 y) { return _mm_cmpge_ps(x, y); }
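// A self-contained sketch (not from the libraries above) of how such thin
// comparison wrappers are typically consumed: _mm_cmpge_ps produces
// all-ones/all-zeros lanes, and _mm_movemask_ps packs their sign bits
// into a 4-bit integer that ordinary branches can test.
#include <stdio.h>
#include <xmmintrin.h>

int main(void)
{
    __m128 a = _mm_setr_ps(1.f, 2.f, 3.f, 4.f);
    __m128 b = _mm_set1_ps(2.f);
    int mask = _mm_movemask_ps(_mm_cmpge_ps(a, b)); // bit i set iff a[i] >= b[i]
    printf("%d\n", mask); // prints 14 (0b1110): lanes 1..3 satisfy a >= b
    return 0;
}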
/* motion templates */
CV_IMPL void
cvUpdateMotionHistory( const void* silhouette, void* mhimg,
                       double timestamp, double mhi_duration )
{
    CvMat silhstub, *silh = cvGetMat(silhouette, &silhstub);
    CvMat mhistub, *mhi = cvGetMat(mhimg, &mhistub);

    if( !CV_IS_MASK_ARR( silh ))
        CV_Error( CV_StsBadMask, "" );

    if( CV_MAT_TYPE( mhi->type ) != CV_32FC1 )
        CV_Error( CV_StsUnsupportedFormat, "" );

    if( !CV_ARE_SIZES_EQ( mhi, silh ))
        CV_Error( CV_StsUnmatchedSizes, "" );

    CvSize size = cvGetMatSize( mhi );
    int mhi_step = mhi->step;
    int silh_step = silh->step;

    if( CV_IS_MAT_CONT( mhi->type & silh->type ))
    {
        size.width *= size.height;
        mhi_step = silh_step = CV_STUB_STEP;
        size.height = 1;
    }

    float ts = (float)timestamp;
    float delbound = (float)(timestamp - mhi_duration);
    int x, y;

#if CV_SSE2
    volatile bool useSIMD = cv::checkHardwareSupport(CV_CPU_SSE2);
#endif

    for( y = 0; y < size.height; y++ )
    {
        const uchar* silhData = silh->data.ptr + silh->step*y;
        float* mhiData = (float*)(mhi->data.ptr + mhi->step*y);
        x = 0;

#if CV_SSE2
        if( useSIMD )
        {
            __m128 ts4 = _mm_set1_ps(ts), db4 = _mm_set1_ps(delbound);
            for( ; x <= size.width - 8; x += 8 )
            {
                __m128i z = _mm_setzero_si128();
                __m128i s = _mm_unpacklo_epi8(_mm_loadl_epi64((const __m128i*)(silhData + x)), z);
                __m128 s0 = _mm_cvtepi32_ps(_mm_unpacklo_epi16(s, z)),
                       s1 = _mm_cvtepi32_ps(_mm_unpackhi_epi16(s, z));
                __m128 v0 = _mm_loadu_ps(mhiData + x), v1 = _mm_loadu_ps(mhiData + x + 4);
                __m128 fz = _mm_setzero_ps();

                v0 = _mm_and_ps(v0, _mm_cmpge_ps(v0, db4));
                v1 = _mm_and_ps(v1, _mm_cmpge_ps(v1, db4));
                __m128 m0 = _mm_and_ps(_mm_xor_ps(v0, ts4), _mm_cmpneq_ps(s0, fz));
                __m128 m1 = _mm_and_ps(_mm_xor_ps(v1, ts4), _mm_cmpneq_ps(s1, fz));
                v0 = _mm_xor_ps(v0, m0);
                v1 = _mm_xor_ps(v1, m1);
                _mm_storeu_ps(mhiData + x, v0);
                _mm_storeu_ps(mhiData + x + 4, v1);
            }
        }
#endif

        for( ; x < size.width; x++ )
        {
            float val = mhiData[x];
            val = silhData[x] ? ts : val < delbound ? 0 : val;
            mhiData[x] = val;
        }
    }
}
__m128 test_mm_cmpge_ps(__m128 __a, __m128 __b) {
  // CHECK-LABEL: @test_mm_cmpge_ps
  // CHECK: @llvm.x86.sse.cmp.ps(<4 x float> %{{.*}}, <4 x float> %{{.*}}, i8 2)
  return _mm_cmpge_ps(__a, __b);
}
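// A sketch of why the IR check above shows predicate 2 (LE): pre-AVX SSE
// has no greater-or-equal predicate, so a >= b is lowered as b <= a with
// the operands swapped. cmpge_via_le is a hypothetical name for illustration.
#include <xmmintrin.h>

static inline __m128 cmpge_via_le(__m128 a, __m128 b)
{
    return _mm_cmple_ps(b, a); // same lane-wise mask as _mm_cmpge_ps(a, b)
}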
void Permutohedral::init ( const float* feature, int feature_size, int N )
{
    // Compute the lattice coordinates for each feature (there is going to be a lot of magic here)
    N_ = N;
    d_ = feature_size;
    HashTable hash_table( d_, N_/**(d_+1)*/ );

    const int blocksize = sizeof(__m128) / sizeof(float);
    const __m128 invdplus1 = _mm_set1_ps( 1.0f / (d_+1) );
    const __m128 dplus1    = _mm_set1_ps( d_+1 );
    const __m128 Zero      = _mm_set1_ps( 0 );
    const __m128 One       = _mm_set1_ps( 1 );

    // Allocate the class memory
    if (offset_) delete [] offset_;
    offset_ = new int[ (d_+1)*(N_+16) ];
    memset( offset_, 0, (d_+1)*(N_+16)*sizeof(int) );
    if (barycentric_) delete [] barycentric_;
    barycentric_ = new float[ (d_+1)*(N_+16) ];
    memset( barycentric_, 0, (d_+1)*(N_+16)*sizeof(float) );

    // Allocate the local memory
    __m128 * scale_factor = (__m128*) _mm_malloc( (d_  )*sizeof(__m128), 16 );
    __m128 * f            = (__m128*) _mm_malloc( (d_  )*sizeof(__m128), 16 );
    __m128 * elevated     = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 );
    __m128 * rem0         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 );
    __m128 * rank         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 );
    float * barycentric = new float[(d_+2)*blocksize];
    short * canonical = new short[(d_+1)*(d_+1)];
    short * key = new short[d_+1];

    // Compute the canonical simplex
    for( int i=0; i<=d_; i++ ){
        for( int j=0; j<=d_-i; j++ )
            canonical[i*(d_+1)+j] = i;
        for( int j=d_-i+1; j<=d_; j++ )
            canonical[i*(d_+1)+j] = i - (d_+1);
    }

    // Expected standard deviation of our filter (p.6 in [Adams et al. 2010])
    float inv_std_dev = sqrt(2.0 / 3.0)*(d_+1);
    // Compute the diagonal part of E (p.5 in [Adams et al. 2010])
    for( int i=0; i<d_; i++ )
        scale_factor[i] = _mm_set1_ps( 1.0 / sqrt( float((i+2)*(i+1)) * inv_std_dev ) );

    // Setup the SSE rounding
#ifndef __SSE4_1__
    const unsigned int old_rounding = _mm_getcsr();
    _mm_setcsr( (old_rounding&~_MM_ROUND_MASK) | _MM_ROUND_NEAREST );
#endif

    // Compute the simplex each feature lies in
    for( int k=0; k<N_; k+=blocksize ){
        // Load the feature from memory
        float * ff = (float*)f;
        for( int j=0; j<d_; j++ )
            for( int i=0; i<blocksize; i++ )
                ff[ j*blocksize + i ] = k+i < N_ ? feature[ (k+i)*d_+j ] : 0.0;

        // Elevate the feature ( y = Ep, see p.5 in [Adams et al. 2010])
        // sm contains the sum of 1..n of our feature vector
        __m128 sm = Zero;
        for( int j=d_; j>0; j-- ){
            __m128 cf = f[j-1]*scale_factor[j-1];
            elevated[j] = sm - _mm_set1_ps(j)*cf;
            sm += cf;
        }
        elevated[0] = sm;

        // Find the closest 0-colored simplex through rounding
        __m128 sum = Zero;
        for( int i=0; i<=d_; i++ ){
            __m128 v = invdplus1 * elevated[i];
#ifdef __SSE4_1__
            v = _mm_round_ps( v, _MM_FROUND_TO_NEAREST_INT );
#else
            v = _mm_cvtepi32_ps( _mm_cvtps_epi32( v ) );
#endif
            rem0[i] = v*dplus1;
            sum += v;
        }

        // Find the simplex we are in and store it in rank (where rank describes
        // what position coordinate i has in the sorted order of the feature values)
        for( int i=0; i<=d_; i++ )
            rank[i] = Zero;
        for( int i=0; i<d_; i++ ){
            __m128 di = elevated[i] - rem0[i];
            for( int j=i+1; j<=d_; j++ ){
                __m128 dj = elevated[j] - rem0[j];
                __m128 c = _mm_and_ps( One, _mm_cmplt_ps( di, dj ) );
                rank[i] += c;
                rank[j] += One-c;
            }
        }

        // If the point doesn't lie on the plane (sum != 0) bring it back
        for( int i=0; i<=d_; i++ ){
            rank[i] += sum;
            __m128 add = _mm_and_ps( dplus1, _mm_cmplt_ps( rank[i], Zero ) );
            __m128 sub = _mm_and_ps( dplus1, _mm_cmpge_ps( rank[i], dplus1 ) );
            rank[i] += add-sub;
            rem0[i] += add-sub;
        }

        // Compute the barycentric coordinates (p.10 in [Adams et al. 2010])
        for( int i=0; i<(d_+2)*blocksize; i++ )
            barycentric[ i ] = 0;
        for( int i=0; i<=d_; i++ ){
            __m128 v = (elevated[i] - rem0[i])*invdplus1;

            // Didn't figure out how to SSE this
            float * fv = (float*)&v;
            float * frank = (float*)&rank[i];
            for( int j=0; j<blocksize; j++ ){
                int p = d_-frank[j];
                barycentric[j*(d_+2)+p  ] += fv[j];
                barycentric[j*(d_+2)+p+1] -= fv[j];
            }
        }

        // The rest is not SSE'd
        for( int j=0; j<blocksize; j++ ){
            // Wrap around
            barycentric[j*(d_+2)+0] += 1 + barycentric[j*(d_+2)+d_+1];

            float * frank = (float*)rank;
            float * frem0 = (float*)rem0;
            // Compute all vertices and their offset
            for( int remainder=0; remainder<=d_; remainder++ ){
                for( int i=0; i<d_; i++ ){
                    key[i] = frem0[i*blocksize+j] + canonical[ remainder*(d_+1) + (int)frank[i*blocksize+j] ];
                }
                offset_[ (j+k)*(d_+1)+remainder ] = hash_table.find( key, true );
                barycentric_[ (j+k)*(d_+1)+remainder ] = barycentric[ j*(d_+2)+remainder ];
            }
        }
    }
    _mm_free( scale_factor );
    _mm_free( f );
    _mm_free( elevated );
    _mm_free( rem0 );
    _mm_free( rank );
    delete [] barycentric;
    delete [] canonical;
    delete [] key;

    // Reset the SSE rounding
#ifndef __SSE4_1__
    _mm_setcsr( old_rounding );
#endif

    // This is normally fast enough so no SSE needed here
    // Find the neighbors of each lattice point

    // Get the number of vertices in the lattice
    M_ = hash_table.size();

    // Create the neighborhood structure
    if(blur_neighbors_) delete[] blur_neighbors_;
    blur_neighbors_ = new Neighbors[ (d_+1)*M_ ];

    short * n1 = new short[d_+1];
    short * n2 = new short[d_+1];

    // For each of d+1 axes,
    for( int j = 0; j <= d_; j++ ){
        for( int i=0; i<M_; i++ ){
            const short * key = hash_table.getKey( i );
            for( int k=0; k<d_; k++ ){
                n1[k] = key[k] - 1;
                n2[k] = key[k] + 1;
            }
            n1[j] = key[j] + d_;
            n2[j] = key[j] - d_;

            blur_neighbors_[j*M_+i].n1 = hash_table.find( n1 );
            blur_neighbors_[j*M_+i].n2 = hash_table.find( n2 );
        }
    }
    delete[] n1;
    delete[] n2;
}
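// A minimal sketch of the rounding fallback used above: without SSE4.1,
// round-to-nearest goes through the float->int->float conversion pair,
// whose behaviour follows the MXCSR rounding field, which is why the code
// saves, sets and restores it. round_nearest_sse2 is a hypothetical name.
#include <emmintrin.h>

static inline __m128 round_nearest_sse2(__m128 v)
{
    // Assumes MXCSR is already set to _MM_ROUND_NEAREST.
    return _mm_cvtepi32_ps(_mm_cvtps_epi32(v));
}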
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece,
              void *ivoid, void *ovoid,
              const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out)
{
    const dt_iop_colorout_data_t *const d = (dt_iop_colorout_data_t *)piece->data;
    const int ch = piece->colors;
    const int gamutcheck = (d->softproof_enabled == DT_SOFTPROOF_GAMUTCHECK);

    if(!isnan(d->cmatrix[0]))
    {
        //fprintf(stderr,"Using cmatrix codepath\n");
        // convert to rgb using matrix
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
        for(int j=0; j<roi_out->height; j++)
        {
            float *in  = (float*)ivoid + (size_t)ch*roi_in->width *j;
            float *out = (float*)ovoid + (size_t)ch*roi_out->width*j;
            const __m128 m0 = _mm_set_ps(0.0f,d->cmatrix[6],d->cmatrix[3],d->cmatrix[0]);
            const __m128 m1 = _mm_set_ps(0.0f,d->cmatrix[7],d->cmatrix[4],d->cmatrix[1]);
            const __m128 m2 = _mm_set_ps(0.0f,d->cmatrix[8],d->cmatrix[5],d->cmatrix[2]);
            for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
            {
                const __m128 xyz = dt_Lab_to_XYZ_SSE(_mm_load_ps(in));
                const __m128 t =
                    _mm_add_ps(_mm_mul_ps(m0,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(0,0,0,0))),
                    _mm_add_ps(_mm_mul_ps(m1,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(1,1,1,1))),
                               _mm_mul_ps(m2,_mm_shuffle_ps(xyz,xyz,_MM_SHUFFLE(2,2,2,2)))));
                _mm_stream_ps(out,t);
            }
        }
        _mm_sfence();

        // apply profile
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(roi_in,roi_out, ivoid, ovoid)
#endif
        for(int j=0; j<roi_out->height; j++)
        {
            float *in  = (float*)ivoid + (size_t)ch*roi_in->width *j;
            float *out = (float*)ovoid + (size_t)ch*roi_out->width*j;
            for(int i=0; i<roi_out->width; i++, in+=ch, out+=ch )
            {
                // per-channel tone curve: LUT below 1.0, extrapolation above
                for(int c=0; c<3; c++)
                    if (d->lut[c][0] >= 0.0f)
                    {
                        out[c] = (out[c] < 1.0f) ? lerp_lut(d->lut[c], out[c])
                                                 : dt_iop_eval_exp(d->unbounded_coeffs[c], out[c]);
                    }
            }
        }
    }
    else
    {
        //fprintf(stderr,"Using xform codepath\n");
        const __m128 outofgamutpixel = _mm_set_ps(0.0f, 1.0f, 1.0f, 0.0f);
#ifdef _OPENMP
#pragma omp parallel for schedule(static) default(none) shared(ivoid, ovoid, roi_out)
#endif
        for (int k=0; k<roi_out->height; k++)
        {
            const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width;
            float *out      = ((float *)ovoid) + (size_t)ch*k*roi_out->width;

            if(!gamutcheck)
            {
                cmsDoTransform(d->xform, in, out, roi_out->width);
            }
            else
            {
                void *rgb = dt_alloc_align(16, 4*sizeof(float)*roi_out->width);
                cmsDoTransform(d->xform, in, rgb, roi_out->width);
                float *rgbptr = (float *)rgb;
                for (int j=0; j<roi_out->width; j++,rgbptr+=4,out+=4)
                {
                    const __m128 pixel = _mm_load_ps(rgbptr);
                    const __m128 ingamut = _mm_cmpge_ps(pixel, _mm_setzero_ps());
                    // keep in-gamut channels, mark negative ones with the warning color
                    const __m128 result = _mm_or_ps(_mm_andnot_ps(ingamut, outofgamutpixel),
                                                    _mm_and_ps(ingamut, pixel));
                    _mm_stream_ps(out, result);
                }
                dt_free_align(rgb);
            }
        }
        _mm_sfence();
    }
    if(piece->pipe->mask_display)
        dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height);
}
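// A sketch (not darktable's code): on SSE4.1 the and/andnot/or select used
// for the gamut check collapses into a single blend on the comparison
// mask. select_ps is a hypothetical helper name.
#include <smmintrin.h>

static inline __m128 select_ps(__m128 mask, __m128 a, __m128 b)
{
    // Per lane: mask set -> a, mask clear -> b. _mm_blendv_ps picks by the
    // sign bit, which a compare mask sets on its true lanes.
    return _mm_blendv_ps(b, a, mask);
}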