kw_mat4 kw_mul(kw_mat4 m1, kw_mat4 m2){ __m128 c0 = _mm_set_ps(m2.rc[3][0], m2.rc[2][0], m2.rc[1][0], m2.rc[0][0]); __m128 c1 = _mm_set_ps(m2.rc[3][1], m2.rc[2][1], m2.rc[1][1], m2.rc[0][1]); __m128 c2 = _mm_set_ps(m2.rc[3][2], m2.rc[2][2], m2.rc[1][2], m2.rc[0][2]); __m128 c3 = _mm_set_ps(m2.rc[3][3], m2.rc[2][3], m2.rc[1][3], m2.rc[0][3]); kw_mat4 result; result.rc[0][0] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[0], c0, 0xf1)); result.rc[0][1] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[0], c1, 0xf1)); result.rc[0][2] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[0], c2, 0xf1)); result.rc[0][3] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[0], c3, 0xf1)); result.rc[1][0] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[1], c0, 0xf1)); result.rc[1][1] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[1], c1, 0xf1)); result.rc[1][2] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[1], c2, 0xf1)); result.rc[1][3] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[1], c3, 0xf1)); result.rc[2][0] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[2], c0, 0xf1)); result.rc[2][1] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[2], c1, 0xf1)); result.rc[2][2] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[2], c2, 0xf1)); result.rc[2][3] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[2], c3, 0xf1)); result.rc[3][0] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[3], c0, 0xf1)); result.rc[3][1] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[3], c1, 0xf1)); result.rc[3][2] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[3], c2, 0xf1)); result.rc[3][3] = _mm_cvtss_f32(_mm_dp_ps(m1.simd[3], c3, 0xf1)); return result; }
/** process, all real work is done here. */ void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, void *i, void *o, const dt_iop_roi_t *roi_in, const dt_iop_roi_t *roi_out) { // this is called for preview and full pipe separately, each with its own pixelpipe piece. assert(dt_iop_module_colorspace(self) == iop_cs_Lab); // get our data struct: dt_iop_colorcontrast_params_t *d = (dt_iop_colorcontrast_params_t *)piece->data; // how many colors in our buffer? const int ch = piece->colors; // iterate over all output pixels (same coordinates as input) #ifdef _OPENMP // optional: parallelize it! #pragma omp parallel for default(none) schedule(static) shared(i,o,roi_in,roi_out,d) #endif for(int j=0; j<roi_out->height; j++) { float *in = ((float *)i) + ch*roi_in->width *j; float *out = ((float *)o) + ch*roi_out->width*j; const __m128 scale = _mm_set_ps(0.0f,d->b_steepness,d->a_steepness,1.0f); const __m128 offset = _mm_set_ps(0.0f,d->b_offset,d->a_offset,0.0f); const __m128 min = _mm_set_ps(0.0f,-128.0f,-128.0f, -INFINITY); const __m128 max = _mm_set_ps(0.0f, 128.0f, 128.0f, INFINITY); for(int i=0; i<roi_out->width; i++) { _mm_stream_ps(out,_mm_min_ps(max,_mm_max_ps(min,_mm_add_ps(offset,_mm_mul_ps(scale,_mm_load_ps(in)))))); in+=ch; out+=ch; } } _mm_sfence(); }
/* utest_logf(): Test range/domain of logf */ static void utest_logf(ESL_GETOPTS *go) { __m128 x; /* test input */ union { __m128 v; float x[4]; } r; /* test output */ /* Test IEEE754 specials: * log(-inf) = NaN log(x<0) = NaN log(-0) = NaN * log(0) = -inf log(inf) = inf log(NaN) = NaN */ x = _mm_set_ps(0.0, -0.0, -1.0, -eslINFINITY); /* set_ps() is in order 3 2 1 0 */ r.v = esl_sse_logf(x); if (esl_opt_GetBoolean(go, "-v")) { printf("logf"); esl_sse_dump_ps(stdout, x); printf(" ==> "); esl_sse_dump_ps(stdout, r.v); printf("\n"); } if (! isnan(r.x[0])) esl_fatal("logf(-inf) should be NaN"); if (! isnan(r.x[1])) esl_fatal("logf(-1) should be NaN"); if (! isnan(r.x[2])) esl_fatal("logf(-0) should be NaN"); if (! (r.x[3] < 0 && isinf(r.x[3]))) esl_fatal("logf(0) should be -inf"); x = _mm_set_ps(FLT_MAX, FLT_MIN, eslNaN, eslINFINITY); r.v = esl_sse_logf(x); if (esl_opt_GetBoolean(go, "-v")) { printf("logf"); esl_sse_dump_ps(stdout, x); printf(" ==> "); esl_sse_dump_ps(stdout, r.v); printf("\n"); } if (! isinf(r.x[0])) esl_fatal("logf(inf) should be inf"); if (! isnan(r.x[1])) esl_fatal("logf(NaN) should be NaN"); }
inline void LimitTurnUP() { float Dlimit=max(0.0f,min(1.0f,(ViewLen+moveZ-3000.0f)/4000.0f)); //TestNum=ViewLen+moveZ; float TurnUP=Easy_vector_dot(_mm_set_ps(1.0f,0.0f,1.0f,0.0f),ViewMat[2]); if(SceneSelect>-1) Dlimit=0.0f; if(TurnUP<0.45f+Dlimit*0.285f) { //ViewUnit.RotInternal(min(0.0f,PosTurn[1]),1.0f,0.0f,0.0f); __m128 ViewUnitMATTMP1[4]; ViewUnit.GetMatrix(ViewUnitMATTMP1); float TurnUPTMP2=Easy_vector_dot(_mm_set_ps(1.0f,0.0f,1.0f,0.0f),ViewUnitMATTMP1[2]); if(SceneSelect==-1) { while(TurnUPTMP2<(0.45f+Dlimit*0.285f)) { ViewUnit.RotInternal(-0.001f,1.0f,0.0f,0.0f); ViewUnit.GetMatrix(ViewUnitMATTMP1); TurnUPTMP2=Easy_vector_dot(_mm_set_ps(1.0f,0.0f,1.0f,0.0f),ViewUnitMATTMP1[2]); } } return; } if(TurnUP>0.75f) { ViewUnit.RotInternal(max(0.0f,PosTurn[1]),1.0f,0.0f,0.0f); return; } ViewUnit.RotInternal(PosTurn[1],1.0f,0.0f,0.0f); }
static void inline histogram_helper_cs_Lab_helper_process_pixel_m128( const dt_dev_histogram_collection_params_t *const histogram_params, const float *pixel, uint32_t *histogram) { const float fscale = (float)(histogram_params->bins_count); const __m128 shift = _mm_set_ps(0.0f, 128.0f, 128.0f, 0.0f); const __m128 scale = _mm_set_ps(fscale / 1.0f, fscale / 256.0f, fscale / 256.0f, fscale / 100.0f); const __m128 val_min = _mm_setzero_ps(); const __m128 val_max = _mm_set1_ps(histogram_params->bins_count - 1); assert(dt_is_aligned(pixel, 16)); const __m128 input = _mm_load_ps(pixel); const __m128 shifted = _mm_add_ps(input, shift); const __m128 scaled = _mm_mul_ps(shifted, scale); const __m128 clamped = _mm_max_ps(_mm_min_ps(scaled, val_max), val_min); const __m128i indexes = _mm_cvtps_epi32(clamped); __m128i values __attribute__((aligned(16))); _mm_store_si128(&values, indexes); const uint32_t *valuesi = (uint32_t *)(&values); histogram[4 * valuesi[0]]++; histogram[4 * valuesi[1] + 1]++; histogram[4 * valuesi[2] + 2]++; }
void NBodyAlgorithm::calculateAcceleration(const float3(&posI)[4], const float massJ, const float3 posJ, __m128 accIx, __m128 accIy, __m128 accIz, float *accI) { __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x); __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y); __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z); __m128 pjx = _mm_set_ps1(posJ.x); __m128 pjy = _mm_set_ps1(posJ.y); __m128 pjz = _mm_set_ps1(posJ.z); __m128 rx = _mm_sub_ps(pjx, pix); __m128 ry = _mm_sub_ps(pjy, piy); __m128 rz = _mm_sub_ps(pjz, piz); __m128 eps2 = _mm_set_ps1(mp_properties->eps2); __m128 rx2 = _mm_mul_ps(rx, rx); __m128 ry2 = _mm_mul_ps(ry, ry); __m128 rz2 = _mm_mul_ps(rz, rz); __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2))); __m128 m = _mm_set_ps1(massJ); __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs)); __m128 aix = _mm_mul_ps(rx, rabsInv); __m128 aiy = _mm_mul_ps(ry, rabsInv); __m128 aiz = _mm_mul_ps(rz, rabsInv); accIx = _mm_add_ps(accIx, aix); accIy = _mm_add_ps(accIy, aiy); accIz = _mm_add_ps(accIz, aiz); _mm_storer_ps(accI, accIx); _mm_storer_ps(accI + 4, accIy); _mm_storer_ps(accI + 8, accIz); }
void NBodyAlgorithm::calculateAcceleration(const float3(&posI)[4], const float massJ, const float3 posJ, float3(&accI)[4]) { __m128 pix = _mm_set_ps(posI[0].x, posI[1].x, posI[2].x, posI[3].x); __m128 piy = _mm_set_ps(posI[0].y, posI[1].y, posI[2].y, posI[3].y); __m128 piz = _mm_set_ps(posI[0].z, posI[1].z, posI[2].z, posI[3].z); __m128 pjx = _mm_set_ps1(posJ.x); __m128 pjy = _mm_set_ps1(posJ.y); __m128 pjz = _mm_set_ps1(posJ.z); __m128 rx = _mm_sub_ps(pjx, pix); __m128 ry = _mm_sub_ps(pjy, piy); __m128 rz = _mm_sub_ps(pjz, piz); __m128 eps2 = _mm_set_ps1(mp_properties->eps2); __m128 rx2 = _mm_mul_ps(rx, rx); __m128 ry2 = _mm_mul_ps(ry, ry); __m128 rz2 = _mm_mul_ps(rz, rz); __m128 rabs = _mm_sqrt_ps(_mm_add_ps(_mm_add_ps(rx2, ry2), _mm_add_ps(rz2, eps2))); __m128 m = _mm_set_ps1(massJ); __m128 rabsInv = _mm_div_ps(m, _mm_mul_ps(_mm_mul_ps(rabs, rabs), rabs)); __m128 aix = _mm_mul_ps(rx, rabsInv); __m128 aiy = _mm_mul_ps(ry, rabsInv); __m128 aiz = _mm_mul_ps(rz, rabsInv); for (int i = 0; i < 4; i++) { accI[3 - i].x = aix.m128_f32[i]; accI[3 - i].y = aiy.m128_f32[i]; accI[3 - i].z = aiz.m128_f32[i]; } }
int main(void) { float a0 = 1.01f; float a1 = 1.02f; float a2 = 1.03f; float a3 = 1.04f; float b0 = 13.33f; float b1 = 13.34f; float b2 = 13.35f; float b3 = 13.36f; float res0 = test_scalar(a0, b0); float res1 = test_scalar(a1, b1); float res2 = test_scalar(a2, b2); float res3 = test_scalar(a3, b3); __m128 av = _mm_set_ps(a0, a1, a2, a3); __m128 bv = _mm_set_ps(b0, b1, b2, b3); // fake use to prevent deletion of target function __m128 resv = test_to_be_generated(av, bv); printf("res (scalar): %f %f %f %f\n", res0, res1, res2, res3); printf("res (packetized): %f %f %f %f\n", ((float*)&resv)[0], ((float*)&resv)[1], ((float*)&resv)[2], ((float*)&resv)[3]); return 0; }
// Sse version of RevbinPermuteInv function. static void RevbinPermuteInvSse(const OMX_F32 *in, OMX_F32 *out, const OMX_F32 *twiddle, OMX_INT n) { OMX_INT i; OMX_INT j; OMX_INT n_by_2 = n >> 1; OMX_INT n_by_4 = n >> 2; const OMX_F32 *tw; const OMX_F32 *pi; const OMX_F32 *pj; VC v_i; VC v_j; VC v_big_a; VC v_big_b; VC v_temp; VC v_tw; for (i = 0, j = n_by_2 - 3; i < n_by_4; i += 4, j -= 4) { pi = in + (i << 1); pj = in + (j << 1); VC_LOAD_INTERLEAVE(&v_i, pi); v_j.real = _mm_set_ps(pj[0], pj[2], pj[4], pj[6]); v_j.imag = _mm_set_ps(pj[1], pj[3], pj[5], pj[7]); // A[k] = (X[k] + X'[N/2 - k]) VC_ADD_SUB(&v_big_a, &v_i, &v_j); // temp = (X[k] - X'[N/2 - k]) VC_SUB_ADD(&v_temp, &v_i, &v_j); // W[k] tw = twiddle + i; VC_LOAD_SPLIT(&v_tw, tw, n); // B[k] = (X[k] - X'[N/2 - k]) * W[k] VC_CONJ_MUL(&v_big_b, &v_temp, &v_tw); // Convert split format to interleaved format. // Z[k] = (A[k] + j * B[k]) (k = 0, ..., N/2 - 1) // The scaling of 1/2 will be merged into to the scaling in // the last step before the output in omxSP_FFTInv_CCSToR_F32_Sfs. VC_ADD_X_STORE_SPLIT((out + i), &v_big_a, &v_big_b, n_by_2); VC_SUB_X_INVERSE_STOREU_SPLIT((out + j), &v_big_a, &v_big_b, n_by_2); } // The n_by_2 complex point out[n_by_4] = 2.0f * in[n_by_2]; out[n_by_4 + n_by_2] = -2.0f * in[n_by_2 + 1]; // The first complex point out[0] = in[0] + in[n]; out[n_by_2] = in[0] - in[n]; }
void mat4::set(float m00, float m01, float m02, float m03, float m10, float m11, float m12, float m13, float m20, float m21, float m22, float m23, float m30, float m31, float m32, float m33) { m[0] = _mm_set_ps(m03, m02, m01, m00); m[1] = _mm_set_ps(m13, m12, m11, m10); m[2] = _mm_set_ps(m23, m22, m21, m20); m[3] = _mm_set_ps(m33, m32, m31, m30); }
/* utest_expf(): Test range/domain of expf */ static void utest_expf(ESL_GETOPTS *go) { __m128 x; /* test input */ union { __m128 v; float x[4]; } r; /* test output */ /* exp(-inf) = 0 exp(-0) = 1 exp(0) = 1 exp(inf) = inf exp(NaN) = NaN */ x = _mm_set_ps(eslINFINITY, 0.0, -0.0, -eslINFINITY); /* set_ps() is in order 3 2 1 0 */ r.v = esl_sse_expf(x); if (esl_opt_GetBoolean(go, "-v")) { printf("expf"); esl_sse_dump_ps(stdout, x); printf(" ==> "); esl_sse_dump_ps(stdout, r.v); printf("\n"); } if (r.x[0] != 0.0f) esl_fatal("expf(-inf) should be 0"); if (r.x[1] != 1.0f) esl_fatal("logf(-0) should be 1"); if (r.x[2] != 1.0f) esl_fatal("logf(0) should be 1"); if (! isinf(r.x[3])) esl_fatal("logf(inf) should be inf"); /* exp(NaN) = NaN exp(large) = inf exp(-large) = 0 exp(1) = exp(1) */ x = _mm_set_ps(1.0f, -666.0f, 666.0f, eslNaN); /* set_ps() is in order 3 2 1 0 */ r.v = esl_sse_expf(x); if (esl_opt_GetBoolean(go, "-v")) { printf("expf"); esl_sse_dump_ps(stdout, x); printf(" ==> "); esl_sse_dump_ps(stdout, r.v); printf("\n"); } if (! isnan(r.x[0])) esl_fatal("expf(NaN) should be NaN"); if (! isinf(r.x[1])) esl_fatal("expf(large x) should be inf"); if (r.x[2] != 0.0f) esl_fatal("expf(-large x) should be 0"); /* Make sure we are correct around the problematic ~minlogf boundary: * (1) e^x for x < -127.5 log2 + epsilon is 0, because that's our minlogf barrier. * (2) e^x for -127.5 log2 < x < -126.5 log2 is 0 too, but is actually calculated * (3) e^x for -126.5 log2 < x should be finite (and close to FLT_MIN) * * minlogf = -127.5 log(2) + epsilon = -88.3762626647949; * and -126.5 log(2) = -87.68311834 * so for * (1): expf(-88.3763) => 0 * (2): expf(-88.3762) => 0 * (3): expf(-87.6832) => 0 * (4): expf(-87.6831) => <FLT_MIN (subnormal) : ~8.31e-39 (may become 0 in flush-to-zero mode for subnormals) */ x = _mm_set_ps(-88.3763, -88.3762, -87.6832, -87.6831); r.v = esl_sse_expf(x); if (esl_opt_GetBoolean(go, "-v")) { printf("expf"); esl_sse_dump_ps(stdout, x); printf(" ==> "); esl_sse_dump_ps(stdout, r.v); printf("\n"); } if ( r.x[0] >= FLT_MIN) esl_fatal("expf( -126.5 log2 + eps) should be around FLT_MIN"); if ( r.x[1] != 0.0f) esl_fatal("expf( -126.5 log2 - eps) should be 0.0 (by calculation)"); if ( r.x[2] != 0.0f) esl_fatal("expf( -127.5 log2 + eps) should be 0.0 (by calculation)"); if ( r.x[3] != 0.0f) esl_fatal("expf( -127.5 log2 - eps) should be 0.0 (by min bound): %g", r.x[0]); }
int main(void) { // testing dense matrix // /****************************************************/ /* LtkInt2x2 a; */ /* LtkInt4x1 b; */ /* LtkInt4x4 c; */ /* LtkInt4 v, w; */ /* */ /* v = LtkInt4_ltk_vector_new (); */ /* w = LtkInt4_ltk_vector_new (); */ /* */ /* int i; */ /* for (i = 0; i < LtkInt4_ltk_vector_size (); ++i) */ /* { */ /* LtkInt4_ltk_vector_set (v, i, 0); */ /* LtkInt4_ltk_vector_set (w, i, 3); */ /* } */ /* */ /* printf ("v = "); */ /* for (i = 0; i < 4; ++i) */ /* printf ("%d ", LtkInt4_ltk_vector_get (v, i)); */ /* printf ("\n"); */ /* */ /* printf ("w = "); */ /* for (i = 0; i < 4; ++i) */ /* printf ("%d ", LtkInt4_ltk_vector_get (w, i)); */ /* printf ("\n"); */ /****************************************************/ // testing intrinsics // printf("INSTRUCTION SET -> %d\n",INSTRSET); __m128 aligned_float = _mm_setzero_ps(); float *p = &aligned_float; printf("%f,%f,%f,%f\n",p[0],p[1],p[2],p[3]); // adder // __m128 a_1,a_2; a_1 = _mm_set_ps(3,3,3,3); a_2 = _mm_set_ps(1,2,3,4); aligned_float = _mm_add_ps(a_1,a_2); p = &aligned_float; printf("%f,%f,%f,%f\n",p[0],p[1],p[2],p[3]); // testing Objects // //LTK_MATRIX_DECLARE(TypeName, type, r, c); // Object *ob = New(ObjectClass); // int el = ObjectClass->GetElement(ob); return 0; }
static void bar(float (& inout)[8]) { static __m128 first; static __m128 second; static __m128 cmp1; static __m128 cmp2; static __m128 res1; static __m128 res2; static __m128 temp; static __m128 res3; static __m128 res4; static float result1[4]; static float result2[4]; const size_t idx[][2] = { {0, 1}, {3, 2}, {4, 5}, {7, 6}, {0, 2}, {1, 3}, {6, 4}, {7, 5}, {0, 1}, {2, 3}, {5, 4}, {7, 6}, {0, 4}, {1, 5}, {2, 6}, {3, 7}, {0, 2}, {1, 3}, {4, 6}, {5, 7}, {0, 1}, {2, 3}, {4, 5}, {6, 7} }; // 24 = sizeof(idx)/sizeof(idx[0]) for(int i = 0 ; i < 24 ; i+=4) { // the first and second are packed vectors of the i-th element to the i-th +3 // reversed because the _mm_set_ps() reverses the data for some reasons first = _mm_set_ps(inout[idx[i+3][0]], inout[idx[i+2][0]], inout[idx[i+1][0]], inout[idx[i][0]]); second = _mm_set_ps(inout[idx[i+3][1]], inout[idx[i+2][1]], inout[idx[i+1][1]], inout[idx[i][1]]); // cmpge because if cmpgt(greater then) it will be bugged for array with equal data insside ex [1,1,1,1,1] -> [0,0,0,0] cmp1 = _mm_cmpge_ps(first, second); cmp2 = _mm_cmpge_ps(second, first); // the formula // x = (c & y) | (!c & x) // y = (c & x) | (!c & y) // where x and y are elements res1 = _mm_and_ps(second, cmp1); res2 = _mm_and_ps(first, cmp2); res3 = _mm_and_ps(first, cmp1); res4 = _mm_and_ps(second, cmp2); first = _mm_or_ps(res1, res2); second = _mm_or_ps(res3, res4); // put them on the positions _mm_storeu_ps(result1, first); _mm_storeu_ps(result2, second); for(int j = 0 ; j < 4 ; ++j) { inout[idx[i+j][0]] = result1[j]; inout[idx[i+j][1]] = result2[j]; } } }
void test() { const int length = 4; float product[128*4] __attribute__ ((aligned(16))); __m128 x = _mm_set_ps(1.0f,2.0f,3.0f,4.0f); __m128 y = _mm_set_ps(1.0f,2.0f,3.0f,4.0f); __m128 z = _mm_add_ps(x,y); _mm_store_ps(product,z); fprintf(stderr, "%f %f %f %f\n", product[0], product[1], product[2], product[3]); }
inline IsotropicDipoleQuery(const Spectrum &_zr, const Spectrum &_zv, const Spectrum &_sigmaTr, Float Fdt, const Point &p) : Fdt(Fdt), p(p) { zr = _mm_set_ps(_zr[0], _zr[1], _zr[2], 0); zv = _mm_set_ps(_zv[0], _zv[1], _zv[2], 0); sigmaTr = _mm_set_ps(_sigmaTr[0], _sigmaTr[1], _sigmaTr[2], 0); zrSqr = _mm_mul_ps(zr, zr); zvSqr = _mm_mul_ps(zv, zv); result.ps = _mm_setzero_ps(); count = 0; }
void process (struct dt_iop_module_t *self, dt_dev_pixelpipe_iop_t *piece, const void * const ivoid, void *ovoid, const dt_iop_roi_t *roi_in, const dt_iop_roi_t * const roi_out) { dt_develop_t *dev = self->dev; const int ch = piece->colors; const __m128 upper = _mm_set_ps(FLT_MAX, dev->overexposed.upper / 100.0f, dev->overexposed.upper / 100.0f, dev->overexposed.upper / 100.0f); const __m128 lower = _mm_set_ps(FLT_MAX, dev->overexposed.lower / 100.0f, dev->overexposed.lower / 100.0f, dev->overexposed.lower / 100.0f); const int colorscheme = dev->overexposed.colorscheme; const __m128 upper_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][0]); const __m128 lower_color = _mm_load_ps(dt_iop_overexposed_colors[colorscheme][1]); #ifdef _OPENMP #pragma omp parallel for default(none) shared(ovoid) schedule(static) #endif for(int k=0; k<roi_out->height; k++) { const float *in = ((float *)ivoid) + (size_t)ch*k*roi_out->width; float *out = ((float *)ovoid) + (size_t)ch*k*roi_out->width; for (int j=0; j<roi_out->width; j++,in+=4,out+=4) { const __m128 pixel = _mm_load_ps(in); __m128 isoe = _mm_cmpge_ps(pixel, upper); isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe)); isoe = _mm_or_ps(_mm_unpacklo_ps(isoe, isoe), _mm_unpackhi_ps(isoe, isoe)); __m128 isue = _mm_cmple_ps(pixel, lower); isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue)); isue = _mm_and_ps(_mm_unpacklo_ps(isue, isue), _mm_unpackhi_ps(isue, isue)); __m128 result = _mm_or_ps(_mm_andnot_ps(isoe, pixel), _mm_and_ps(isoe, upper_color)); result = _mm_or_ps(_mm_andnot_ps(isue, result), _mm_and_ps(isue, lower_color)); _mm_stream_ps(out, result); } } _mm_sfence(); if(piece->pipe->mask_display) dt_iop_alpha_copy(ivoid, ovoid, roi_out->width, roi_out->height); }
inline void GDALCopy2WordsSSE(const float* pValueIn, Tout* const &pValueOut) { float fMaxVal, fMinVal; GDALGetDataLimits<float, Tout>(fMaxVal, fMinVal); __m128 xmm = _mm_set_ps(0, 0, pValueIn[1], pValueIn[0]); __m128 xmm_min = _mm_set_ps(0, 0, fMinVal, fMinVal); __m128 xmm_max = _mm_set_ps(0, 0, fMaxVal, fMaxVal); xmm = _mm_min_ps(_mm_max_ps(xmm, xmm_min), xmm_max); pValueOut[0] = _mm_cvtss_si32(xmm); pValueOut[1] = _mm_cvtss_si32(_mm_shuffle_ps(xmm, xmm, _MM_SHUFFLE(0, 0, 0, 1))); }
void M_MatrixTranslatev44_SSE(M_Matrix44 *M, M_Vector3 v) { M_Matrix44 T; T.m1 = _mm_set_ps(v.x, 0.0f, 0.0f, 1.0f); T.m2 = _mm_set_ps(v.y, 0.0f, 1.0f, 0.0f); T.m3 = _mm_set_ps(v.z, 1.0f, 0.0f, 0.0f); T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); M_MatrixMult44v_SSE(M, &T); }
int distance_scan_to_map( map_t * map, scan_t * scan, position_t position) { int npoints = 0; /* number of points where scan matches map */ int64_t sum = 0; /* sum of map values at those points */ /* Pre-compute sine and cosine of angle for rotation */ double position_theta_radians = radians(position.theta_degrees); double costheta = cos(position_theta_radians) * map->scale_pixels_per_mm; double sintheta = sin(position_theta_radians) * map->scale_pixels_per_mm; /* Pre-compute pixel offset for translation */ double pos_x_pix = position.x_mm * map->scale_pixels_per_mm; double pos_y_pix = position.y_mm * map->scale_pixels_per_mm; __m128 sincos128 = _mm_set_ps (costheta, -sintheta, sintheta, costheta); __m128 posxy128 = _mm_set_ps (pos_x_pix, pos_y_pix, pos_x_pix, pos_y_pix); int i = 0; for (i=0; i<scan->npoints; i++) { /* Consider only scan points representing obstacles */ if (scan->value[i] == OBSTACLE) { /* Compute coordinate pair using SSE */ __m128 xy128 = _mm_set_ps (scan->x_mm[i], scan->y_mm[i], scan->x_mm[i], scan->y_mm[i]); xy128 = _mm_mul_ps(sincos128, xy128); xy128 = _mm_hadd_ps(xy128, xy128); xy128 = _mm_add_ps(xy128, posxy128); cs_pos_mmx_t pos; pos.mmx = _mm_cvtps_pi32(xy128); /* Extract coordinates */ int x = pos.pos.x; int y = pos.pos.y; /* Empty the multimedia state to avoid floating-point errors later */ _mm_empty(); /* Add point if in map bounds */ if (x >= 0 && x < map->size_pixels && y >= 0 && y < map->size_pixels) { sum += map->pixels[y * map->size_pixels + x]; npoints++; } } } /* Return sum scaled by number of points, or -1 if none */ return npoints ? (int)(sum * 1024 / npoints) : -1; }
void M_MatrixTranslateX44_SSE(M_Matrix44 *M, M_Real x) { float xf = (float)x; M_Matrix44 T; T.m1 = _mm_set_ps(xf, 0.0f, 0.0f, 1.0f); T.m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f); T.m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); M_MatrixMult44v_SSE(M, &T); }
void M_MatrixTranslateY44_SSE(M_Matrix44 *M, M_Real y) { float yf = (float)y; M_Matrix44 T; T.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f); T.m2 = _mm_set_ps(yf, 0.0f, 1.0f, 0.0f); T.m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); M_MatrixMult44v_SSE(M, &T); }
static inline __m128 dt_Lab_to_XYZ_SSE(const __m128 Lab) { const __m128 d50 = _mm_set_ps(0.0f, 0.8249f, 1.0f, 0.9642f); const __m128 coef = _mm_set_ps(0.0f,-1.0f/200.0f,1.0f/116.0f,1.0f/500.0f); const __m128 offset = _mm_set1_ps(0.137931034f); // last component ins shuffle taken from 1st component of Lab to make sure it is not nan, so it will become 0.0f in f const __m128 f = _mm_mul_ps(_mm_shuffle_ps(Lab,Lab,_MM_SHUFFLE(0,2,0,1)),coef); return _mm_mul_ps(d50,lab_f_inv_m(_mm_add_ps(_mm_add_ps(f,_mm_shuffle_ps(f,f,_MM_SHUFFLE(1,1,3,1))),offset))); }
void M_MatrixTranslateZ44_SSE(M_Matrix44 *M, M_Real z) { float zf = (float)z; M_Matrix44 T; T.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, 1.0f); T.m2 = _mm_set_ps(0.0f, 0.0f, 1.0f, 0.0f); T.m3 = _mm_set_ps(zf, 1.0f, 0.0f, 0.0f); T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); M_MatrixMult44v_SSE(M, &T); }
void M_MatrixUniScale44_SSE(M_Matrix44 *M, M_Real r) { M_Matrix44 S; float f = (float)r; S.m1 = _mm_set_ps(0.0f, 0.0f, 0.0f, f); S.m2 = _mm_shuffle_ps(S.m1,S.m1,_MM_SHUFFLE(3,3,0,3)); S.m3 = _mm_shuffle_ps(S.m1,S.m1,_MM_SHUFFLE(3,0,3,3)); S.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); M_MatrixMult44v_SSE(M, &S); }
RTCRay4 Camera::GetRayPacket4(const int x, const int y) const { RTCRay4 result; auto& pos = m_Position; // assign origins __m128 positionsX = _mm_set1_ps(pos.x); __m128 positionsY = _mm_set1_ps(pos.y); __m128 positionsZ = _mm_set1_ps(pos.z); memcpy(result.orgx, &positionsX, sizeof(positionsX)); memcpy(result.orgy, &positionsY, sizeof(positionsY)); memcpy(result.orgz, &positionsZ, sizeof(positionsZ)); vec3 targets[4]; targets[0] = normalize((m_TopLeft + (m_TopRight - m_TopLeft) * ((x + 1) / (float)FRAME_WIDTH) + (m_DownLeft - m_TopLeft) * ((y + 1) / (float)FRAME_HEIGHT)) - m_Position); targets[1] = normalize((m_TopLeft + (m_TopRight - m_TopLeft) * ((x) / (float)FRAME_WIDTH) + (m_DownLeft - m_TopLeft) * ((y + 1) / (float)FRAME_HEIGHT)) - m_Position); targets[2] = normalize((m_TopLeft + (m_TopRight - m_TopLeft) * ((x + 1) / (float)FRAME_WIDTH) + (m_DownLeft - m_TopLeft) * ((y) / (float)FRAME_HEIGHT)) - m_Position); targets[3] = normalize((m_TopLeft + (m_TopRight - m_TopLeft) * ((x) / (float)FRAME_WIDTH) + (m_DownLeft - m_TopLeft) * ((y) / (float)FRAME_HEIGHT)) - m_Position); // assign directions __m128 directionsX = _mm_set_ps(targets[0].x, targets[1].x, targets[2].x, targets[3].x); __m128 directionsY = _mm_set_ps(targets[0].y, targets[1].y, targets[2].y, targets[3].y); __m128 directionsZ = _mm_set_ps(targets[0].z, targets[1].z, targets[2].z, targets[3].z); memcpy(result.dirx, &directionsX, sizeof(directionsX)); memcpy(result.diry, &directionsY, sizeof(directionsY)); memcpy(result.dirz, &directionsZ, sizeof(directionsZ)); memcpy(result.tfar, &SIMDConstants::infinity, sizeof(SIMDConstants::infinity)); // tfar = inf; memcpy(result.tnear, &SIMDConstants::zero, sizeof(SIMDConstants::zero)); // tnear = 0; memcpy(result.time, &SIMDConstants::zero, sizeof(SIMDConstants::zero)); // time = 0; __m128i geomIDs = _mm_set1_epi32(-1); __m128i primIDs = _mm_set1_epi32(0); memcpy(result.geomID, &geomIDs, sizeof(geomIDs)); // geomIDs = RTC_INVALID_GEOMETRY_ID memcpy(result.primID, &primIDs, sizeof(primIDs)); // geomIDs = RTC_INVALID_GEOMETRY_ID __m128i maskIDs = _mm_set1_epi32(-1); memcpy(result.mask, &maskIDs, sizeof(maskIDs)); // geom mask = -1 return result; }
test (float *v) { __m128 x; x = _mm_set_ps (0, 0, 0, v[0]); check (x, v, 0); x = _mm_set_ps (0, 0, v[1], 0); check (x, v, 1); x = _mm_set_ps (0, v[2], 0, 0); check (x, v, 2); x = _mm_set_ps (v[3], 0, 0, 0); check (x, v, 3); }
void M_MatrixRotate44K_SSE(M_Matrix44 *M, M_Real theta) { float s = sinf((float)theta); float c = cosf((float)theta); M_Matrix44 R; R.m1 = _mm_set_ps(0.0f, 0.0f, -s, c); R.m2 = _mm_set_ps(0.0f, 0.0f, c, s); R.m3 = _mm_set_ps(0.0f, 1.0f, 0.0f, 0.0f); R.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); M_MatrixMult44v_SSE(M, &R); }
void M_MatrixTranslate44_SSE(M_Matrix44 *M, M_Real x, M_Real y, M_Real z) { M_Matrix44 T; float xf = (float)x; float yf = (float)y; float zf = (float)z; T.m1 = _mm_set_ps(xf, 0.0f, 0.0f, 1.0f); T.m2 = _mm_set_ps(yf, 0.0f, 1.0f, 0.0f); T.m3 = _mm_set_ps(zf, 1.0f, 0.0f, 0.0f); T.m4 = _mm_set_ps(1.0f, 0.0f, 0.0f, 0.0f); M_MatrixMult44v_SSE(M, &T); }
void conv_filter_sse(int imgHeight, int imgWidth, int imgHeightF, int imgWidthF, int imgFOfssetH, int imgFOfssetW, float* filter, float *imgFloatSrc, float *imgFloatDst) { //1. const register __declspec(align(16)) auto const_0 = _mm_set_ps(0.0, 0.0, 0.0, 0.0); //2. const register __declspec(align(16)) auto const_255 = _mm_set_ps(255.0, 255.0, 255.0, 255.0); //3. __declspec(align(16)) __m128 filter_l[FILTER_SIZE]; #pragma omp parallel for for (auto i = 0; i < FILTER_SIZE; i++) { //mind a 4 floatba ugyanazt tölti // float -> m128 konverzió filter_l[i] = _mm_load_ps1(filter + i); } const auto rw_base = (imgFOfssetW + imgFOfssetH * imgWidthF) << 2; const auto imgWidthbyte = imgWidth << 2; const auto imgWidthFbyte = imgWidthF << 2; const auto imgLengthbyte = imgHeight * imgWidthbyte; //4. register __declspec(align(16)) __m128 a_sse; //8. reg register __declspec(align(16)) __m128 r_sse; #pragma omp parallel for for (auto row = 0; row < imgLengthbyte; row += 4) { // RGBA komponensek akkumulátora r_sse = _mm_setzero_ps(); // konvolúció minden komponensre for (auto y = 0; y < FILTER_H; y++ ) { r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (y * imgWidthFbyte)), filter_l[5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (4 + y * imgWidthFbyte)), filter_l[1 + 5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (8 + y * imgWidthFbyte)), filter_l[2 + 5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (12 + y * imgWidthFbyte)), filter_l[3 + 5 * y])); r_sse = _mm_add_ps(r_sse, _mm_mul_ps(_mm_load_ps(imgFloatSrc + row + (16 + y * imgWidthFbyte)), filter_l[4 + 5 * y])); } a_sse = _mm_load_ps(imgFloatSrc + row + 8 + 2 * imgWidthFbyte); //számítás eredményének limitálása 0-255 közé // kimenetí pixel írása _mm_store_ps(imgFloatDst + rw_base + row, _mm_min_ps(const_255, _mm_add_ps(a_sse, _mm_max_ps(const_0, _mm_sub_ps(a_sse, _mm_min_ps(const_255, _mm_max_ps(const_0, r_sse))))))); } }
static void TEST (void) { union128 s1, s2; int d[1]; int e[1]; s1.x = _mm_set_ps (24.43, 68.346, 43.35, 546.46); s2.x = _mm_set_ps (1.17, 2.16, 3.15, 4.14); d[0] = test (s1.x, s2.x); e[0] = s1.a[0] > s2.a[0]; if (checkVi (d, e, 1)) abort (); }