// Convert a Float32x4 to a Uint32x4 with range checking.
// Sets `throws` (caller raises a Range Error) when any lane is negative or
// too large to fit in a uint32; otherwise returns the truncated conversion.
SIMDValue SIMDUint32x4Operation::OpFromFloat32x4(const SIMDValue& value, bool& throws)
{
    X86SIMDValue x86Result = { 0 };
    X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value);
    X86SIMDValue temp, temp2;
    X86SIMDValue two_31_f4, two_31_i4;
    int mask = 0;

    // any lanes < 0 ?
    temp.m128_value = _mm_cmplt_ps(v.m128_value, X86_ALL_ZEROS.m128_value);
    mask = _mm_movemask_ps(temp.m128_value);
    // negative values are out of range, caller should throw Range Error
    if (mask)
    {
        throws = true;
        return X86SIMDValue::ToSIMDValue(x86Result);
    }

    // CVTTPS2DQ does a range check over signed range [-2^31, 2^31-1], so will fail to convert values >= 2^31.
    // To fix this, subtract 2^31 from values >= 2^31, do CVTTPS2DQ, then add 2^31 back.
    _mm_store_ps(two_31_f4.simdValue.f32, X86_TWO_31_F4.m128_value);
    // any lanes >= 2^31 ?
    temp.m128_value = _mm_cmpge_ps(v.m128_value, two_31_f4.m128_value);
    // two_31_f4 has f32(2^31) for lanes >= 2^31, 0 otherwise
    two_31_f4.m128_value = _mm_and_ps(two_31_f4.m128_value, temp.m128_value);
    // subtract 2^31 from lanes >= 2^31, unchanged otherwise.
    v.m128_value = _mm_sub_ps(v.m128_value, two_31_f4.m128_value);

    // CVTTPS2DQ (truncating float->int conversion; out-of-range lanes become 0x80000000)
    x86Result.m128i_value = _mm_cvttps_epi32(v.m128_value);

    // check if any value is out of range (i.e. >= 2^31, meaning originally >= 2^32 before value adjustment)
    temp2.m128i_value = _mm_cmpeq_epi32(x86Result.m128i_value, X86_NEG_MASK_F4.m128i_value); // any value == 0x80000000 ?
    mask = _mm_movemask_ps(temp2.m128_value);
    if (mask)
    {
        throws = true;
        return X86SIMDValue::ToSIMDValue(x86Result);
    }

    // we pass range check
    // add 2^31 values back to adjusted values.
    // Use first bit from the 2^31 float mask (0x4f000...0 << 1)
    // and result with 2^31 int mask (0x8000..0) setting first bit to zero if lane hasn't been adjusted
    _mm_store_ps(two_31_i4.simdValue.f32, X86_TWO_31_I4.m128_value);
    two_31_f4.m128i_value = _mm_slli_epi32(two_31_f4.m128i_value, 1);
    two_31_i4.m128i_value = _mm_and_si128(two_31_i4.m128i_value, two_31_f4.m128i_value);
    // add 2^31 back to adjusted values
    // Note at this point all values are in [0, 2^31-1]. Adding 2^31 is guaranteed not to overflow.
    x86Result.m128i_value = _mm_add_epi32(x86Result.m128i_value, two_31_i4.m128i_value);
    return X86SIMDValue::ToSIMDValue(x86Result);
}
/// Test whether this frustum intersects a given sphere in world space. /// /// @param[in] rSphere Sphere to test. /// /// @return True if the sphere intersects this frustum, false if not. bool Helium::Simd::Frustum::Intersects( const Sphere& rSphere ) const { Helium::Simd::Register sphereVec = rSphere.GetSimdVector(); Vector3Soa center( _mm_shuffle_ps( sphereVec, sphereVec, _MM_SHUFFLE( 0, 0, 0, 0 ) ), _mm_shuffle_ps( sphereVec, sphereVec, _MM_SHUFFLE( 1, 1, 1, 1 ) ), _mm_shuffle_ps( sphereVec, sphereVec, _MM_SHUFFLE( 2, 2, 2, 2 ) ) ); Helium::Simd::Register radius = _mm_shuffle_ps( sphereVec, sphereVec, _MM_SHUFFLE( 3, 3, 3, 3 ) ); Helium::Simd::Register zeroVec = Helium::Simd::LoadZeros(); PlaneSoa planes; for( size_t basePlaneIndex = 0; basePlaneIndex < PLANE_ARRAY_SIZE; basePlaneIndex += 4 ) { planes.Load( m_planeA + basePlaneIndex, m_planeB + basePlaneIndex, m_planeC + basePlaneIndex, m_planeD + basePlaneIndex ); Helium::Simd::Register distances = Helium::Simd::AddF32( planes.GetDistance( center ), radius ); int resultMask = _mm_movemask_ps( Helium::Simd::GreaterEqualsF32( distances, zeroVec ) ); if( resultMask != 0xf ) { return false; } } return true; }
/* Rewritten member function using SSE instructions.
 * Computes, for four points c = (cx_m, cy_m) at once, the number of
 * Mandelbrot iterations (z <- z^2 + c starting from z = 0) for which
 * |z|^2 stays below 4, capped at MAX_ITS. The per-lane counts are
 * accumulated as floats and returned in an __m128. */
__m128 member_speed(__m128 cx_m , __m128 cy_m)
{
    __m128 x = _mm_set1_ps(0.0f);           /* Re(z), starts at 0 */
    __m128 y = _mm_set1_ps(0.0f);           /* Im(z), starts at 0 */
    __m128 four_iter = _mm_set1_ps(0.0);    /* per-lane iteration counts */
    __m128 temp_mask = _mm_set1_ps(0.0);    /* lanes still below the escape radius */
    __m128 mask = _mm_set1_ps(1.0);         /* per-iteration increment (1.0 per lane) */
    __m128 two_squared = _mm_set1_ps(4.0f); /* escape threshold: |z|^2 >= 4 */
    __m128 two = _mm_set1_ps(2.0f);
    __m128 x_sqr, y_sqr;
    x_sqr = _mm_mul_ps(x, x);
    y_sqr = _mm_mul_ps(y, y);

    // little bit of a hack to deal with individual iterations
    int iterations = 0;
    // Loop while at least one lane is still bounded and the global cap
    // has not been reached; temp_mask is refreshed in the condition.
    while ( (_mm_movemask_ps( temp_mask = _mm_cmplt_ps(_mm_add_ps(x_sqr, y_sqr),two_squared)) != 0)
            && (iterations < MAX_ITS) ){
        /* z <- z^2 + c, computed on all lanes (escaped lanes just stop counting). */
        __m128 xtemp = _mm_add_ps(_mm_sub_ps(x_sqr, y_sqr), cx_m);
        y = _mm_add_ps(_mm_mul_ps(two, _mm_mul_ps(x, y)), cy_m);
        x = xtemp;
        x_sqr = _mm_mul_ps(x, x);
        y_sqr = _mm_mul_ps(y, y);
        iterations ++;
        /* Count this iteration only for lanes that were still active. */
        four_iter = _mm_add_ps(four_iter, _mm_and_ps(temp_mask, mask));
    }
    //This returns a m128 with the four iterations!
    return four_iter;
}
/* Trace a packet of 4 rays into the current scene and shade the hits.
 *
 * ray        - packet of four rays.
 * colors     - output colors, one per ray; preset to the background color.
 * srcprim_id - primitive ids the rays originated from (passed to shading).
 * depth      - current recursion depth.
 * fdepth     - per-ray accumulated attenuation, tested against adc_bailout.
 */
void bp_ray_trace_packet (const ray4_t *ray, vector_t *colors, simd4i_t srcprim_id, int depth, simd4_t fdepth)
{
    unsigned int activeMask;
    intersect4_t isect4;

    /* Default every ray's color to the background. */
    ASSIGN (colors [0], background);
    ASSIGN (colors [1], background);
    ASSIGN (colors [2], background);
    ASSIGN (colors [3], background);

    /* Terminate recursion when the trace depth is exceeded, or when all four
     * lanes' contribution has fallen below the adc_bailout threshold.
     * NOTE(review): bitwise `|` (not `||`) is used here, so both sides are
     * always evaluated — presumably intentional to avoid a branch. */
    if ((depth > curr_scene->settings.max_trace_level) |
        (simd4_extract_sign (simd4_float_lt (fdepth, simd4_from_float (curr_scene->settings.adc_bailout))) == 0xf))
    {
        return;
    }

    /* prim_id of -1 marks "no intersection" per lane. */
    isect4.prim_id = simd4i_minus_ones;
    bp_kd_tree_packet_find_nearest (curr_scene->kd_tree_root, ray, &isect4);

    activeMask = _mm_movemask_ps (simd4_float_eq (_mm_cvtepi32_ps (isect4.prim_id), simd4f_minus_ones));

    /* If there was no intersection terminate early */
    if (activeMask == 0xF)
        return;

    bp_shade_packet (curr_scene, &isect4, ray, colors, depth, fdepth, srcprim_id);
}
/// Test whether this frustum fully contains a given point in world space. /// /// @param[in] rPoint Point to test. /// /// @return True if the point is within this frustum, false if not. bool Helium::Simd::Frustum::Contains( const Vector3& rPoint ) const { // Test the point against each plane set. Vector3Soa pointSplat( rPoint ); Helium::Simd::Register zeroVec = Helium::Simd::LoadZeros(); PlaneSoa planes; for( size_t basePlaneIndex = 0; basePlaneIndex < PLANE_ARRAY_SIZE; basePlaneIndex += 4 ) { planes.Load( m_planeA + basePlaneIndex, m_planeB + basePlaneIndex, m_planeC + basePlaneIndex, m_planeD + basePlaneIndex ); Helium::Simd::Register distances = planes.GetDistance( pointSplat ); int resultMask = _mm_movemask_ps( Helium::Simd::GreaterEqualsF32( distances, zeroVec ) ); if( resultMask != 0xf ) { return false; } } return true; }
/* Search a linearized k-ary (SIMD) tree of sorted delimiters for `value`.
 *
 * tree   - one contiguous array of delimiters per level.
 * fanout - fanout (number of children) per level; delimiters per node are
 *          fanout-1 and are scanned four at a time with SSE.
 * levels - number of levels to descend.
 * value  - search key.
 *
 * Returns the final offset computed on the last level.
 */
int searchSIMDTree(int32_t **tree, int *fanout, int levels, int32_t value)
{
    int iLevel = 0;
    int lOffset = 0;
    int pOffset = 0;
    int32_t cmpmask = 0;
    int32_t eqmask = 0;

    /* Broadcast the key to all four lanes. */
    __m128i key = _mm_cvtsi32_si128(value);
    key = _mm_shuffle_epi32(key, _MM_SHUFFLE(0,0,0,0));

    while (iLevel < levels) {
        int f = fanout[iLevel];
        pOffset = lOffset;
        /* A node at this level holds f-1 delimiters. */
        lOffset *= f - 1;
        int iter = 0;
        int position = 0;
        /* Compare four delimiters at a time until one is >= key. */
        while (iter < f/4) {
            __m128i delimiters = _mm_load_si128((__m128i const*)&tree[iLevel][lOffset + iter*4]);
            __m128i compare = _mm_cmpgt_epi32(key, delimiters);
            cmpmask = _mm_movemask_ps(_mm_castsi128_ps(compare));
            /* Flip so set bits mark lanes where key <= delimiter. */
            cmpmask ^= 0x0F;
            if (cmpmask) {
                /* First lane with key <= delimiter gives the child slot. */
                position = _bit_scan_forward(cmpmask);
                break;
            }
            iter++;
        }
        /* If no delimiter was >= key, position stays 0 and iter == f/4,
         * selecting the rightmost child. */
        int offset = lOffset + iter*4 + position;
        lOffset = offset + pOffset;
        iLevel++;
    }
    return lOffset;
}
/* Render the Mandelbrot set with SSE2, four pixels per iteration.
 *
 * image - output buffer, 3 bytes (RGB) per pixel, row-major, s->width*s->height.
 * s     - render spec: view limits (xlim/ylim), size, iteration cap, depth.
 *
 * Each lane tracks one pixel; mk counts iterations per lane, which is then
 * normalized, sqrt-shaped, scaled to the depth range, and written as a gray
 * value (same byte into R, G and B). Rows are parallelized with OpenMP.
 */
void mandel_sse2(unsigned char *image, const struct spec *s)
{
    __m128 xmin = _mm_set_ps1(s->xlim[0]);
    __m128 ymin = _mm_set_ps1(s->ylim[0]);
    __m128 xscale = _mm_set_ps1((s->xlim[1] - s->xlim[0]) / s->width);
    __m128 yscale = _mm_set_ps1((s->ylim[1] - s->ylim[0]) / s->height);
    __m128 threshold = _mm_set_ps1(4);   /* escape radius squared */
    __m128 one = _mm_set_ps1(1);
    __m128 iter_scale = _mm_set_ps1(1.0f / s->iterations);
    __m128 depth_scale = _mm_set_ps1(s->depth - 1);
#pragma omp parallel for schedule(dynamic, 1)
    for (int y = 0; y < s->height; y++) {
        for (int x = 0; x < s->width; x += 4) {
            /* Four consecutive pixels of this row, mapped into the complex plane. */
            __m128 mx = _mm_set_ps(x + 3, x + 2, x + 1, x + 0);
            __m128 my = _mm_set_ps1(y);
            __m128 cr = _mm_add_ps(_mm_mul_ps(mx, xscale), xmin);
            __m128 ci = _mm_add_ps(_mm_mul_ps(my, yscale), ymin);
            __m128 zr = cr;
            __m128 zi = ci;
            int k = 1;
            __m128 mk = _mm_set_ps1(k);
            while (++k < s->iterations) {
                /* Compute z1 from z0 */
                __m128 zr2 = _mm_mul_ps(zr, zr);
                __m128 zi2 = _mm_mul_ps(zi, zi);
                __m128 zrzi = _mm_mul_ps(zr, zi);
                /* zr1 = zr0 * zr0 - zi0 * zi0 + cr */
                /* zi1 = zr0 * zi0 + zr0 * zi0 + ci */
                zr = _mm_add_ps(_mm_sub_ps(zr2, zi2), cr);
                zi = _mm_add_ps(_mm_add_ps(zrzi, zrzi), ci);

                /* Increment k for lanes still inside the threshold */
                zr2 = _mm_mul_ps(zr, zr);
                zi2 = _mm_mul_ps(zi, zi);
                __m128 mag2 = _mm_add_ps(zr2, zi2);
                __m128 mask = _mm_cmplt_ps(mag2, threshold);
                mk = _mm_add_ps(_mm_and_ps(mask, one), mk);

                /* Early bailout? (all four lanes escaped) */
                if (_mm_movemask_ps(mask) == 0)
                    break;
            }
            /* Map iteration counts to a 0..depth-1 gray ramp (sqrt shaping). */
            mk = _mm_mul_ps(mk, iter_scale);
            mk = _mm_sqrt_ps(mk);
            mk = _mm_mul_ps(mk, depth_scale);
            __m128i pixels = _mm_cvtps_epi32(mk);
            unsigned char *dst = image + y * s->width * 3 + x * 3;
            unsigned char *src = (unsigned char *)&pixels;
            /* Low byte of each 32-bit lane becomes the gray value of one pixel. */
            for (int i = 0; i < 4; i++) {
                dst[i * 3 + 0] = src[i * 4];
                dst[i * 3 + 1] = src[i * 4];
                dst[i * 3 + 2] = src[i * 4];
            }
        }
    }
}
/* Return the index of the first element of v in [s, e) that is non-zero,
 * or -1 when every element in the range is exactly 0.0f.
 *
 * SSE2 build: a scalar prologue advances to a 4-float boundary (the vector
 * loads use the aligned _mm_load_ps, so v itself must be 16-byte aligned),
 * the middle runs 4-wide compares, and a scalar tail finishes the range. */
int64_t nv_float_nonzero_index(const float *v, int64_t s, int64_t e)
{
#if NV_ENABLE_SSE2
    {
        int64_t idx = s;
        int64_t bound;

        /* Scalar prologue: step until idx is a multiple of 4 (or range ends). */
        if (idx & 0x3) {
            bound = (idx & 0xfffffffffffffffcLL) + 4;
            for (; idx < bound && idx < e; ++idx) {
                if (v[idx] != 0.0f) {
                    return idx;
                }
            }
        }

        {
            const __m128 vzero = _mm_setzero_ps();
            bound = (e & 0xfffffffffffffffcLL);
            for (; idx < bound; idx += 4) {
                /* One bit per lane that differs from zero. */
                int hits = _mm_movemask_ps(_mm_cmpneq_ps(vzero, _mm_load_ps(&v[idx])));
                if (hits != 0) {
                    /* Lowest set bit identifies the first non-zero lane. */
                    int lane = 0;
                    while ((hits & (1 << lane)) == 0) {
                        ++lane;
                    }
                    return idx + lane;
                }
            }
        }

        /* Scalar tail for the final partial vector. */
        for (; idx < e; ++idx) {
            if (v[idx] != 0.0f) {
                return idx;
            }
        }
        return -1;
    }
#else
    {
        int64_t idx;
        for (idx = s; idx < e; ++idx) {
            if (v[idx] != 0.0f) {
                return idx;
            }
        }
        return -1;
    }
#endif
}
/* Return the index of the first element of v in [s, e) that compares equal
 * to `key` (bitwise float equality via ==), or -1 when no element matches.
 *
 * SSE2 build mirrors nv_float_nonzero_index: scalar prologue up to a 4-float
 * boundary (vector loads are the aligned _mm_load_ps), 4-wide compares in the
 * middle, scalar tail at the end. */
int64_t nv_float_find_index(const float *v, int64_t s, int64_t e, float key)
{
#if NV_ENABLE_SSE2
    {
        int64_t idx = s;
        int64_t bound;

        /* Scalar prologue: step until idx is a multiple of 4 (or range ends). */
        if (idx & 0x3) {
            bound = (idx & 0xfffffffffffffffcLL) + 4;
            for (; idx < bound && idx < e; ++idx) {
                if (v[idx] == key) {
                    return idx;
                }
            }
        }

        {
            const __m128 vkey = _mm_set1_ps(key);
            bound = (e & 0xfffffffffffffffcLL);
            for (; idx < bound; idx += 4) {
                /* One bit per lane equal to the key. */
                int hits = _mm_movemask_ps(_mm_cmpeq_ps(vkey, _mm_load_ps(&v[idx])));
                if (hits != 0) {
                    /* Lowest set bit identifies the first matching lane. */
                    int lane = 0;
                    while ((hits & (1 << lane)) == 0) {
                        ++lane;
                    }
                    return idx + lane;
                }
            }
        }

        /* Scalar tail for the final partial vector. */
        for (; idx < e; ++idx) {
            if (v[idx] == key) {
                return idx;
            }
        }
        return -1;
    }
#else
    {
        int64_t idx;
        for (idx = s; idx < e; ++idx) {
            if (v[idx] == key) {
                return idx;
            }
        }
        return -1;
    }
#endif
}
// Get SignMask int SIMDInt32x4Operation::OpGetSignMask(const SIMDValue& value) { X86SIMDValue v = X86SIMDValue::ToX86SIMDValue(value); // Creates a 4-bit mask from the most significant bits of // the 4 single-precision, floating-point values // SIMD review: no suitable integer intrinsics, the float version seems working fine return _mm_movemask_ps(v.m128_value); }
// Traverse this tree with a packet of `size` rays, either as one coherent
// packet or — when the rays are too incoherent — by recursively splitting
// into four quarter-size sub-packets.
void Tree<ElementContainer>::TraversePacket(Context<size,flags> &c,const Selector<size> &selector) const
{
    bool split=1;
    // Rays that are neither primary nor shadow are treated as reflected and
    // get a much smaller coherent-packet size budget.
    enum { reflected=!(flags&(isct::fPrimary|isct::fShadow)) };

    // A packet qualifies for coherent traversal only if its selector is full
    // (all rays active) on every group of 4.
    bool selectorsFiltered=size<=(reflected?4:isComplex?64:16);
    if(!Selector<size>::full)
        for(int n=0;n<size/4;n++)
            if(selector.Mask4(n)!=0x0f0f0f0f) { selectorsFiltered=0; break; }

    if((Selector<size>::full||selectorsFiltered) && size <= (reflected?4 : isComplex? 64 : 16)) {
        const Vec3q &dir=c.Dir(0);

        // Sign mask of ray 0's direction (x, y, z sign bits packed into 3 bits);
        // all rays must share it for coherent traversal when filterSigns is set.
        bool signsFiltered=1;
        int msk=_mm_movemask_ps(_mm_shuffle_ps(_mm_shuffle_ps(dir.x.m,dir.y.m,0),dir.z.m,0+(2<<2)))&7;
        if(filterSigns) {
            for(int n=0;n<size;n++)
                if(GetVecSign(c.Dir(n))!=msk) { signsFiltered=0; break; }
        }

        if(signsFiltered) {
            bool primary = (flags & (isct::fPrimary|isct::fShadow)) && gVals[1];

            // Shadow packets only take the primary path when their directions
            // are tightly clustered (pairwise dot products near 1).
            if((flags & isct::fShadow) &&!isComplex) {
                floatq dot=1.0f;
                for(int q=1;q<size;q++) dot=Min(dot,c.Dir(0)|c.Dir(q));
                if(ForAny(dot<0.9998f)) primary=0;
            }

            if(separateFirstElement) elements[0].Collide(c,0);

            if(primary) TraversePrimary(c);
            else TraversePacket0(c);
            // if(primary && (flags & isct::fShadow)) c.stats.Skip();
            split=0;
        }
    }

    if(split) {
        // Incoherent packet: recurse on four quarter-size sub-packets,
        // threading the shadow cache through each sub-traversal in order.
        for(int q=0;q<4;q++) {
            Context<size/4,flags> subC(c.Split(q));
            if(flags & isct::fShadow) subC.shadowCache=c.shadowCache;
            TraversePacket(subC,selector.SubSelector(q));
            if(flags & isct::fShadow) c.shadowCache=subC.shadowCache;
        }
    }
}
/** @overload
 * Ceiling of a float: smallest integer not less than value.
 * SSE path: convert with the current (round-to-nearest) mode, then bump the
 * result by one when the rounded integer is still below the input — the
 * movemask of the single-lane compare yields exactly that 0/1 correction. */
CV_INLINE int cvCeil( float value )
{
#if (defined _MSC_VER && defined _M_X64 || (defined __GNUC__ && defined __SSE2__&& !defined __APPLE__)) && !defined(__CUDACC__)
    __m128 t = _mm_set_ss( value );
    int i = _mm_cvtss_si32(t);
    /* bit 0 of the mask is 1 iff (float)i < value, i.e. we rounded down */
    return i + _mm_movemask_ps(_mm_cmplt_ss(_mm_cvtsi32_ss(t,i), t));
#elif defined __GNUC__
    /* Truncate toward zero, then correct upward when truncation went below. */
    int i = (int)value;
    return i + (i < value);
#else
    /* Portable fallback built on cvRound. */
    int i = cvRound(value);
    float diff = (float)(i - value);
    return i + (diff < 0);
#endif
}
// Operators INLINE bool SVec4::operator==(const SVec4 &rhs) const { #ifdef USE_SSE SIMDvec dif = _mm_sub_ps( m_128, rhs.m_128 ); SIMDvec ep = _mm_set1_ps( math::Epsilon ); SIMDvec neg_ep = _mm_set1_ps( -math::Epsilon ); return ( 0xf == _mm_movemask_ps( _mm_and_ps( _mm_cmpgt_ps( ep, dif ), _mm_cmplt_ps( neg_ep, dif ) ) ) ); #else return math::IsEqual(m_x, rhs.X()) && math::IsEqual(m_y, rhs.Y()) && math::IsEqual(m_z, rhs.Z()) && math::IsEqual(m_w, rhs.W()); #endif }
// Plane-set overlap test between an AABB and a froxel (SIMD over 4 froxels
// per lane, judging by the simd* types — TODO confirm lane meaning).
//
// For each plane, d is the projection of the box center on the plane normal
// and r the box's projected half-extent; the box is on the inside when
// d + r >= -plane.w (the standard center/extent plane test). `test` stays
// true per lane only if every plane passes; the movemask packs the per-lane
// results into the low bits of the returned int.
__forceinline int _aabbOverlapsFroxel(simdvec3 aabb_center, simdvec3 aabb_extent, const simdvec3* frustum_planes_xyz, const simdfloat* frustum_planes_w, int num_planes)
{
    simdbool test = true;
    for (int i_plane = 0; i_plane < num_planes; ++i_plane)
    {
        simdvec3 plane_normal = frustum_planes_xyz[i_plane];
        simdfloat d = dot(aabb_center, plane_normal);
        simdfloat r = dot(aabb_extent, abs(plane_normal));
        simdbool is_inside = simdbool((d + r) >= -frustum_planes_w[i_plane]);
        test &= is_inside;
    }
    return _mm_movemask_ps(test.val);
}
// Test four corner points against one plane in a single pass.
//
// Each _mm_dp_ps computes the 3-component dot product of one corner with the
// plane normal (selector 0x70 multiplies lanes x/y/z) and routes the scalar
// result into a distinct destination lane (low nibble 0x1/0x2/0x4/0x8), so
// summing the four results yields a vector whose lane i holds dot(corner_i, n).
// Subtracting the plane's dot term and testing for < 0 marks the corners on
// the negative side; the movemask packs those four sign bits into an int.
__forceinline int _overlap(__m128 sse_plane_normal, __m128 sse_dot_plane, __m128* corner_a, __m128* corner_b, __m128* corner_c, __m128* corner_d)
{
    const __m128 lane_dots = _mm_add_ps(
        _mm_add_ps(_mm_dp_ps(*corner_a, sse_plane_normal, 0x70 | 0x1),
                   _mm_dp_ps(*corner_b, sse_plane_normal, 0x70 | 0x2)),
        _mm_add_ps(_mm_dp_ps(*corner_c, sse_plane_normal, 0x70 | 0x4),
                   _mm_dp_ps(*corner_d, sse_plane_normal, 0x70 | 0x8)));

    const __m128 behind = _mm_cmplt_ps(_mm_sub_ps(lane_dots, sse_dot_plane), _mm_setzero_ps());
    return _mm_movemask_ps(behind);
}
/// Test whether this frustum intersects a given axis-aligned bounding box in world space. /// /// @param[in] rBox Box to test. /// /// @return True if the box intersects this frustum, false if not. bool Helium::Simd::Frustum::Intersects( const AaBox& rBox ) const { Helium::Simd::Register boxMinVec = rBox.GetMinimum().GetSimdVector(); Helium::Simd::Register boxMaxVec = rBox.GetMaximum().GetSimdVector(); Helium::Simd::Register boxX0 = _mm_shuffle_ps( boxMinVec, boxMinVec, _MM_SHUFFLE( 0, 0, 0, 0 ) ); Helium::Simd::Register boxX1 = _mm_shuffle_ps( boxMaxVec, boxMaxVec, _MM_SHUFFLE( 0, 0, 0, 0 ) ); Helium::Simd::Register boxY = _mm_shuffle_ps( boxMinVec, boxMaxVec, _MM_SHUFFLE( 1, 1, 1, 1 ) ); Helium::Simd::Register boxZ = _mm_unpackhi_ps( boxMinVec, boxMaxVec ); boxZ = _mm_movelh_ps( boxZ, boxZ ); PlaneSoa plane; Vector3Soa points( boxX0, boxY, boxZ ); Helium::Simd::Register zeroVec = Helium::Simd::LoadZeros(); size_t planeCount = ( m_bInfiniteFarClip ? PLANE_FAR : PLANE_MAX ); for( size_t planeIndex = 0; planeIndex < planeCount; ++planeIndex ) { plane.Load1Splat( m_planeA + planeIndex, m_planeB + planeIndex, m_planeC + planeIndex, m_planeD + planeIndex ); points.m_x = boxX0; Helium::Simd::Mask containsPoints0 = Helium::Simd::GreaterEqualsF32( plane.GetDistance( points ), zeroVec ); points.m_x = boxX1; Helium::Simd::Mask containsPoints1 = Helium::Simd::GreaterEqualsF32( plane.GetDistance( points ), zeroVec ); int resultMask = _mm_movemask_ps( Helium::Simd::Or( containsPoints0, containsPoints1 ) ); if( resultMask == 0 ) { return false; } } return true; }
/* Compare row j1 of vec1 with row j2 of vec2 element-by-element.
 * Returns 1 when all n elements are equal (C `!=` semantics, so two NaNs
 * compare unequal), 0 otherwise. Rows must have the same length.
 * SSE2 path assumes the row storage is 16-byte aligned (_mm_load_ps and the
 * direct __m128 cast on NV_MAT_V). */
int nv_vector_eq(const nv_matrix_t *vec1, int j1, const nv_matrix_t *vec2, int j2)
{
    NV_ASSERT(vec1->n == vec2->n);
#if NV_ENABLE_SSE2
    {
        __m128 xmm;
        int i = 0;
        int eq;
        /* Largest multiple of 4 <= n: the 4-wide main loop bound. */
        int pk_lp = (vec1->n & 0xfffffffc);

        for (i = 0; i < pk_lp; i += 4) {
            xmm = _mm_load_ps(&NV_MAT_V(vec2, j2, i));
            xmm = _mm_cmpneq_ps(xmm, *(const __m128 *)&NV_MAT_V(vec1, j1, i));
            eq = _mm_movemask_ps(xmm);
            /* Any set bit means some lane differed. */
            if (eq != 0) {
                return 0;
            }
        }
        /* Scalar tail for the last n % 4 elements. */
        for (i = pk_lp; i < vec1->n; ++i) {
            if (NV_MAT_V(vec1, j1, i) != NV_MAT_V(vec2, j2, i)) {
                return 0;
            }
        }
        return 1;
    }
#else
    {
        int i;
        for (i = 0; i < vec1->n; ++i) {
            if (NV_MAT_V(vec1, j1, i) != NV_MAT_V(vec2, j2, i)) {
                return 0;
            }
        }
        return 1;
    }
#endif
}
/* Per-lane Mandelbrot escape-iteration counter (SSE, 4 points at once).
 * cx_m / cy_m carry the real / imaginary parts of four c values; the return
 * vector holds, as floats, how many iterations |z|^2 stayed below 4 for each
 * lane, with z <- z^2 + c starting from z = 0 and capped at MAX_ITS. */
__m128 member_speed(__m128 cx_m , __m128 cy_m)
{
    __m128 re     = _mm_set1_ps(0.0f);  /* Re(z) */
    __m128 im     = _mm_set1_ps(0.0f);  /* Im(z) */
    __m128 counts = _mm_set1_ps(0.0);   /* per-lane iteration tallies */
    __m128 active = _mm_set1_ps(0.0);   /* lanes still below escape radius */
    __m128 ones   = _mm_set1_ps(1.0);   /* per-iteration increment */
    __m128 limit  = _mm_set1_ps(4.0f);  /* escape threshold on |z|^2 */
    __m128 twoV   = _mm_set1_ps(2.0f);
    __m128 re2 = _mm_mul_ps(re, re);
    __m128 im2 = _mm_mul_ps(im, im);

    for (int step = 0; step < MAX_ITS; ++step) {
        /* Which lanes are still bounded? Stop once none are. */
        active = _mm_cmplt_ps(_mm_add_ps(re2, im2), limit);
        if (_mm_movemask_ps(active) == 0) {
            break;
        }

        /* z <- z^2 + c on every lane; escaped lanes simply stop counting. */
        __m128 newRe = _mm_add_ps(_mm_sub_ps(re2, im2), cx_m);
        im = _mm_add_ps(_mm_mul_ps(twoV, _mm_mul_ps(re, im)), cy_m);
        re = newRe;
        re2 = _mm_mul_ps(re, re);
        im2 = _mm_mul_ps(im, im);

        /* Credit this iteration only to lanes that were still active. */
        counts = _mm_add_ps(counts, _mm_and_ps(active, ones));
    }

    return counts;
}
/* Evaluate the current homography H against all N correspondences using the
 * SPRT (Sequential Probability Ratio Test).
 *
 * Each source point is backprojected through H; a correspondence is an inlier
 * when its squared reprojection distance is <= maxD^2. The likelihood ratio
 * `lambda` is updated per point (accept/reject factors from delta/epsilon);
 * evaluation stops early (p->good = 0) once lambda exceeds the threshold A.
 * Outputs: p->inl (inlier count), p->N_tested, p->good.
 *
 * The vector loop handles 4 correspondences per pass using a precomputed
 * 16-entry lambda table indexed by the 4-bit inlier mask; a scalar loop
 * finishes the remainder. */
static inline void sacEvaluateModelSPRT(PROSAC_HEST* p){
    unsigned i;
    unsigned isInlier;
    double lambda = 1.0;
    double lambdaReject = ((1.0 - p->delta) / (1.0 - p->epsilon));
    double lambdaAccept = (( p->delta ) / ( p->epsilon ));
    float distSq = p->maxD*p->maxD;
    float* src = (float*)p->src;
    float* dst = (float*)p->dst;
    float* H = p->H;

    p->inl = 0;
    p->N_tested = 0;
    p->good = 1;

    /* VECTOR */
    const __m128 distSqV=_mm_set1_ps(distSq);

    /* Splat the homography entries (row-major, 4 floats per row; H[3]/H[7]/H[11] unused padding). */
    const __m128 H00=_mm_set1_ps(H[0]);
    const __m128 H01=_mm_set1_ps(H[1]);
    const __m128 H02=_mm_set1_ps(H[2]);
    const __m128 H10=_mm_set1_ps(H[4]);
    const __m128 H11=_mm_set1_ps(H[5]);
    const __m128 H12=_mm_set1_ps(H[6]);
    const __m128 H20=_mm_set1_ps(H[8]);
    const __m128 H21=_mm_set1_ps(H[9]);
    const __m128 H22=_mm_set1_ps(H[10]);

    for(i=0;i<(p->N-3) && p->good;i+=4){
        /* Backproject */
        __m128 x, y, X, Y, inter0, inter1, inter2, inter3;

        /* Points are stored interleaved (x0 y0 x1 y1 ...); load 4 pairs and
         * de-interleave into x/y (source) and X/Y (destination) vectors. */
        x=_mm_load_ps(src+2*i);
        y=_mm_load_ps(src+2*i+4);
        X=_mm_load_ps(dst+2*i);
        Y=_mm_load_ps(dst+2*i+4);

        inter0=_mm_unpacklo_ps(x,y);// y1 y0 x1 x0
        inter1=_mm_unpackhi_ps(x,y);// y3 y2 x3 x2
        inter2=_mm_unpacklo_ps(X,Y);// Y1 Y0 X1 X0
        inter3=_mm_unpackhi_ps(X,Y);// Y3 Y2 X3 X2

        x=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
        y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter0), _mm_castps_pd(inter1)));
        X=_mm_castpd_ps(_mm_unpacklo_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));
        Y=_mm_castpd_ps(_mm_unpackhi_pd(_mm_castps_pd(inter2), _mm_castps_pd(inter3)));

        /* Homogeneous transform of (x, y, 1) by H. */
        __m128 reprojX = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H00, x), _mm_mul_ps(H01, y)), H02);
        __m128 reprojY = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H10, x), _mm_mul_ps(H11, y)), H12);
        __m128 reprojZ = _mm_add_ps(_mm_add_ps(_mm_mul_ps(H20, x), _mm_mul_ps(H21, y)), H22);

        /* Approximate 1/Z via RCPPS (faster than a true divide; ~12-bit precision). */
        __m128 recipZ = _mm_rcp_ps(reprojZ);
        reprojX = _mm_mul_ps(reprojX, recipZ);
        reprojY = _mm_mul_ps(reprojY, recipZ);
        //reprojX = _mm_div_ps(reprojX, reprojZ);
        //reprojY = _mm_div_ps(reprojY, reprojZ);

        reprojX = _mm_sub_ps(reprojX, X);
        reprojY = _mm_sub_ps(reprojY, Y);

        reprojX = _mm_mul_ps(reprojX, reprojX);
        reprojY = _mm_mul_ps(reprojY, reprojY);

        __m128 reprojDistV = _mm_add_ps(reprojX, reprojY);

        __m128 cmp = _mm_cmple_ps(reprojDistV, distSqV);
        int msk = _mm_movemask_ps(cmp);

        /* Count inliers via a popcount lookup on the 4-bit mask. */
        /* 0 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15*/
        unsigned bitCnt[] = {0, 1, 1, 2, 1, 2, 2, 3, 1, 2, 2, 3, 2, 3, 3, 4};
        p->inl += bitCnt[msk];

        /* SPRT: one table lookup updates lambda for all four points. */
        lambda *= p->lambdaTBL[msk];
        p->good = lambda <= p->A;
        /* If !p->good, the threshold A was exceeded, so we're rejecting */
    }

    /* SCALAR tail: same test, one correspondence at a time. */
    for(;i<p->N && p->good;i++){
        /* Backproject */
        float x=src[i*2],y=src[i*2+1];
        float X=dst[i*2],Y=dst[i*2+1];

        float reprojX=H[0]*x+H[1]*y+H[2]; //  ( X_1 )     ( H_11 H_12    H_13  ) (x_1)
        float reprojY=H[4]*x+H[5]*y+H[6]; //  ( X_2 )  =  ( H_21 H_22    H_23  ) (x_2)
        float reprojZ=H[8]*x+H[9]*y+H[10];//  ( X_3 )     ( H_31 H_32 H_33=1.0 ) (x_3 = 1.0)

        //reproj is in homogeneous coordinates. To bring back to "regular" coordinates, divide by Z.
        reprojX/=reprojZ;
        reprojY/=reprojZ;

        //Compute distance
        reprojX-=X;
        reprojY-=Y;
        reprojX*=reprojX;
        reprojY*=reprojY;
        float reprojDist = reprojX+reprojY;

        /* SPRT update for a single point. */
        isInlier = reprojDist <= distSq;
        p->inl += isInlier;
        lambda *= isInlier ? lambdaAccept : lambdaReject;
        p->good = lambda <= p->A;
        /* If !p->good, the threshold A was exceeded, so we're rejecting */
    }

    p->N_tested = i;
}
/* Check whether the 4-correspondence sample selected in p->smpl is degenerate.
 *
 * Returns 1 (degenerate) when any two of the four source points coincide, or
 * when the sample fails the strong geometric constraint (the source and
 * destination quadrilaterals must agree on which side of each diagonal the
 * off-diagonal points lie). Returns 0 otherwise, in which case the four
 * src/dst point pairs are packed into p->pkdPts for later evaluation:
 *   pkdPts[0:7]  = {srcx0, srcy0, srcx1, srcy1, srcx2, srcy2, srcx3, srcy3}
 *   pkdPts[8:15] = {dstx0, dsty0, dstx1, dsty1, dstx2, dsty2, dstx3, dsty3}
 */
static inline int sacIsSampleDegenerate(PROSAC_HEST* p){
    unsigned i0 = p->smpl[0], i1 = p->smpl[1], i2 = p->smpl[2], i3 = p->smpl[3];

    /* Gather the 4 (x, y) pairs, two per register.
     *
     * BUGFIX: the original code seeded each _mm_loadl_pi() with the very
     * variable being defined (`__m128 src10 = _mm_loadl_pi(src10, ...)`),
     * reading an uninitialized value — undefined behavior and a compiler
     * warning. Seed from a zero register instead; both halves are
     * overwritten by the loadl/loadh pair, so the result is unchanged. */
    __m128 seed  = _mm_setzero_ps();
    __m128 src10 = _mm_loadl_pi(seed,  (__m64*)&p->src[i0]);
    src10        = _mm_loadh_pi(src10, (__m64*)&p->src[i1]);
    __m128 src32 = _mm_loadl_pi(seed,  (__m64*)&p->src[i2]);
    src32        = _mm_loadh_pi(src32, (__m64*)&p->src[i3]);
    __m128 dst10 = _mm_loadl_pi(seed,  (__m64*)&p->dst[i0]);
    dst10        = _mm_loadh_pi(dst10, (__m64*)&p->dst[i1]);
    __m128 dst32 = _mm_loadl_pi(seed,  (__m64*)&p->dst[i2]);
    dst32        = _mm_loadh_pi(dst32, (__m64*)&p->dst[i3]);

    /* If any two source points share both coordinates, abort.
     * chkEq0: points 0==2 and 1==3; chkEq1: 1==2 and 0==3 (via swap of the
     * two pairs in src10); chkEq2: 0==1 and 2==3. A pair is equal only when
     * both its x and y lanes compare equal, but any equal lane pair showing
     * up in the combined mask is enough to be conservative here. */
    __m128 chkEq0 = _mm_cmpeq_ps(src10, src32);
    __m128 chkEq1 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src10, _MM_SHUFFLE(1, 0, 3, 2)), src32);
    __m128 chkEq2 = _mm_cmpeq_ps(_mm_shuffle_ps(src10, src32, _MM_SHUFFLE(1, 0, 1, 0)),
                                 _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 2, 3, 2)));

    /* Verify */
    if(_mm_movemask_ps(_mm_or_ps(chkEq0, _mm_or_ps(chkEq1, chkEq2)))){
        return 1;
    }

    /* If the matches do not satisfy the strong geometric constraint, abort.
     *
     * For each diagonal (0-1 in src / dst), build the 2D line through it as
     * cross coefficients (crosssd0/1/2), then evaluate the remaining two
     * points against that line (dotsd0/dotsd1). The sample is consistent only
     * when the sign pattern of those evaluations matches between source
     * (dots) and destination (dotd). */
    __m128 p3210x = _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 p3210y = _mm_shuffle_ps(src10, src32, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 p7654x = _mm_shuffle_ps(dst10, dst32, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 p7654y = _mm_shuffle_ps(dst10, dst32, _MM_SHUFFLE(3, 1, 3, 1));

    __m128 p6420x = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 p6420y = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(2, 0, 2, 0));
    __m128 p7531x = _mm_shuffle_ps(p3210x, p7654x, _MM_SHUFFLE(3, 1, 3, 1));
    __m128 p7531y = _mm_shuffle_ps(p3210y, p7654y, _MM_SHUFFLE(3, 1, 3, 1));

    __m128 crosssd0 = _mm_sub_ps(p6420y, p7531y);
    __m128 crosssd1 = _mm_sub_ps(p7531x, p6420x);
    __m128 crosssd2 = _mm_sub_ps(_mm_mul_ps(p6420x, p7531y), _mm_mul_ps(p6420y, p7531x));

    __m128 shufcrosssd0 = _mm_shuffle_ps(crosssd0, crosssd0, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 shufcrosssd1 = _mm_shuffle_ps(crosssd1, crosssd1, _MM_SHUFFLE(2, 3, 0, 1));
    __m128 shufcrosssd2 = _mm_shuffle_ps(crosssd2, crosssd2, _MM_SHUFFLE(2, 3, 0, 1));

    __m128 dotsd0 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p6420x),
                                          _mm_mul_ps(shufcrosssd1, p6420y)),
                               shufcrosssd2);
    __m128 dotsd1 = _mm_add_ps(_mm_add_ps(_mm_mul_ps(shufcrosssd0, p7531x),
                                          _mm_mul_ps(shufcrosssd1, p7531y)),
                               shufcrosssd2);

    __m128 dots = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(0, 1, 0, 1));
    __m128 dotd = _mm_shuffle_ps(dotsd0, dotsd1, _MM_SHUFFLE(2, 3, 2, 3));

    /* Sign disagreement test via integer XOR of the (rounded) dot products;
     * a negative XOR means opposite signs on some lane. */
    //if(_mm_movemask_ps(_mm_cmpge_ps(_mm_setzero_ps(), _mm_mul_ps(dots, dotd)))){
    if(_mm_movemask_epi8(_mm_cmplt_epi32(_mm_xor_si128(_mm_cvtps_epi32(dots),
                                                       _mm_cvtps_epi32(dotd)),
                                         _mm_setzero_si128()))){
        return 1;
    }

    /* Otherwise, proceed with evaluation: pack the sample for later use. */
    _mm_store_ps((float*)&p->pkdPts[0], src10);
    _mm_store_ps((float*)&p->pkdPts[2], src32);
    _mm_store_ps((float*)&p->pkdPts[4], dst10);
    _mm_store_ps((float*)&p->pkdPts[6], dst32);

    return 0;
}
/* Striped SSE Forward algorithm engine (probability space, sparse rescaling).
 *
 * do_full - 1 to keep every DP row (full matrix), 0 to reuse a single row.
 * dsq     - digitized sequence 1..L.
 * L       - sequence length.
 * om      - optimized profile (striped float vectors).
 * ox      - DP matrix / workspace; scale factors stored per row.
 * opt_sc  - optional output: final Forward score in nats.
 *
 * Returns eslOK, or raises an ESL_EXCEPTION on score overflow/underflow/NaN.
 */
static int forward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *opt_sc)
{
  register __m128 mpv, dpv, ipv;   /* previous row values                                       */
  register __m128 sv;              /* temp storage of 1 curr row value in progress              */
  register __m128 dcv;             /* delayed storage of D(i,q+1)                               */
  register __m128 xEv;             /* E state: keeps max for Mk->E as we go                     */
  register __m128 xBv;             /* B state: splatted vector of B[i-1] for B->Mk calculations */
  __m128   zerov;                  /* splatted 0.0's in a vector                                */
  float    xN, xE, xB, xC, xJ;     /* special states' scores                                    */
  int i;                           /* counter over sequence positions 1..L                      */
  int q;                           /* counter over quads 0..nq-1                                */
  int j;                           /* counter over DD iterations (4 is full serialization)      */
  int Q = p7O_NQF(om->M);          /* segment length: # of vectors                              */
  __m128 *dpc = ox->dpf[0];        /* current row, for use in {MDI}MO(dpp,q) access macro       */
  __m128 *dpp;                     /* previous row, for use in {MDI}MO(dpp,q) access macro      */
  __m128 *rp;                      /* will point at om->rfv[x] for residue x[i]                 */
  __m128 *tp;                      /* will point into (and step thru) om->tfv                   */

  /* Initialization. */
  ox->M = om->M;
  ox->L = L;
  ox->has_own_scales = TRUE;       /* all forward matrices control their own scalefactors */
  zerov = _mm_setzero_ps();
  for (q = 0; q < Q; q++) MMO(dpc,q) = IMO(dpc,q) = DMO(dpc,q) = zerov;
  xE = ox->xmx[p7X_E] = 0.;
  xN = ox->xmx[p7X_N] = 1.;
  xJ = ox->xmx[p7X_J] = 0.;
  xB = ox->xmx[p7X_B] = om->xf[p7O_N][p7O_MOVE];
  xC = ox->xmx[p7X_C] = 0.;
  ox->xmx[p7X_SCALE] = 1.0;
  ox->totscale = 0.0;
#if p7_DEBUGGING
  if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, 0, 9, 5, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=0, width=8, precision=5*/
#endif

  for (i = 1; i <= L; i++)
    {
      dpp = dpc;
      dpc = ox->dpf[do_full * i];  /* avoid conditional, use do_full as kronecker delta */
      rp  = om->rfv[dsq[i]];
      tp  = om->tfv;
      dcv = _mm_setzero_ps();
      xEv = _mm_setzero_ps();
      xBv = _mm_set1_ps(xB);

      /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12. Shift zeros on. */
      mpv = esl_sse_rightshift_ps(MMO(dpp,Q-1), zerov);
      dpv = esl_sse_rightshift_ps(DMO(dpp,Q-1), zerov);
      ipv = esl_sse_rightshift_ps(IMO(dpp,Q-1), zerov);

      for (q = 0; q < Q; q++)
        {
          /* Calculate new MMO(i,q); don't store it yet, hold it in sv. */
          sv  = _mm_mul_ps(xBv, *tp);                   tp++;
          sv  = _mm_add_ps(sv, _mm_mul_ps(mpv, *tp));   tp++;
          sv  = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp));   tp++;
          sv  = _mm_add_ps(sv, _mm_mul_ps(dpv, *tp));   tp++;
          sv  = _mm_mul_ps(sv, *rp);                    rp++;
          xEv = _mm_add_ps(xEv, sv);

          /* Load {MDI}(i-1,q) into mpv, dpv, ipv;
           * {MDI}MX(q) is then the current, not the prev row */
          mpv = MMO(dpp,q);
          dpv = DMO(dpp,q);
          ipv = IMO(dpp,q);

          /* Do the delayed stores of {MD}(i,q) now that memory is usable */
          MMO(dpc,q) = sv;
          DMO(dpc,q) = dcv;

          /* Calculate the next D(i,q+1) partially: M->D only;
           * delay storage, holding it in dcv */
          dcv = _mm_mul_ps(sv, *tp); tp++;

          /* Calculate and store I(i,q); assumes odds ratio for emission is 1.0 */
          sv         = _mm_mul_ps(mpv, *tp);  tp++;
          IMO(dpc,q) = _mm_add_ps(sv, _mm_mul_ps(ipv, *tp)); tp++;
        }

      /* Now the DD paths. We would rather not serialize them but
       * in an accurate Forward calculation, we have few options.
       * dcv has carried through from end of q loop above; store it
       * in first pass, we add M->D and D->D path into DMX.
       * We're almost certainly're obligated to do at least one complete
       * DD path to be sure: */
      dcv        = esl_sse_rightshift_ps(dcv, zerov);
      DMO(dpc,0) = zerov;
      tp         = om->tfv + 7*Q;  /* set tp to start of the DD's */
      for (q = 0; q < Q; q++)
        {
          DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q));
          dcv        = _mm_mul_ps(DMO(dpc,q), *tp); tp++;  /* extend DMO(q), so we include M->D and D->D paths */
        }

      /* now. on small models, it seems best (empirically) to just go
       * ahead and serialize. on large models, we can do a bit better,
       * by testing for when dcv (DD path) accrued to DMO(q) is below
       * machine epsilon for all q, in which case we know DMO(q) are all
       * at their final values. The tradeoff point is (empirically) somewhere
       * around M=100, at least on my desktop. We don't worry about the
       * conditional here; it's outside any inner loops. */
      if (om->M < 100)
        {
          /* Fully serialized version */
          for (j = 1; j < 4; j++)
            {
              dcv = esl_sse_rightshift_ps(dcv, zerov);
              tp  = om->tfv + 7*Q;  /* set tp to start of the DD's */
              for (q = 0; q < Q; q++)
                { /* note, extend dcv, not DMO(q); only adding DD paths now */
                  DMO(dpc,q) = _mm_add_ps(dcv, DMO(dpc,q));
                  dcv        = _mm_mul_ps(dcv, *tp);   tp++;
                }
            }
        }
      else
        {
          /* Slightly parallelized version, but which incurs some overhead */
          for (j = 1; j < 4; j++)
            {
              register __m128 cv; /* keeps track of whether any DD's change DMO(q) */
              dcv = esl_sse_rightshift_ps(dcv, zerov);
              tp  = om->tfv + 7*Q;  /* set tp to start of the DD's */
              cv  = zerov;
              for (q = 0; q < Q; q++)
                { /* using cmpgt below tests if DD changed any DMO(q) *without* conditional branch */
                  sv         = _mm_add_ps(dcv, DMO(dpc,q));
                  cv         = _mm_or_ps(cv, _mm_cmpgt_ps(sv, DMO(dpc,q)));
                  DMO(dpc,q) = sv;                                    /* store new DMO(q) */
                  dcv        = _mm_mul_ps(dcv, *tp);   tp++;          /* note, extend dcv, not DMO(q) */
                }
              if (! _mm_movemask_ps(cv)) break; /* DD's didn't change any DMO(q)? Then done, break out. */
            }
        }

      /* Add D's to xEv */
      for (q = 0; q < Q; q++) xEv = _mm_add_ps(DMO(dpc,q), xEv);

      /* Finally the "special" states, which start from Mk->E (->C, ->J->B)
       * The following incantation is a horizontal sum of xEv's elements.
       * These must follow DD calculations, because D's contribute to E in Forward
       * (as opposed to Viterbi). */
      xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(0, 3, 2, 1)));
      xEv = _mm_add_ps(xEv, _mm_shuffle_ps(xEv, xEv, _MM_SHUFFLE(1, 0, 3, 2)));
      _mm_store_ss(&xE, xEv);

      xN = xN * om->xf[p7O_N][p7O_LOOP];
      xC = (xC * om->xf[p7O_C][p7O_LOOP]) + (xE * om->xf[p7O_E][p7O_MOVE]);
      xJ = (xJ * om->xf[p7O_J][p7O_LOOP]) + (xE * om->xf[p7O_E][p7O_LOOP]);
      xB = (xJ * om->xf[p7O_J][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_MOVE]);
      /* and now xB will carry over into next i, and xC carries over after i=L */

      /* Sparse rescaling. xE above threshold? trigger a rescaling event. */
      if (xE > 1.0e4)  /* that's a little less than e^10, ~10% of our dynamic range */
        {
          xN  = xN / xE;
          xC  = xC / xE;
          xJ  = xJ / xE;
          xB  = xB / xE;
          xEv = _mm_set1_ps(1.0 / xE);
          for (q = 0; q < Q; q++)
            {
              MMO(dpc,q) = _mm_mul_ps(MMO(dpc,q), xEv);
              DMO(dpc,q) = _mm_mul_ps(DMO(dpc,q), xEv);
              IMO(dpc,q) = _mm_mul_ps(IMO(dpc,q), xEv);
            }
          ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = xE;
          ox->totscale += log(xE);
          xE = 1.0;
        }
      else ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = 1.0;

      /* Storage of the specials. We could've stored these already
       * but using xE, etc. variables makes it easy to convert this
       * code to O(M) memory versions just by deleting storage steps. */
      ox->xmx[i*p7X_NXCELLS+p7X_E] = xE;
      ox->xmx[i*p7X_NXCELLS+p7X_N] = xN;
      ox->xmx[i*p7X_NXCELLS+p7X_J] = xJ;
      ox->xmx[i*p7X_NXCELLS+p7X_B] = xB;
      ox->xmx[i*p7X_NXCELLS+p7X_C] = xC;

#if p7_DEBUGGING
      if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, i, 9, 5, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=i, width=8, precision=5*/
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T, and flip total score back to log space (nats) */
  /* On overflow, xC is inf or nan (nan arises because inf*0 = nan). */
  /* On an underflow (which shouldn't happen), we counterintuitively return infinity:
   * the effect of this is to force the caller to rescore us with full range. */
  if      (isnan(xC))        ESL_EXCEPTION(eslERANGE, "forward score is NaN");
  else if (L>0 && xC == 0.0) ESL_EXCEPTION(eslERANGE, "forward score underflow (is 0.0)"); /* if L==0, xC *should* be 0.0; J5/118 */
  else if (isinf(xC) == 1)   ESL_EXCEPTION(eslERANGE, "forward score overflow (is infinity)");

  if (opt_sc != NULL) *opt_sc = ox->totscale + log(xC * om->xf[p7O_C][p7O_MOVE]);
  return eslOK;
}
// -------------------------------------------------------------- vuint32 mandelbrot_SIMD_F32(vfloat32 a, vfloat32 b, int max_iter) // -------------------------------------------------------------- { // version avec test de sortie en float vuint32 iter = _mm_set1_epi32(0); vfloat32 fiter = _mm_set_ps(0,0,0,0); vfloat32 x,y,t,t2,zero,un,deux,quatre; // COMPLETER ICI int test,i = 0; // initialisation des variables x = _mm_set_ps(0,0,0,0); y = _mm_set_ps(0,0,0,0); deux = _mm_set_ps(2,2,2,2); quatre = _mm_set_ps(4,4,4,4); un = _mm_set_ps(1,1,1,1); zero = _mm_set_ps(0,0,0,0); // iteration zero t = _mm_mul_ps(x, x); t2 = _mm_mul_ps(y, y); y = _mm_mul_ps(x,y); y = _mm_mul_ps(y,deux); y = _mm_add_ps(y,b); x = _mm_sub_ps(t,t2); x = _mm_add_ps(x,a); // calcul while(i<max_iter && _mm_movemask_ps(t) != 15){ t = _mm_mul_ps(x, x); t2 = _mm_mul_ps(y, y); y = _mm_mul_ps(_mm_mul_ps(x,y),deux); y = _mm_add_ps(y,b); x = _mm_sub_ps(t,t2); x = _mm_add_ps(x,a); t2 = _mm_add_ps(t,t2); t2 = _mm_cmple_ps(t2,quatre); t = _mm_blendv_ps(zero,un,t2); fiter = _mm_add_ps(fiter,t); t = _mm_cmpeq_ps(t, zero); //display_vfloat32(t,"%f\t","T :: "); //printf(" MASK::%d \n",_mm_movemask_ps(t)); i+=1; } iter = _mm_cvtps_epi32(fiter); return iter; }
//Returns true if the triangle is visible
// Conservative screen-space visibility test: rasterizes the triangle in
// 2x2 pixel quads over its bounding box and compares a single flat depth
// (the minimum vertex z) against the stored depth buffer. Returns true as
// soon as any covered pixel would pass the depth test.
// NOTE(review): assumes v0..v2 are already in screen space with w holding
// the pre-division clip w — TODO confirm against the caller (testAABB).
bool DepthBuffer::testTriangle2x2(const vec4f& v0,const vec4f& v1,const vec4f& v2){
    // Lane layout for one 2x2 quad: (x,y), (x+1,y), (x,y+1), (x+1,y+1)
    VecS32 colOffset(0, 1, 0, 1);
    VecS32 rowOffset(0, 0, 1, 1);
    vec2i vertex[3];
    vertex[0] = vec2i(int32(v0.x),int32(v0.y));
    vertex[1] = vec2i(int32(v1.x),int32(v1.y));
    vertex[2] = vec2i(int32(v2.x),int32(v2.y));
    // Reject the triangle if any of its verts is behind the nearclip plane
    if(v0.w == 0.0f || v1.w == 0.0f || v2.w == 0.0f) return true;
    // Conservative depth for the whole triangle: nearest vertex wins.
    float minZ = std::min(v0.z,std::min(v1.z,v2.z));
    VecF32 fixedDepth(minZ);
    // Fab(x, y) = Ax + By + C = 0
    // Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0
    // Compute A = (ya - yb) for the 3 line segments that make up each triangle
    auto A0 = vertex[1].y - vertex[2].y;
    auto A1 = vertex[2].y - vertex[0].y;
    auto A2 = vertex[0].y - vertex[1].y;
    // Compute B = (xb - xa) for the 3 line segments that make up each triangle
    auto B0 = vertex[2].x - vertex[1].x;
    auto B1 = vertex[0].x - vertex[2].x;
    auto B2 = vertex[1].x - vertex[0].x;
    // Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
    auto C0 = vertex[1].x * vertex[2].y - vertex[2].x * vertex[1].y;
    auto C1 = vertex[2].x * vertex[0].y - vertex[0].x * vertex[2].y;
    auto C2 = vertex[0].x * vertex[1].y - vertex[1].x * vertex[0].y;
    // Use bounding box traversal strategy to determine which pixels to rasterize
    // (clamped to the buffer, min corner snapped to even for 2x2 alignment).
    auto minx = std::max(std::min(std::min(vertex[0].x,vertex[1].x),vertex[2].x),0) & (~1);
    auto maxx = std::min(std::max(std::max(vertex[0].x,vertex[1].x),vertex[2].x),size_.x-2);
    auto miny = std::max(std::min(std::min(vertex[0].y,vertex[1].y),vertex[2].y),0) & (~1);
    auto maxy = std::min(std::max(std::max(vertex[0].y,vertex[1].y),vertex[2].y),size_.y-2);
    // Splat the edge coefficients across the 4 lanes of the quad.
    VecS32 a0(A0);
    VecS32 a1(A1);
    VecS32 a2(A2);
    VecS32 b0(B0);
    VecS32 b1(B1);
    VecS32 b2(B2);
    VecS32 col = VecS32(minx) + colOffset;
    VecS32 row = VecS32(miny) + rowOffset;
    // NOTE(review): depth storage appears to be swizzled per 2x2 quad —
    // rowIdx advances 4 floats per quad below; verify against VecF32::load use.
    auto rowIdx = miny*size_.x + 2 * minx;
    // Edge-function values at the top-left quad of the bounding box.
    VecS32 w0_row = a0 * col + b0 * row + VecS32(C0);
    VecS32 w1_row = a1 * col + b1 * row + VecS32(C1);
    VecS32 w2_row = a2 * col + b2 * row + VecS32(C2);
    //Multiply each weight by two(rasterize 2x2 quad at once).
    a0 = shiftl<1>(a0);
    a1 = shiftl<1>(a1);
    a2 = shiftl<1>(a2);
    b0 = shiftl<1>(b0);
    b1 = shiftl<1>(b1);
    b2 = shiftl<1>(b2);
    for(int32 y = miny;y<=maxy;y+=2,rowIdx += 2 * size_.x){
        auto w0 = w0_row;
        auto w1 = w1_row;
        auto w2 = w2_row;
        auto idx = rowIdx;
        for(int32 x = minx;x<=maxx;x+=2,idx+=4){
            // Sign bits of w0|w1|w2 mark lanes outside the triangle;
            // masks == 0xF means the whole quad is outside.
            auto mask = w0|w1|w2;
            auto masks = _mm_movemask_ps(bits2float(mask).simd);
            if(masks != 0xF){
                VecF32 previousDepth = VecF32::load(data_+idx);
                // Covered lanes that also pass the depth comparison.
                auto cmpMask = ((~masks)&0xF)& _mm_movemask_ps(cmple(fixedDepth,previousDepth).simd);
                if(cmpMask){
                    return true;
                }
            }
            w0+=a0;
            w1+=a1;
            w2+=a2;
        }
        w0_row += b0;
        w1_row += b1;
        w2_row += b2;
    }
    return false;
}
/// Compares 16 bytes interpreted as four packed single-precision floats.
/// Returns true when no lane compares "not equal".
/// Seems to be incorrect when comparing subnormal floats (translated from
/// the original Russian note); NaN lanes would also compare as not-equal.
inline bool compare_byFloatSSE(const char * p1, const char * p2)
{
    const __m128 lhs = _mm_loadu_ps(reinterpret_cast<const float *>(p1));
    const __m128 rhs = _mm_loadu_ps(reinterpret_cast<const float *>(p2));
    return _mm_movemask_ps(_mm_cmpneq_ps(lhs, rhs)) == 0;
}
/**
 * SSE-accelerated inverse-distance-weighted interpolation, specialised for
 * power = 2, no smoothing and no search radius. Accumulates
 * sum(z/r^2) / sum(1/r^2) over all points; if any point is (nearly)
 * coincident with the grid node (r^2 < epsilon), that point's value is
 * returned directly to avoid the singularity.
 *
 * The point coordinates are read from hExtraParamsIn (pre-converted float
 * arrays); the unused_padf* double arrays are intentionally ignored.
 * NOTE(review): _mm_load_ps requires pafX/pafY/pafZ to be 16-byte aligned —
 * presumably guaranteed by the allocator of GDALGridExtraParameters; verify.
 * NOTE(review): _mm_rcp_ps is only an ~12-bit reciprocal approximation, so
 * the SIMD accumulation differs slightly from the scalar tail loop.
 */
CPLErr
GDALGridInverseDistanceToAPower2NoSmoothingNoSearchSSE(
                        const void *poOptions,
                        GUInt32 nPoints,
                        CPL_UNUSED const double *unused_padfX,
                        CPL_UNUSED const double *unused_padfY,
                        CPL_UNUSED const double *unused_padfZ,
                        double dfXPoint, double dfYPoint,
                        double *pdfValue,
                        void* hExtraParamsIn )
{
    size_t i = 0;
    GDALGridExtraParameters* psExtraParams = (GDALGridExtraParameters*) hExtraParamsIn;
    const float* pafX = psExtraParams->pafX;
    const float* pafY = psExtraParams->pafY;
    const float* pafZ = psExtraParams->pafZ;

    const float fEpsilon = 0.0000000000001f;
    const float fXPoint = (float)dfXPoint;
    const float fYPoint = (float)dfYPoint;
    const __m128 xmm_small = _mm_load1_ps((float*)&fEpsilon);
    const __m128 xmm_x = _mm_load1_ps((float*)&fXPoint);
    const __m128 xmm_y = _mm_load1_ps((float*)&fYPoint);
    __m128 xmm_nominator = _mm_setzero_ps();
    __m128 xmm_denominator = _mm_setzero_ps();
    int mask = 0;

#if defined(__x86_64) || defined(_M_X64)
    /* This would also work in 32bit mode, but there are only 8 XMM registers */
    /* whereas we have 16 for 64bit */
#define LOOP_SIZE 8
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
    for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
    {
        __m128 xmm_rx = _mm_sub_ps(_mm_load_ps(pafX + i), xmm_x);           /* rx = pafX[i] - fXPoint */
        __m128 xmm_rx_4 = _mm_sub_ps(_mm_load_ps(pafX + i + 4), xmm_x);
        __m128 xmm_ry = _mm_sub_ps(_mm_load_ps(pafY + i), xmm_y);           /* ry = pafY[i] - fYPoint */
        __m128 xmm_ry_4 = _mm_sub_ps(_mm_load_ps(pafY + i + 4), xmm_y);
        __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx),              /* r2 = rx * rx + ry * ry */
                                   _mm_mul_ps(xmm_ry, xmm_ry));
        __m128 xmm_r2_4 = _mm_add_ps(_mm_mul_ps(xmm_rx_4, xmm_rx_4),
                                     _mm_mul_ps(xmm_ry_4, xmm_ry_4));
        __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2);                              /* invr2 = 1.0f / r2 */
        __m128 xmm_invr2_4 = _mm_rcp_ps(xmm_r2_4);
        xmm_nominator = _mm_add_ps(xmm_nominator,                           /* nominator += invr2 * pafZ[i] */
                                   _mm_mul_ps(xmm_invr2, _mm_load_ps(pafZ + i)));
        xmm_nominator = _mm_add_ps(xmm_nominator,
                                   _mm_mul_ps(xmm_invr2_4, _mm_load_ps(pafZ + i + 4)));
        xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2);           /* denominator += invr2 */
        xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2_4);
        /* bits 0-3: first group of 4, bits 4-7: second group */
        mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small)) |           /* if( r2 < fEpsilon) */
               (_mm_movemask_ps(_mm_cmplt_ps(xmm_r2_4, xmm_small)) << 4);
        if( mask )
            break;
    }
#else
#define LOOP_SIZE 4
    size_t nPointsRound = (nPoints / LOOP_SIZE) * LOOP_SIZE;
    for ( i = 0; i < nPointsRound; i += LOOP_SIZE )
    {
        __m128 xmm_rx = _mm_sub_ps(_mm_load_ps((float*)pafX + i), xmm_x);   /* rx = pafX[i] - fXPoint */
        __m128 xmm_ry = _mm_sub_ps(_mm_load_ps((float*)pafY + i), xmm_y);   /* ry = pafY[i] - fYPoint */
        __m128 xmm_r2 = _mm_add_ps(_mm_mul_ps(xmm_rx, xmm_rx),              /* r2 = rx * rx + ry * ry */
                                   _mm_mul_ps(xmm_ry, xmm_ry));
        __m128 xmm_invr2 = _mm_rcp_ps(xmm_r2);                              /* invr2 = 1.0f / r2 */
        xmm_nominator = _mm_add_ps(xmm_nominator,                           /* nominator += invr2 * pafZ[i] */
                                   _mm_mul_ps(xmm_invr2, _mm_load_ps((float*)pafZ + i)));
        xmm_denominator = _mm_add_ps(xmm_denominator, xmm_invr2);           /* denominator += invr2 */
        mask = _mm_movemask_ps(_mm_cmplt_ps(xmm_r2, xmm_small));            /* if( r2 < fEpsilon) */
        if( mask )
            break;
    }
#endif

    /* Find which i triggered r2 < fEpsilon */
    if( mask )
    {
        for(int j = 0; j < LOOP_SIZE; j++ )
        {
            if( mask & (1 << j) )
            {
                (*pdfValue) = (pafZ)[i + j];
                return CE_None;
            }
        }
    }

    /* Get back nominator and denominator values for XMM registers */
    float afNominator[4], afDenominator[4];
    _mm_storeu_ps(afNominator, xmm_nominator);
    _mm_storeu_ps(afDenominator, xmm_denominator);
    /* Horizontal sums of the 4 partial accumulators */
    float fNominator = afNominator[0] + afNominator[1] +
                       afNominator[2] + afNominator[3];
    float fDenominator = afDenominator[0] + afDenominator[1] +
                         afDenominator[2] + afDenominator[3];

    /* Do the few remaining loop iterations */
    for ( ; i < nPoints; i++ )
    {
        const float fRX = pafX[i] - fXPoint;
        const float fRY = pafY[i] - fYPoint;
        const float fR2 = fRX * fRX + fRY * fRY;
        // If the test point is close to the grid node, use the point
        // value directly as a node value to avoid singularity.
        if ( fR2 < 0.0000000000001 )
        {
            break;
        }
        else
        {
            const float fInvR2 = 1.0f / fR2;
            fNominator += fInvR2 * pafZ[i];
            fDenominator += fInvR2;
        }
    }

    /* i != nPoints means the scalar loop broke on a coincident point */
    if( i != nPoints )
    {
        (*pdfValue) = pafZ[i];
    }
    else if ( fDenominator == 0.0 )
    {
        (*pdfValue) = ((GDALGridInverseDistanceToAPowerOptions*)poOptions)->dfNoDataValue;
    }
    else
        (*pdfValue) = fNominator / fDenominator;

    return CE_None;
}
/**
 * Conservative visibility test of a world-space AABB against the depth
 * buffer. The box vertices are transformed by viewProjection_ into screen
 * space, the front-facing quads are extracted, and each quad is tested as
 * two triangles via testTriangle2x2.
 *
 * Returns true as soon as any triangle of any front face passes the depth
 * test (box potentially visible), false when the box is fully occluded.
 *
 * NOTE(review): the body of an older ray-marching implementation was left
 * in the function after the first `return false;`. All of it — including
 * its writes to data_ — was unreachable and has been removed.
 */
bool DepthBuffer::testAABB(vec3f min,vec3f max){
    vec4f vertices[8];
    gatherBoxVertices(vertices,min,max);
    transformBoxVertices(vertices,viewProjection_);
    boxVerticesToScreenVertices(vertices,clipSpaceToScreenSpaceMultiplier,center_);

    // Collect the screen-space quads for the faces of the box that are
    // oriented towards the camera (selection mask: aabbFrontFaceMask).
    ScreenSpaceQuad faces[6];
    auto faceCount = extractBoxQuads(faces,vertices,aabbFrontFaceMask);

    // Visible if any triangle of any front face passes the depth test.
    for(uint32 i = 0;i<faceCount;++i){
        if(testTriangle2x2(faces[i].v[0],faces[i].v[1],faces[i].v[2])) return true;
        if(testTriangle2x2(faces[i].v[2],faces[i].v[3],faces[i].v[0])) return true;
    }
    return false;
}
/// Packs the sign bits of the four float lanes of `a` into the low
/// four bits of the result (lane 0 -> bit 0).
inline unsigned int to_bitmask(const float4& a)
{
    const int signBits = _mm_movemask_ps(a.data);
    return static_cast<unsigned int>(signBits);
}
/// True when at least one lane of `x` has its mask (sign) bit set.
KFR_SINTRIN bool bittestany(const f32sse& x)
{
    return _mm_movemask_ps(*x) != 0;
}
/// True when every lane of `x` has its mask (sign) bit set,
/// i.e. no lane of the complement ~x has one.
KFR_SINTRIN bool bittestall(const f32sse& x)
{
    return _mm_movemask_ps(*~x) == 0;
}
/**
 * Renders one rectangular image block of the VPL preview using 4-wide
 * coherent SSE ray packets. For every 2x2 pixel tile: trace primary
 * camera rays, shade each hit against the single VPL (m_vpl), then trace
 * shadow rays toward the VPL and write the shadowed result into 'block'.
 * NOTE(review): the 'stop' flag is accepted but never checked here.
 */
void PreviewWorker::processCoherent(const WorkUnit *workUnit,
        WorkResult *workResult, const bool &stop) {
#if defined(MTS_HAS_COHERENT_RT)
    const RectangularWorkUnit *rect = static_cast<const RectangularWorkUnit *>(workUnit);
    ImageBlock *block = static_cast<ImageBlock *>(workResult);
    block->setOffset(rect->getOffset());
    block->setSize(rect->getSize());

    /* Some constants */
    /* NOTE(review): sx comes from rect but sy from block; equal in practice
       since the block offset was just copied from rect above. */
    const int sx = rect->getOffset().x, sy = block->getOffset().y;
    const int ex = sx + rect->getSize().x, ey = sy + rect->getSize().y;
    const int width = rect->getSize().x;
    /* Lane layout of one packet: (x,y), (x+1,y), (x,y+1), (x+1,y+1) */
    const SSEVector MM_ALIGN16 xOffset(0.0f, 1.0f, 0.0f, 1.0f);
    const SSEVector MM_ALIGN16 yOffset(0.0f, 0.0f, 1.0f, 1.0f);
    const int pixelOffset[] = {0, 1, width, width+1};
    /* Clamp for 1/r^2 so VPLs very close to a surface don't blow up */
    const __m128 clamping = _mm_set1_ps(1/(m_minDist*m_minDist));
    uint8_t temp[MTS_KD_INTERSECTION_TEMP*4];
    /* Camera and VPL quantities splatted across all 4 lanes */
    const __m128 camTL[3] = {
        _mm_set1_ps(m_cameraTL.x),
        _mm_set1_ps(m_cameraTL.y),
        _mm_set1_ps(m_cameraTL.z) };
    const __m128 camDx[3] = {
        _mm_set1_ps(m_cameraDx.x),
        _mm_set1_ps(m_cameraDx.y),
        _mm_set1_ps(m_cameraDx.z) };
    const __m128 camDy[3] = {
        _mm_set1_ps(m_cameraDy.x),
        _mm_set1_ps(m_cameraDy.y),
        _mm_set1_ps(m_cameraDy.z) };
    const __m128 lumPos[3] = {
        _mm_set1_ps(m_vpl.its.p.x),
        _mm_set1_ps(m_vpl.its.p.y),
        _mm_set1_ps(m_vpl.its.p.z) };
    const __m128 lumDir[3] = {
        _mm_set1_ps(m_vpl.its.shFrame.n.x),
        _mm_set1_ps(m_vpl.its.shFrame.n.y),
        _mm_set1_ps(m_vpl.its.shFrame.n.z) };

    /* Some local variables */
    int pos = 0;
    int numRays = 0;
    RayPacket4 MM_ALIGN16 primRay4, secRay4;
    Intersection4 MM_ALIGN16 its4, secIts4;
    RayInterval4 MM_ALIGN16 itv4, secItv4;
    SSEVector MM_ALIGN16 nSecD[3], cosThetaLight, invLengthSquared;
    Spectrum emitted[4], direct[4];
    Intersection its;
    Vector wo, wi;
    its.hasUVPartials = false;

    /* Precompute the VPL weight when the source is diffuse, so the
       per-pixel fast path below can reuse it unchanged. */
    bool diffuseVPL = false, vplOnSurface = false;
    Spectrum vplWeight;
    if (m_vpl.type == ESurfaceVPL && (m_diffuseSources ||
            m_vpl.its.shape->getBSDF()->getType() == BSDF::EDiffuseReflection)) {
        diffuseVPL = true;
        vplOnSurface = true;
        vplWeight = m_vpl.its.shape->getBSDF()->getDiffuseReflectance(m_vpl.its) * m_vpl.P / M_PI;
    } else if (m_vpl.type == ELuminaireVPL) {
        vplOnSurface = m_vpl.luminaire->getType() & Luminaire::EOnSurface;
        diffuseVPL = m_vpl.luminaire->getType() & Luminaire::EDiffuseDirection;
        EmissionRecord eRec(m_vpl.luminaire,
            ShapeSamplingRecord(m_vpl.its.p, m_vpl.its.shFrame.n), m_vpl.its.shFrame.n);
        vplWeight = m_vpl.P * m_vpl.luminaire->evalDirection(eRec);
    }

    /* All primary rays share the camera origin */
    primRay4.o[0].ps = _mm_set1_ps(m_cameraO.x);
    primRay4.o[1].ps = _mm_set1_ps(m_cameraO.y);
    primRay4.o[2].ps = _mm_set1_ps(m_cameraO.z);
    secItv4.mint.ps = _mm_set1_ps(ShadowEpsilon);

    /* Work on 2x2 sub-blocks */
    for (int y=sy; y<ey; y += 2, pos += width) {
        for (int x=sx; x<ex; x += 2, pos += 2) {
            /* Generate camera rays without normalization */
            const __m128
                xPixel = _mm_add_ps(xOffset.ps, _mm_set1_ps((float) x)),
                yPixel = _mm_add_ps(yOffset.ps, _mm_set1_ps((float) y));

            primRay4.d[0].ps = _mm_add_ps(camTL[0], _mm_add_ps(
                _mm_mul_ps(xPixel, camDx[0]), _mm_mul_ps(yPixel, camDy[0])));
            primRay4.d[1].ps = _mm_add_ps(camTL[1], _mm_add_ps(
                _mm_mul_ps(xPixel, camDx[1]), _mm_mul_ps(yPixel, camDy[1])));
            primRay4.d[2].ps = _mm_add_ps(camTL[2], _mm_add_ps(
                _mm_mul_ps(xPixel, camDx[2]), _mm_mul_ps(yPixel, camDy[2])));
            primRay4.dRcp[0].ps = _mm_div_ps(SSEConstants::one.ps, primRay4.d[0].ps);
            primRay4.dRcp[1].ps = _mm_div_ps(SSEConstants::one.ps, primRay4.d[1].ps);
            primRay4.dRcp[2].ps = _mm_div_ps(SSEConstants::one.ps, primRay4.d[2].ps);

            /* Ray coherence test: all 4 direction signs must agree on each
               axis for the fast packet traversal to be usable. */
            const int primSignsX = _mm_movemask_ps(primRay4.d[0].ps);
            const int primSignsY = _mm_movemask_ps(primRay4.d[1].ps);
            const int primSignsZ = _mm_movemask_ps(primRay4.d[2].ps);
            const bool primCoherent = (primSignsX == 0 || primSignsX == 0xF)
                && (primSignsY == 0 || primSignsY == 0xF)
                && (primSignsZ == 0 || primSignsZ == 0xF);

            /* Trace the primary rays */
            its4.t = SSEConstants::p_inf;
            if (EXPECT_TAKEN(primCoherent)) {
                /* Shared sign per axis (any nonzero mask means negative) */
                primRay4.signs[0][0] = primSignsX ? 1 : 0;
                primRay4.signs[1][0] = primSignsY ? 1 : 0;
                primRay4.signs[2][0] = primSignsZ ? 1 : 0;
                m_kdtree->rayIntersectPacket(primRay4, itv4, its4, temp);
            } else {
                m_kdtree->rayIntersectPacketIncoherent(primRay4, itv4, its4, temp);
            }
            numRays += 4;

            /* Generate secondary rays: from the hit points toward the VPL */
            secRay4.o[0].ps = _mm_add_ps(primRay4.o[0].ps,
                _mm_mul_ps(its4.t.ps, primRay4.d[0].ps));
            secRay4.o[1].ps = _mm_add_ps(primRay4.o[1].ps,
                _mm_mul_ps(its4.t.ps, primRay4.d[1].ps));
            secRay4.o[2].ps = _mm_add_ps(primRay4.o[2].ps,
                _mm_mul_ps(its4.t.ps, primRay4.d[2].ps));
            secRay4.d[0].ps = _mm_sub_ps(lumPos[0], secRay4.o[0].ps);
            secRay4.d[1].ps = _mm_sub_ps(lumPos[1], secRay4.o[1].ps);
            secRay4.d[2].ps = _mm_sub_ps(lumPos[2], secRay4.o[2].ps);

            /* Normalization */
            const __m128
                lengthSquared = _mm_add_ps(_mm_add_ps(
                    _mm_mul_ps(secRay4.d[0].ps, secRay4.d[0].ps),
                    _mm_mul_ps(secRay4.d[1].ps, secRay4.d[1].ps)),
                    _mm_mul_ps(secRay4.d[2].ps, secRay4.d[2].ps)),
                invLength = _mm_rsqrt_ps(lengthSquared);

            /* 1/r^2 falloff, clamped to avoid singularities near the VPL */
            invLengthSquared.ps = _mm_min_ps(_mm_rcp_ps(lengthSquared), clamping);

            nSecD[0].ps = _mm_mul_ps(secRay4.d[0].ps, invLength);
            nSecD[1].ps = _mm_mul_ps(secRay4.d[1].ps, invLength);
            nSecD[2].ps = _mm_mul_ps(secRay4.d[2].ps, invLength);
            secRay4.dRcp[0].ps = _mm_div_ps(SSEConstants::one.ps, secRay4.d[0].ps);
            secRay4.dRcp[1].ps = _mm_div_ps(SSEConstants::one.ps, secRay4.d[1].ps);
            secRay4.dRcp[2].ps = _mm_div_ps(SSEConstants::one.ps, secRay4.d[2].ps);

            /* cos(theta) at the VPL: -dot(nSecD, lumDir) */
            cosThetaLight.ps = _mm_sub_ps(_mm_setzero_ps(),
                _mm_add_ps(_mm_add_ps(
                    _mm_mul_ps(nSecD[0].ps, lumDir[0]),
                    _mm_mul_ps(nSecD[1].ps, lumDir[1])),
                    _mm_mul_ps(nSecD[2].ps, lumDir[2])));
            secItv4.maxt.ps = _mm_set1_ps(1-ShadowEpsilon);

            /* Shading (scalar) --- this is way too much work and should be
               rewritten to be smarter in special cases */
            for (int idx=0; idx<4; ++idx) {
                if (EXPECT_NOT_TAKEN(its4.t.f[idx] == std::numeric_limits<float>::infinity())) {
                    /* Don't trace a secondary ray */
                    secItv4.maxt.f[idx] = 0;
                    emitted[idx] = m_scene->LeBackground(Ray(
                        Point(primRay4.o[0].f[idx], primRay4.o[1].f[idx], primRay4.o[2].f[idx]),
                        Vector(primRay4.d[0].f[idx], primRay4.d[1].f[idx], primRay4.d[2].f[idx]),
                        0.0f
                    )) * m_backgroundScale;
                    memset(&direct[idx], 0, sizeof(Spectrum));
                    continue;
                }

                const unsigned int primIndex = its4.primIndex.i[idx];
                const Shape *shape = (*m_shapes)[its4.shapeIndex.i[idx]];
                const BSDF *bsdf = shape->getBSDF();

                if (EXPECT_NOT_TAKEN(!bsdf)) {
                    memset(&emitted[idx], 0, sizeof(Spectrum));
                    memset(&direct[idx], 0, sizeof(Spectrum));
                    continue;
                }

                if (EXPECT_TAKEN(primIndex != KNoTriangleFlag)) {
                    /* Triangle-mesh hit: interpolate shading attributes from
                       the barycentric coordinates delivered by the tracer */
                    const TriMesh *mesh = static_cast<const TriMesh *>(shape);
                    const Triangle &t = mesh->getTriangles()[primIndex];
                    const Normal *normals = mesh->getVertexNormals();
                    const Point2 *texcoords = mesh->getVertexTexcoords();
                    const Spectrum *colors = mesh->getVertexColors();
                    const TangentSpace * tangents = mesh->getVertexTangents();
                    const Float beta = its4.u.f[idx], gamma = its4.v.f[idx],
                        alpha = 1.0f - beta - gamma;
                    const uint32_t idx0 = t.idx[0], idx1 = t.idx[1], idx2 = t.idx[2];

                    if (EXPECT_TAKEN(normals)) {
                        const Normal &n0 = normals[idx0],
                              &n1 = normals[idx1],
                              &n2 = normals[idx2];
                        its.shFrame.n = normalize(n0 * alpha + n1 * beta + n2 * gamma);
                    } else {
                        /* No vertex normals: fall back to the face normal */
                        const Point *positions = mesh->getVertexPositions();
                        const Point &p0 = positions[idx0],
                              &p1 = positions[idx1],
                              &p2 = positions[idx2];
                        Vector sideA = p1 - p0, sideB = p2 - p0;
                        Vector n = cross(sideA, sideB);
                        Float nLengthSqr = n.lengthSquared();
                        if (nLengthSqr != 0)
                            n /= std::sqrt(nLengthSqr);
                        its.shFrame.n = Normal(n);
                    }

                    if (EXPECT_TAKEN(texcoords)) {
                        const Point2 &t0 = texcoords[idx0],
                             &t1 = texcoords[idx1],
                             &t2 = texcoords[idx2];
                        its.uv = t0 * alpha + t1 * beta + t2 * gamma;
                    } else {
                        its.uv = Point2(0.0f);
                    }

                    if (EXPECT_NOT_TAKEN(colors)) {
                        const Spectrum &c0 = colors[idx0],
                             &c1 = colors[idx1],
                             &c2 = colors[idx2];
                        its.color = c0 * alpha + c1 * beta + c2 * gamma;
                    }

                    if (EXPECT_NOT_TAKEN(tangents)) {
                        const TangentSpace &t0 = tangents[idx0],
                             &t1 = tangents[idx1],
                             &t2 = tangents[idx2];
                        its.dpdu = t0.dpdu * alpha + t1.dpdu * beta + t2.dpdu * gamma;
                        its.dpdv = t0.dpdv * alpha + t1.dpdv * beta + t2.dpdv * gamma;
                    }
                } else {
                    /* Non-triangle shape: let it fill the intersection
                       record itself; note that this may change 'bsdf'. */
                    Ray ray(
                        Point(primRay4.o[0].f[idx], primRay4.o[1].f[idx], primRay4.o[2].f[idx]),
                        Vector(primRay4.d[0].f[idx], primRay4.d[1].f[idx], primRay4.d[2].f[idx]),
                        0.0f
                    );
                    its.t = its4.t.f[idx];
                    shape->fillIntersectionRecord(ray,
                        temp + idx * MTS_KD_INTERSECTION_TEMP + 8, its);
                    bsdf = its.shape->getBSDF();
                }

                wo.x = nSecD[0].f[idx];
                wo.y = nSecD[1].f[idx];
                wo.z = nSecD[2].f[idx];

                if (EXPECT_TAKEN(!shape->isLuminaire())) {
                    memset(&emitted[idx], 0, sizeof(Spectrum));
                } else {
                    Vector d(-primRay4.d[0].f[idx], -primRay4.d[1].f[idx], -primRay4.d[2].f[idx]);
                    emitted[idx] = shape->getLuminaire()->Le(
                        ShapeSamplingRecord(its.p, its.shFrame.n), d);
                }

                if (EXPECT_TAKEN(bsdf->getType() == BSDF::EDiffuseReflection && diffuseVPL)) {
                    /* Fast path */
                    direct[idx] = (bsdf->getDiffuseReflectance(its) * vplWeight)
                        * (std::max((Float) 0.0f, dot(wo, its.shFrame.n))
                        * (vplOnSurface ? (std::max(cosThetaLight.f[idx], (Float) 0.0f) * INV_PI) : INV_PI)
                        * invLengthSquared.f[idx]);
                } else {
                    /* General path: build a full shading frame and evaluate
                       the BSDF (and, for non-diffuse VPLs, the VPL side) */
                    wi.x = -primRay4.d[0].f[idx];
                    wi.y = -primRay4.d[1].f[idx];
                    wi.z = -primRay4.d[2].f[idx];
                    its.p.x = secRay4.o[0].f[idx];
                    its.p.y = secRay4.o[1].f[idx];
                    its.p.z = secRay4.o[2].f[idx];
                    if (EXPECT_NOT_TAKEN(bsdf->getType() & BSDF::EAnisotropic)) {
                        its.shFrame.s = normalize(its.dpdu - its.shFrame.n
                            * dot(its.shFrame.n, its.dpdu));
                        its.shFrame.t = cross(its.shFrame.n, its.shFrame.s);
                    } else {
                        coordinateSystem(its.shFrame.n, its.shFrame.s, its.shFrame.t);
                    }
                    const Float ctLight = cosThetaLight.f[idx];
                    wi = normalize(wi);

                    its.wi = its.toLocal(wi);
                    wo = its.toLocal(wo);

                    if (!diffuseVPL) {
                        /* NOTE(review): vplWeight is recomputed per pixel
                           here, overwriting the value set before the loop. */
                        if (m_vpl.type == ESurfaceVPL) {
                            BSDFQueryRecord bRec(m_vpl.its, m_vpl.its.toLocal(wi));
                            bRec.quantity = EImportance;
                            vplWeight = m_vpl.its.shape->getBSDF()->eval(bRec) * m_vpl.P;
                        } else {
                            EmissionRecord eRec(m_vpl.luminaire,
                                ShapeSamplingRecord(m_vpl.its.p, m_vpl.its.shFrame.n), wi);
                            eRec.type = EmissionRecord::EPreview;
                            vplWeight = m_vpl.luminaire->evalDirection(eRec) * m_vpl.P;
                        }
                    }

                    if (EXPECT_TAKEN(ctLight >= 0)) {
                        direct[idx] = (bsdf->eval(BSDFQueryRecord(its, wo)) * vplWeight
                            * ((vplOnSurface ? std::max(ctLight, (Float) 0.0f) : 1.0f)
                            * invLengthSquared.f[idx]));
                    } else {
                        memset(&direct[idx], 0, sizeof(Spectrum));
                    }
                }
                ++numRays;
            }

            /* Shoot the secondary rays */
            const int secSignsX = _mm_movemask_ps(secRay4.d[0].ps);
            const int secSignsY = _mm_movemask_ps(secRay4.d[1].ps);
            const int secSignsZ = _mm_movemask_ps(secRay4.d[2].ps);
            const bool secCoherent = (secSignsX == 0 || secSignsX == 0xF)
                && (secSignsY == 0 || secSignsY == 0xF)
                && (secSignsZ == 0 || secSignsZ == 0xF);

            /* Shoot the secondary rays */
            secIts4.t = SSEConstants::p_inf;
            if (EXPECT_TAKEN(secCoherent)) {
                secRay4.signs[0][0] = secSignsX ? 1 : 0;
                secRay4.signs[1][0] = secSignsY ? 1 : 0;
                secRay4.signs[2][0] = secSignsZ ? 1 : 0;
                m_kdtree->rayIntersectPacket(secRay4, secItv4, secIts4, temp);
            } else {
                m_kdtree->rayIntersectPacketIncoherent(secRay4, secItv4, secIts4, temp);
            }

            /* An unoccluded shadow ray (t stayed at +inf) keeps the direct
               term; an occluded one keeps only the self-emission */
            for (int idx=0; idx<4; ++idx) {
                if (EXPECT_TAKEN(secIts4.t.f[idx] == std::numeric_limits<float>::infinity()))
                    block->setPixel(pos+pixelOffset[idx], direct[idx]+emitted[idx]);
                else
                    block->setPixel(pos+pixelOffset[idx], emitted[idx]);
            }
        }
    }
    block->setExtra(numRays);
#else
    Log(EError, "Coherent raytracing support was not compiled into this binary!");
#endif
}