/* SIMD selection: returns (~mask & a) | (mask & b), i.e. picks b in lanes where
   the mask (reached through resultPtr) has its bits set, and a elsewhere. */
__SIMD _SIMD_sel_ps(__SIMD a, __SIMD b, void** resultPtr) {
#ifdef USE_SSE
  __SIMD* result = (__SIMD*) (*resultPtr);
  return _mm_or_ps(_mm_andnot_ps(*result,a),_mm_and_ps(*result,b));
#elif defined USE_AVX
  // NOTE(review): unlike the SSE path this casts resultPtr itself rather than
  // *resultPtr — the two branches use different indirection levels; confirm
  // which one matches what callers actually pass.
  __SIMD* result = (__SIMD*) resultPtr;
  return _mm256_or_ps(_mm256_andnot_ps(*result,a),_mm256_and_ps(*result,b));
#elif defined USE_IBM
  // BUG FIX: the original called vec_sel(a,b,c) with an undeclared identifier
  // `c`, which cannot compile. Load the selection mask through resultPtr as
  // the SSE path does; vec_sel picks b where mask bits are set, matching the
  // andnot/and/or select above.
  __SIMD* result = (__SIMD*) (*resultPtr);
  return vec_sel(a,b,*result);
#endif
}
//this method is untested as of right now.... inline void set_union(Set<bitset> *A_in, Set<bitset> *B_in){ if(A_in->number_of_bytes > 0 && B_in->number_of_bytes > 0){ const uint64_t *a_index = (uint64_t*) A_in->data; const uint64_t *b_index = (uint64_t*) B_in->data; uint64_t* A = (uint64_t*)(A_in->data+sizeof(uint64_t)); uint64_t* B = (uint64_t*)(B_in->data+sizeof(uint64_t)); const size_t s_a = ((A_in->number_of_bytes-sizeof(uint64_t))/sizeof(uint64_t)); const size_t s_b = ((B_in->number_of_bytes-sizeof(uint64_t))/sizeof(uint64_t)); const bool a_big = a_index[0] > b_index[0]; assert(a_index[0] <= b_index[0]); const uint64_t start_index = (a_big) ? a_index[0] : b_index[0]; const uint64_t a_start_index = (a_big) ? 0:(b_index[0]-a_index[0]); const uint64_t b_start_index = (a_big) ? (a_index[0]-b_index[0]):0; const uint64_t end_index = ((a_index[0]+s_a) > (b_index[0]+s_b)) ? (b_index[0]+s_b):(a_index[0]+s_a); const uint64_t total_size = (start_index > end_index) ? 0:(end_index-start_index); //16 uint16_ts //8 ints //4 longs size_t i = 0; A += a_start_index; B += b_start_index; #if VECTORIZE == 1 for(; (i+3) < total_size; i += 4, A += 4, B += 4){ const __m256 a1 = _mm256_loadu_ps((const float*)A); const __m256 a2 = _mm256_loadu_ps((const float*)B); const __m256 r = _mm256_or_ps(a2, a1); _mm256_storeu_ps((float*)A, r); } #endif for(; i < total_size; i++, A++, B++){ *A |= *B; } } }
// Lane-wise bitwise OR of two 8-float vectors.
inline vec8 operator|(vec8 a, vec8 b)
{
    const __m256 combined = _mm256_or_ps(a, b);
    return combined;
}
// Returns the closest triangle in this leaf hit by `ray` at a distance closer
// than the incoming `t` (updating `t` to the hit distance), or nullptr if no
// triangle beats it. Tests NROFLANES triangles per iteration with an AVX
// (8-lane) Möller–Trumbore ray/triangle intersection; the scalar algorithm is
// quoted in the comments alongside each vector step.
Triangle* OctreeLeaf::Query(const Ray& ray, float& t) const
{
    // NOTE(review): numeric_limits<float>::min() is the smallest POSITIVE
    // float, not the most negative value — if tBox is meant to start at
    // "-infinity" this should be lowest(); harmless only if Intersects()
    // always overwrites tBox before the tBox > t comparison. Confirm.
    float tBox = std::numeric_limits<float>::min();
    // Early out: ray misses this leaf's bounding box, or the box is already
    // farther than the best hit found so far.
    if (!Intersects(ray, bb, tBox) || tBox > t)
        return nullptr;

    // Broadcast the ray origin/direction into all 8 lanes once, up front.
    const __m256 rayDirX = _mm256_set1_ps(ray.Direction.X);
    const __m256 rayDirY = _mm256_set1_ps(ray.Direction.Y);
    const __m256 rayDirZ = _mm256_set1_ps(ray.Direction.Z);
    const __m256 rayPosX = _mm256_set1_ps(ray.Origin.X);
    const __m256 rayPosY = _mm256_set1_ps(ray.Origin.Y);
    const __m256 rayPosZ = _mm256_set1_ps(ray.Origin.Z);

    // Anonymous union: candidate distances are written as __m256 vectors in
    // the loop and read back as scalars in the reduction below.
    union
    {
        float dists[MAXSIZE];
        __m256 distances[MAXSIZE / NROFLANES];
    };

    for (int i = 0; i < count; i++)
    {
        // Vector3F e1 = triangle.Vertices[1].Position - triangle.Vertices[0].Position;
        // Precomputed per-group triangle edges (SoA layout).
        const __m256 e1X = edge1X8[i];
        const __m256 e1Y = edge1Y8[i];
        const __m256 e1Z = edge1Z8[i];
        // Vector3F e2 = triangle.Vertices[2].Position - triangle.Vertices[0].Position;
        const __m256 e2X = edge2X8[i];
        const __m256 e2Y = edge2Y8[i];
        const __m256 e2Z = edge2Z8[i];

        // Vector3F p = ray.Direction.Cross(e2);
        const __m256 pX = _mm256_sub_ps(_mm256_mul_ps(rayDirY, e2Z), _mm256_mul_ps(rayDirZ, e2Y));
        const __m256 pY = _mm256_sub_ps(_mm256_mul_ps(rayDirZ, e2X), _mm256_mul_ps(rayDirX, e2Z));
        const __m256 pZ = _mm256_sub_ps(_mm256_mul_ps(rayDirX, e2Y), _mm256_mul_ps(rayDirY, e2X));

        // float det = e1.Dot(p);
        const __m256 det = _mm256_add_ps(_mm256_mul_ps(e1X, pX),
            _mm256_add_ps(_mm256_mul_ps(e1Y, pY), _mm256_mul_ps(e1Z, pZ)));

        // if (det > -EPSILON && det < EPSILON)
        //     return false;
        // Lane mask starts as |det| >= EPSILON (triangle not parallel to ray).
        __m256 mask = _mm256_or_ps(_mm256_cmp_ps(det, _mm256_set1_ps(-EPSILON), _CMP_LE_OS),
                                   _mm256_cmp_ps(det, _mm256_set1_ps(EPSILON), _CMP_GE_OS));

        // float invDet = 1 / det;
        // May produce inf in rejected (near-parallel) lanes; those lanes are
        // masked out before use.
        const __m256 invDet = _mm256_div_ps(_mm256_set1_ps(1.0f), det);

        // Vector3F r = ray.Origin - triangle.Vertices[0].Position;
        const __m256 rX = _mm256_sub_ps(rayPosX, vert0X8[i]);
        const __m256 rY = _mm256_sub_ps(rayPosY, vert0Y8[i]);
        const __m256 rZ = _mm256_sub_ps(rayPosZ, vert0Z8[i]);

        // float u = r.Dot(p) * invDet;
        const __m256 u = _mm256_mul_ps(invDet,
            _mm256_add_ps(_mm256_mul_ps(rX, pX),
                _mm256_add_ps(_mm256_mul_ps(rY, pY), _mm256_mul_ps(rZ, pZ))));

        // if (u < 0 || u > 1)
        //     return false;
        // Only u >= 0 is tested here; u <= 1 is implied by the combined
        // v >= 0 && u + v <= 1 test further down.
        mask = _mm256_and_ps(mask, _mm256_cmp_ps(u, _mm256_setzero_ps(), _CMP_GE_OS));

        // Vector3F q = r.Cross(e1);
        const __m256 qX = _mm256_sub_ps(_mm256_mul_ps(rY, e1Z), _mm256_mul_ps(rZ, e1Y));
        const __m256 qY = _mm256_sub_ps(_mm256_mul_ps(rZ, e1X), _mm256_mul_ps(rX, e1Z));
        const __m256 qZ = _mm256_sub_ps(_mm256_mul_ps(rX, e1Y), _mm256_mul_ps(rY, e1X));

        // float v = ray.Direction.Dot(q) * invDet;
        const __m256 v = _mm256_mul_ps(invDet,
            _mm256_add_ps(_mm256_mul_ps(rayDirX, qX),
                _mm256_add_ps(_mm256_mul_ps(rayDirY, qY), _mm256_mul_ps(rayDirZ, qZ))));

        // if (v < 0 || u + v > 1)
        //     return false;
        mask = _mm256_and_ps(mask,
            _mm256_and_ps(_mm256_cmp_ps(v, _mm256_setzero_ps(), _CMP_GE_OS),
                          _mm256_cmp_ps(_mm256_add_ps(u, v), _mm256_set1_ps(1.0f), _CMP_LE_OS)));

        // float tt = e2.Dot(q) * invDet;
        const __m256 tt = _mm256_mul_ps(invDet,
            _mm256_add_ps(_mm256_mul_ps(e2X, qX),
                _mm256_add_ps(_mm256_mul_ps(e2Y, qY), _mm256_mul_ps(e2Z, qZ))));

        // if (tt > EPSILON)
        // {
        //     t = tt;
        //     return true;
        // }
        //
        // return false;
        // Rejected lanes are forced to 0.0f, which fails the > EPSILON check
        // in the reduction below.
        distances[i] = _mm256_and_ps(tt, mask);
    }

    // Scalar reduction: pick the closest valid hit across all lanes.
    // NOTE(review): this reads count * NROFLANES lanes, so padding lanes in a
    // partially filled last group must have been set up (edge/vertex arrays
    // padded) so they cannot produce a bogus hit — confirm the fill code.
    Triangle* triangle = nullptr;
    for (int i = 0; i < count * NROFLANES; i++)
        if (dists[i] < t && dists[i] > EPSILON)
        {
            t = dists[i];
            triangle = triangles[i];
        }

    return triangle;
}
// Bitwise OR for 8-wide integer vectors; routed through the float domain
// (cast → or_ps → cast back), which is bit-exact since or is purely bitwise.
INLINE const avxi operator |( const avxi& a, const avxi& b ) {
  const __m256 lhs = _mm256_castsi256_ps(a);
  const __m256 rhs = _mm256_castsi256_ps(b);
  return _mm256_castps_si256(_mm256_or_ps(lhs, rhs));
}
/* natural logarithm computed for 8 simultaneous float
   return NaN for x <= 0

   Cephes-style algorithm: split x into mantissa m in [0.5, 1) and exponent e
   so that x = m * 2^e, then evaluate a degree-9 polynomial approximation of
   log(1 + f) and recombine with e * log(2). The _ps256_* / _pi32_256_* names
   are precomputed constant tables declared elsewhere in this file. */
v8sf log256_ps(v8sf x) {
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  // All-ones lanes where x <= 0; ORed into the result at the end to force NaN.
  //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);  /* cut off denormalized stuff */

  // Extract the biased exponent by shifting out the 23 mantissa bits.
  // can be done with AVX2
  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);

  /* keep only the fractional part */
  // Clear the exponent bits, then OR in 0.5's exponent: mantissa now in [0.5, 1).
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);

  // Unbias the exponent (subtract 127) and convert to float.
  // this is again another AVX2 instruction
  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  v8sf e = _mm256_cvtepi32_ps(imm0);

  e = _mm256_add_ps(e, one);

  /* part2:
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  // Branch-free version of the above: mask selects lanes with x < sqrt(1/2),
  // which get e -= 1 and an extra +x (the "x + x" case).
  //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
  x = _mm256_add_ps(x, tmp);

  v8sf z = _mm256_mul_ps(x,x);

  // Horner evaluation of the degree-9 minimax polynomial p0*x^8 + ... + p8.
  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);

  // Recombine: add e*q1 correction, subtract z/2, then add e*q2.
  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
  y = _mm256_add_ps(y, tmp);

  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  x = _mm256_or_ps(x, invalid_mask); // negative arg will be NAN
  return x;
}