Example #1
/*
  SIMD selection: per-bit blend -- returns b where the mask bits are set,
  a elsewhere; resultPtr points at a void* that addresses the mask vector
*/
__SIMD _SIMD_sel_ps(__SIMD a, __SIMD b, void** resultPtr)
{
#ifdef  USE_SSE
  __SIMD* result = (__SIMD*) (*resultPtr);
  return _mm_or_ps(_mm_andnot_ps(*result,a),_mm_and_ps(*result,b));
#elif defined USE_AVX
  __SIMD* result = (__SIMD*) (*resultPtr);
  return _mm256_or_ps(_mm256_andnot_ps(*result,a),_mm256_and_ps(*result,b));
#elif defined USE_IBM
  __SIMD* result = (__SIMD*) (*resultPtr);
  return vec_sel(a,b,*result);
#endif
}
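
This is a branchless select: each result bit comes from b where the mask bit is 1 and from a where it is 0. A minimal usage sketch for an AVX build (assuming __SIMD is a typedef for __m256, which the intrinsics above imply):

/* hypothetical usage: pick the per-lane minimum of a and b */
__m256 a    = _mm256_set1_ps(1.0f);
__m256 b    = _mm256_set1_ps(2.0f);
__m256 mask = _mm256_cmp_ps(b, a, _CMP_LT_OS); /* all-ones where b < a */
void*  maskPtr = &mask;
__m256 lo   = _SIMD_sel_ps(a, b, &maskPtr);    /* 1.0f in every lane here */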
Example #2
  //NOTE: this method is untested as of right now
 inline void set_union(Set<bitset> *A_in, Set<bitset> *B_in){
    if(A_in->number_of_bytes > 0 && B_in->number_of_bytes > 0){
      // layout: data[0] is the 64-bit start-word index, the bitset words follow
      const uint64_t *a_index = (uint64_t*) A_in->data;
      const uint64_t *b_index = (uint64_t*) B_in->data;

      uint64_t* A = (uint64_t*)(A_in->data+sizeof(uint64_t));
      uint64_t* B = (uint64_t*)(B_in->data+sizeof(uint64_t));
      const size_t s_a = ((A_in->number_of_bytes-sizeof(uint64_t))/sizeof(uint64_t));
      const size_t s_b = ((B_in->number_of_bytes-sizeof(uint64_t))/sizeof(uint64_t));

      const bool a_big = a_index[0] > b_index[0];

      // align both payloads to the larger start index and OR over the
      // overlapping word range only; both orderings are handled
      const uint64_t start_index = (a_big) ? a_index[0] : b_index[0];
      const uint64_t a_start_index = (a_big) ? 0:(b_index[0]-a_index[0]);
      const uint64_t b_start_index = (a_big) ? (a_index[0]-b_index[0]):0;

      const uint64_t end_index = ((a_index[0]+s_a) > (b_index[0]+s_b)) ? (b_index[0]+s_b):(a_index[0]+s_a);
      const uint64_t total_size = (start_index > end_index) ? 0:(end_index-start_index);

      // one 256-bit register holds 4 uint64_t words (= 8 ints = 16 uint16_ts)
      size_t i = 0;
      A += a_start_index;
      B += b_start_index;
      #if VECTORIZE == 1
      // OR four 64-bit words per iteration (float-domain ops are bitwise-safe here)
      for(; (i+3) < total_size; i += 4, A += 4, B += 4){
        const __m256 va = _mm256_loadu_ps((const float*)A);
        const __m256 vb = _mm256_loadu_ps((const float*)B);
        _mm256_storeu_ps((float*)A, _mm256_or_ps(vb, va));
      }
      #endif

      // scalar tail for the words the vector loop did not cover
      for(; i < total_size; i++, A++, B++){
        *A |= *B;
      }
    }
  }
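
The vectorized loop goes through the float domain because plain AVX has no 256-bit integer bitwise ops. On AVX2 hardware the same inner loop can stay in the integer domain; a sketch (assuming <immintrin.h> and an -mavx2 build):

      for(; (i+3) < total_size; i += 4, A += 4, B += 4){
        const __m256i va = _mm256_loadu_si256((const __m256i*)A);
        const __m256i vb = _mm256_loadu_si256((const __m256i*)B);
        _mm256_storeu_si256((__m256i*)A, _mm256_or_si256(va, vb));
      }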
Example #3
inline vec8 operator|(vec8 a, vec8 b) { return _mm256_or_ps(a, b); }
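
This compiles only if vec8 converts implicitly both to and from __m256. A minimal wrapper that satisfies the operator (a sketch; the project's real vec8 definition is not shown here):

struct vec8 {
  __m256 v;
  vec8(__m256 x) : v(x) {}               // implicit from the raw register
  operator __m256() const { return v; }  // implicit back to the raw register
};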
Example #4
Triangle* OctreeLeaf::Query(const Ray& ray, float& t) const
{
	float tBox = std::numeric_limits<float>::min(); // note: min() is the smallest positive float; lowest() is the most negative
	if (!Intersects(ray, bb, tBox) || tBox > t)
		return nullptr;

	const __m256 rayDirX = _mm256_set1_ps(ray.Direction.X);
	const __m256 rayDirY = _mm256_set1_ps(ray.Direction.Y);
	const __m256 rayDirZ = _mm256_set1_ps(ray.Direction.Z);

	const __m256 rayPosX = _mm256_set1_ps(ray.Origin.X);
	const __m256 rayPosY = _mm256_set1_ps(ray.Origin.Y);
	const __m256 rayPosZ = _mm256_set1_ps(ray.Origin.Z);

	union { float dists[MAXSIZE]; __m256 distances[MAXSIZE / NROFLANES]; }; // same storage viewed per-float and per-8-lane vector

	for (int i = 0; i < count; i++)
	{
		// Vector3F e1 = triangle.Vertices[1].Position - triangle.Vertices[0].Position;
		const __m256 e1X = edge1X8[i];
		const __m256 e1Y = edge1Y8[i];
		const __m256 e1Z = edge1Z8[i];

		// Vector3F e2 = triangle.Vertices[2].Position - triangle.Vertices[0].Position;
		const __m256 e2X = edge2X8[i];
		const __m256 e2Y = edge2Y8[i];
		const __m256 e2Z = edge2Z8[i];

		// Vector3F p = ray.Direction.Cross(e2);
		const __m256 pX = _mm256_sub_ps(_mm256_mul_ps(rayDirY, e2Z), _mm256_mul_ps(rayDirZ, e2Y));
		const __m256 pY = _mm256_sub_ps(_mm256_mul_ps(rayDirZ, e2X), _mm256_mul_ps(rayDirX, e2Z));
		const __m256 pZ = _mm256_sub_ps(_mm256_mul_ps(rayDirX, e2Y), _mm256_mul_ps(rayDirY, e2X));

		// float det = e1.Dot(p);
		const __m256 det = _mm256_add_ps(_mm256_mul_ps(e1X, pX), _mm256_add_ps(_mm256_mul_ps(e1Y, pY), _mm256_mul_ps(e1Z, pZ)));

		// if (det > -EPSILON && det < EPSILON)
		//     return false;
		__m256 mask = _mm256_or_ps(_mm256_cmp_ps(det, _mm256_set1_ps(-EPSILON), _CMP_LE_OS), _mm256_cmp_ps(det, _mm256_set1_ps(EPSILON), _CMP_GE_OS));

		// float invDet = 1 / det;
		const __m256 invDet = _mm256_div_ps(_mm256_set1_ps(1.0f), det);

		// Vector3F r = ray.Origin - triangle.Vertices[0].Position;
		const __m256 rX = _mm256_sub_ps(rayPosX, vert0X8[i]);
		const __m256 rY = _mm256_sub_ps(rayPosY, vert0Y8[i]);
		const __m256 rZ = _mm256_sub_ps(rayPosZ, vert0Z8[i]);

		// float u = r.Dot(p) * invDet;
		const __m256 u = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rX, pX), _mm256_add_ps(_mm256_mul_ps(rY, pY), _mm256_mul_ps(rZ, pZ))));

		// if (u < 0 || u > 1)
		//	   return false;
		// (only u >= 0 is tested here; u <= 1 follows from the v >= 0 && u + v <= 1 test below)
		mask = _mm256_and_ps(mask, _mm256_cmp_ps(u, _mm256_setzero_ps(), _CMP_GE_OS));

		// Vector3F q = r.Cross(e1);
		const __m256 qX = _mm256_sub_ps(_mm256_mul_ps(rY, e1Z), _mm256_mul_ps(rZ, e1Y));
		const __m256 qY = _mm256_sub_ps(_mm256_mul_ps(rZ, e1X), _mm256_mul_ps(rX, e1Z));
		const __m256 qZ = _mm256_sub_ps(_mm256_mul_ps(rX, e1Y), _mm256_mul_ps(rY, e1X));

		// float v = ray.Direction.Dot(q) * invDet;
		const __m256 v = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(rayDirX, qX), _mm256_add_ps(_mm256_mul_ps(rayDirY, qY), _mm256_mul_ps(rayDirZ, qZ))));

		// if (v < 0 || u + v > 1)
		//     return false;
		mask = _mm256_and_ps(mask, _mm256_and_ps(_mm256_cmp_ps(v, _mm256_setzero_ps(), _CMP_GE_OS), _mm256_cmp_ps(_mm256_add_ps(u, v), _mm256_set1_ps(1.0f), _CMP_LE_OS)));

		// float tt = e2.Dot(q) * invDet;
		const __m256 tt = _mm256_mul_ps(invDet, _mm256_add_ps(_mm256_mul_ps(e2X, qX), _mm256_add_ps(_mm256_mul_ps(e2Y, qY), _mm256_mul_ps(e2Z, qZ))));

		// if (tt > EPSILON)
		// {
		//     t = tt;
		//     return true;
		// }
		//
		// return false;
		// zero out rejected lanes; 0.0f then fails the dists[i] > EPSILON test below
		distances[i] = _mm256_and_ps(tt, mask);
	}

	Triangle* triangle = nullptr;
	for (int i = 0; i < count * NROFLANES; i++)
		if (dists[i] < t && dists[i] > EPSILON)
		{
			t = dists[i];
			triangle = triangles[i];
		}

	return triangle;
}
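
The _mm256_and_ps(tt, mask) at the end of the lane loop zeroes every rejected lane, so the scalar pass skips them because 0.0f fails the dists[i] > EPSILON test. An equivalent alternative (a sketch, not the code above) writes an explicit sentinel instead, after which the scalar pass would only need dists[i] < t:

	// hypothetical variant: mark rejected lanes with +infinity
	const __m256 inf = _mm256_set1_ps(std::numeric_limits<float>::infinity());
	distances[i] = _mm256_blendv_ps(inf, tt, mask);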
Example #5
 INLINE const avxi operator  |( const avxi& a, const avxi& b ) { return _mm256_castps_si256(_mm256_or_ps (_mm256_castsi256_ps(a), _mm256_castsi256_ps(b))); }
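
The cast round-trip exists because plain AVX only provides 256-bit bitwise ops in the float domain. With AVX2 the operator collapses to a single integer-domain instruction (a sketch, assuming avxi converts to and from __m256i as the casts above imply):

 INLINE const avxi operator  |( const avxi& a, const avxi& b ) { return _mm256_or_si256(a, b); }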
Example #6
/* natural logarithm computed for 8 simultaneous floats;
   returns NaN for x <= 0
*/
v8sf log256_ps(v8sf x) {
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  //v8sf invalid_mask = _mm256_cmple_ps(x, _mm256_setzero_ps());
  v8sf invalid_mask = _mm256_cmp_ps(x, _mm256_setzero_ps(), _CMP_LE_OS);

  x = _mm256_max_ps(x, *(v8sf*)_ps256_min_norm_pos);  /* cut off denormalized stuff */

  // extract the IEEE exponent field; this 256-bit integer shift requires AVX2
  imm0 = _mm256_srli_epi32(_mm256_castps_si256(x), 23);

  /* keep only the fractional part */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_mant_mask);
  x = _mm256_or_ps(x, *(v8sf*)_ps256_0p5);

  // another AVX2-only integer op: subtract the exponent bias (0x7f)
  imm0 = _mm256_sub_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  v8sf e = _mm256_cvtepi32_ps(imm0);

  e = _mm256_add_ps(e, one);

  /* part2: 
     if( x < SQRTHF ) {
       e -= 1;
       x = x + x - 1.0;
     } else { x = x - 1.0; }
  */
  //v8sf mask = _mm256_cmplt_ps(x, *(v8sf*)_ps256_cephes_SQRTHF);
  v8sf mask = _mm256_cmp_ps(x, *(v8sf*)_ps256_cephes_SQRTHF, _CMP_LT_OS);
  v8sf tmp = _mm256_and_ps(x, mask);
  x = _mm256_sub_ps(x, one);
  e = _mm256_sub_ps(e, _mm256_and_ps(one, mask));
  x = _mm256_add_ps(x, tmp);

  v8sf z = _mm256_mul_ps(x,x);

  /* Horner evaluation of the degree-8 minimax polynomial */
  v8sf y = *(v8sf*)_ps256_cephes_log_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p5);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p6);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p7);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_log_p8);
  y = _mm256_mul_ps(y, x);

  y = _mm256_mul_ps(y, z);
  
  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q1);
  y = _mm256_add_ps(y, tmp);


  tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);

  tmp = _mm256_mul_ps(e, *(v8sf*)_ps256_cephes_log_q2);
  x = _mm256_add_ps(x, y);
  x = _mm256_add_ps(x, tmp);
  x = _mm256_or_ps(x, invalid_mask); // lanes with x <= 0 become NaN (all-ones bits)
  return x;
}
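
A short usage sketch (assuming v8sf is a typedef for __m256, as in the avx_mathfun-style headers this code resembles):

v8sf x = _mm256_set_ps(100.0f, 10.0f, 2.718282f, 2.0f, 1.0f, 0.5f, 0.0f, -1.0f);
float out[8];
_mm256_storeu_ps(out, log256_ps(x));
/* out[0] and out[1] (the -1.0f and 0.0f lanes) are NaN; the rest approximate logf */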