Exemple #1
// Set up both bump functions from the specifiers b0 and b1: computes the
// per-lane cubic coefficients c [0..3] and the region data S0, T0, U0, V0.
// (c, S0, T0, U0 and V0 are not declared in this block; presumably they are
// bumps_t members - confirm against the class definition.)
void bumps_t::initialize (
  const bump_specifier_t & b0, const bump_specifier_t & b1)
  // Precompute the coefficients of four cubic polynomials in t, giving
  // the two smoothstep regions of each of the two bump functions.
  // Lane layout after the shuffles (per the lane comments below): lanes 0-1
  // come from b0 and lanes 2-3 from b1; within each pair the first lane is
  // the t0..t1 region and the second lane is the t2..t3 region.
  // The loads assume t0..t3 (and v0, v1) are contiguous floats - TODO confirm.
  v4f b0t = load4f (& b0.t0); // b0.t0 b0.t1 b0.t2 b0.t3
  v4f b1t = load4f (& b1.t0); // b1.t0 b1.t1 b1.t2 b1.t3
  v4f b0v = _mm_movelh_ps (load4f (& b0.v0), _mm_setzero_ps ()); // b0.v0 b0.v1   0   0
  v4f b1v = _mm_movelh_ps (load4f (& b1.v0), _mm_setzero_ps ()); // b1.v0 b1.v1   0   0
  // S = start and T = end of each region.
  v4f S = SHUFPS (b0t, b1t, (0, 2, 0, 2)); // b0.t0 b0.t2 b1.t0 b1.t2
  v4f T = SHUFPS (b0t, b1t, (1, 3, 1, 3)); // b0.t1 b0.t3 b1.t1 b1.t3
  // U = value taken before each region, V = value taken after it.
  v4f U = SHUFPS (b0v, b1v, (0, 2, 0, 2)); // b0.v0   0   b1.v0   0
  v4f V1 = SHUFPS (b0v, b1v, (1, 0, 1, 0)); // b0.v1 b0.v0 b1.v1 b1.v0
  v4f V2 = SHUFPS (b0v, b1v, (2, 1, 2, 1)); //   0   b0.v1   0   b1.v1
  v4f V = V1 - V2;                          // b0.v1 b0.v0-b0.v1 b1.v1 b1.v0-b1.v1
  v4f d = T - S; // width of each region
  v4f a = T + S;
  // m scales the unit smoothstep (whose coefficients carry a 1/d^3 factor)
  // so that each region rises by V - U. No guard against d == 0 is present.
  v4f m = (V - U) / (d * d * d);
  // c [i] holds the coefficient of t^i for all four polynomials.
  store4f (c [0], U + m * S * S * (a + d + d));
  store4f (c [1], _mm_set1_ps (-6.0f) * m * S * T);
  store4f (c [2], _mm_set1_ps (+3.0f) * m * a);
  store4f (c [3], _mm_set1_ps (-2.0f) * m);
  // Keep the region bounds and end values for use outside this function.
  store4f (S0, S);
  store4f (T0, T);
  store4f (U0, U);
  store4f (V0, V);
Exemple #2
// Evaluate the step function at t; the same value is returned in every lane.
// Reads c (four cubic coefficients, c [i] multiplying t^i) and T (region
// boundaries laid out as t0 t1 t1 inf, per the lane comment below).
v4f step_t::operator () (float t) const
  // Evaluate the polynomial f by Estrin's method. Return
  //   (0 0 0 0)  if t < t0,
  //   (f f f f)  if t0 <= t < t1,
  //   (1 1 1 1)  if t >= t1.
  v4f c4 = load4f (c);
  v4f one = { 1.0f, 1.0f, 1.0f, 1.0f };
  v4f tttt = _mm_set1_ps (t);           // t t t t
  v4f tt = _mm_unpacklo_ps (one, tttt); // 1 t 1 t
  v4f f0 = c4 * tt;                     // c0 c1*t c2 c3*t
  // hadd gives (c0+c1*t, c2+c3*t, ...); multiplying twice by tt scales the
  // second lane of each pair by t^2.
  v4f ha = _mm_hadd_ps (f0, f0) * tt * tt;
  v4f f = _mm_hadd_ps (ha, ha);         // f f f f
  v4f f1 = _mm_unpacklo_ps (f, one);    // f 1 f 1
  v4f tx = load4f (T);                  // t0  t1 t1 inf
  v4f lo = _mm_movelh_ps (tx, tx);      // t0  t1 t0  t1
  v4f hi = _mm_movehl_ps (tx, tx);      // t1 inf t1 inf
  // Even lanes select f when t0 <= t < t1; odd lanes select 1 when t >= t1.
  v4f sel = _mm_and_ps (_mm_cmpge_ps (tttt, lo), _mm_cmplt_ps (tttt, hi));
  v4f val = _mm_and_ps (sel, f1);       // f? 1? f? 1?
  // At most one lane of each pair is non-zero, so the pairwise sum picks it.
  return _mm_hadd_ps (val, val);
Exemple #3
// Returns { f, g, f, g }, where f = bump0 (t), g = bump1 (t).
// Reads the precomputed c [0..3], S0, T0, U0 and V0.
v4f bumps_t::operator () (float t) const
  // Compute all four polynomials by Estrin's method, and mask and combine the
  // values according to the region of the graph to which t belongs.
  v4f s = _mm_set1_ps (t);
  v4f S = load4f (S0); // region starts
  v4f T = load4f (T0); // region ends
  v4f U = load4f (U0); // values used where t is before a region
  v4f V = load4f (V0); // values used where t is at or past a region's end
  // f = (c0 + c1*t) + (c2 + c3*t) * t^2, evaluated in all four lanes at once.
  v4f f01 = load4f (c [0]) + load4f (c [1]) * s;
  v4f f12 = load4f (c [2]) + load4f (c [3]) * s;
  v4f f = f01 + f12 * s * s;
  v4f ltS = _mm_cmplt_ps (s, S);
  v4f geT = _mm_cmpge_ps (s, T);
  // Per lane exactly one of x1, x2, x3 is kept: the polynomial inside the
  // region (S <= t < T), U before it, V after it.
  v4f x1 = _mm_andnot_ps (_mm_or_ps (ltS, geT), f);
  v4f x2 = _mm_and_ps (ltS, U);
  v4f x3 = _mm_and_ps (geT, V);
  v4f val = _mm_or_ps (_mm_or_ps (x1, x2), x3);
  // Pairwise sums add the two region lanes belonging to each bump function.
  return _mm_hadd_ps (val, val);
Exemple #4
// Prepare the step function that rises from 0 at t0 to 1 at t1.
// Writes c [0..3] (c [i] is the coefficient of t^i) and T [0..3].
// NOTE(review): d = t1 - t0 is cubed and used as a divisor; t0 == t1 is not
// guarded against here.
void step_t::initialize (float t0, float t1)
  // Precompute the coefficients c of the cubic polynomial f
  // such that f(t0)=0, f(t1)=1, f'(t0)=0 and f'(t1)=0.
  float d = t1 - t0;
  float a = t1 + t0;
  // Unscaled coefficients; note a + d + d = 3 * t1 - t0.
  c [0] = t0 * t0 * (a + d + d);
  c [1] = -6 * t0 * t1;
  c [2] = 3 * a;
  c [3] = -2;

  // Divide c [] by d^3.
  v4f dt = _mm_set1_ps (d);
  store4f (c, load4f (c) / (dt * dt * dt));

  // Region boundaries laid out as t0 t1 t1 inf.
  T [0] = t0;
  T [1] = t1;
  T [2] = t1;
  T [3] = std::numeric_limits <float>::infinity ();
    /*! Single-ray BVH4 traversal using a stack of (node, distance) items,
     *  descending into the closer hit child first and pushing the others.
     *  Reads ray.org, ray.dir, ray.tnear and ray.tfar.
     *  NOTE(review): this listing contains no brace-only lines and no
     *  #else/#endif lines, and some statements are not visible (for example
     *  the statement controlled by the "too far" test). Declarations that
     *  appear twice are presumably alternative preprocessor branches.
     *  Confirm every note below against the complete file. */
    void BVH4Intersector1<PrimitiveIntersector>::intersect(const BVH4* bvh, Ray& ray)
      /*! stack state */
      StackItemInt32<NodeRef> stack[stackSize];  //!< stack of nodes 
      StackItemInt32<NodeRef>* stackPtr = stack+1;        //!< current stack pointer
      StackItemInt32<NodeRef>* stackEnd = stack+stackSize;
      /*! the root is the initial stack entry; stackPtr already points one past it */
      stack[0].ptr = bvh->root;
      stack[0].dist = neg_inf;
      /*! offsets to select the side that becomes the lower or upper bound */
      /* byte offsets into the node, in units of sizeof(ssef): slots 0/1 for x, 2/3 for y, 4/5 for z */
      const size_t nearX = ray.dir.x >= 0.0f ? 0*sizeof(ssef) : 1*sizeof(ssef);
      const size_t nearY = ray.dir.y >= 0.0f ? 2*sizeof(ssef) : 3*sizeof(ssef);
      const size_t nearZ = ray.dir.z >= 0.0f ? 4*sizeof(ssef) : 5*sizeof(ssef);
#if 0 // FIXME: why is this slower
      /*! load the ray */
      Vec3fa ray_org = ray.org;
      Vec3fa ray_dir = ray.dir;
      ssef ray_near  = max(ray.tnear,FLT_MIN); // we do not support negative tnear values in this kernel due to integer optimizations
      ssef ray_far   = ray.tfar; 
#if defined(__FIX_RAYS__)
      const float float_range = 0.1f*FLT_MAX;
      ray_org = clamp(ray_org,Vec3fa(-float_range),Vec3fa(+float_range));
      ray_dir = clamp(ray_dir,Vec3fa(-float_range),Vec3fa(+float_range));
      ray_far = min(ray_far,float(inf)); 
      const Vec3fa ray_rdir = rcp_safe(ray_dir);
      const sse3f org(ray_org), dir(ray_dir);
      const sse3f norg(-ray_org), rdir(ray_rdir), org_rdir(ray_org*ray_rdir);
      /*! load the ray into SIMD registers */
      /* norg = -origin, rdir = rcp_safe(direction), org_rdir = origin*rdir, each broadcast per component */
      const sse3f norg(-ray.org.x,-ray.org.y,-ray.org.z);
      const Vec3fa ray_rdir = rcp_safe(ray.dir);
      const sse3f rdir(ray_rdir.x,ray_rdir.y,ray_rdir.z);
      const Vec3fa ray_org_rdir = ray.org*ray_rdir;
      const sse3f org_rdir(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
      const ssef  ray_near(ray.tnear);
      ssef ray_far(ray.tfar);

      /* pop loop */
      while (true) pop:
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        /* NOTE(review): no decrement of stackPtr is visible between the emptiness test and this dereference */
        NodeRef cur = NodeRef(stackPtr->ptr);
        /*! if popped node is too far, pop next one */
        /* the stored distance holds float bits and is read back as a float for this comparison */
        /* NOTE(review): the statement controlled by this test is not visible in this listing */
        if (unlikely(*(float*)&stackPtr->dist > ray.tfar))
        /* downtraversal loop */
        while (true)
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          /*! single ray intersection with 4 boxes */
          const Node* node = cur.node();
          /* XOR with 16 switches to the other slot of the same axis (assumes sizeof(ssef) == 16 - TODO confirm) */
          const size_t farX  = nearX ^ 16, farY  = nearY ^ 16, farZ  = nearZ ^ 16;
          /* distances to the near/far box planes of all four children; msub presumably computes a*b-c - TODO confirm */
#if defined (__AVX2__)
          const ssef tNearX = msub(load4f((const char*)node+nearX), rdir.x, org_rdir.x);
          const ssef tNearY = msub(load4f((const char*)node+nearY), rdir.y, org_rdir.y);
          const ssef tNearZ = msub(load4f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const ssef tFarX  = msub(load4f((const char*)node+farX ), rdir.x, org_rdir.x);
          const ssef tFarY  = msub(load4f((const char*)node+farY ), rdir.y, org_rdir.y);
          const ssef tFarZ  = msub(load4f((const char*)node+farZ ), rdir.z, org_rdir.z);
          /* same quantities computed as (bound - org) * rdir */
          const ssef tNearX = (norg.x + load4f((const char*)node+nearX)) * rdir.x;
          const ssef tNearY = (norg.y + load4f((const char*)node+nearY)) * rdir.y;
          const ssef tNearZ = (norg.z + load4f((const char*)node+nearZ)) * rdir.z;
          const ssef tFarX  = (norg.x + load4f((const char*)node+farX )) * rdir.x;
          const ssef tFarY  = (norg.y + load4f((const char*)node+farY )) * rdir.y;
          const ssef tFarZ  = (norg.z + load4f((const char*)node+farZ )) * rdir.z;

          /* mask gets one bit per child whose interval [tNear,tFar] is non-empty */
#if defined(__SSE4_1__)
          /* this variant compares after cast(); the mask of tNear > tFar is inverted with ^0xf */
          const ssef tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray_near));
          const ssef tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray_far ));
          const sseb vmask = cast(tNear) > cast(tFar);
          size_t mask = movemask(vmask)^0xf;
          /* this variant tests tNear <= tFar directly */
          const ssef tNear = max(tNearX,tNearY,tNearZ,ray_near);
          const ssef tFar  = min(tFarX ,tFarY ,tFarZ ,ray_far);
          const sseb vmask = tNear <= tFar;
          size_t mask = movemask(vmask);
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          /*! one child is hit, continue with that child */
          /* __bscf presumably returns the lowest set bit index and clears it in mask - TODO confirm */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r);
            assert(cur != BVH4::emptyNode);
          /*! two children are hit, push far child, and continue with closer child */
          /* d0/d1 are the raw bits of the tNear lanes, compared below as unsigned integers */
          NodeRef c0 = node->child(r); const unsigned int d0 = ((unsigned int*)&tNear)[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); const unsigned int d1 = ((unsigned int*)&tNear)[r];
          assert(c0 != BVH4::emptyNode);
          assert(c1 != BVH4::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd); 
            if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; continue; }
            else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; continue; }
          /*! Here starts the slow path for 3 or 4 hit children. We push
           *  all nodes onto the stack to sort them there. */
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
          /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
          /* NOTE(review): the sort mentioned above is not visible in this listing */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          NodeRef c = node->child(r); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          assert(c != BVH4::emptyNode);
          if (likely(mask == 0)) {
            /* continue with the item on top of the stack */
            cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
          /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          c = node->child(r); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          assert(c != BVH4::emptyNode);
          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
        /*! this is a leaf node */
        size_t num; Primitive* prim = (Primitive*) cur.leaf(num);
        /* NOTE(review): the primitive intersection call is not visible here; ray_far is re-read from ray.tfar afterwards */
        ray_far = ray.tfar;
    /*! Single-ray BVH4 occlusion traversal. The stack holds node references
     *  only (no distances). When PrimitiveIntersector::occluded reports a hit
     *  for a leaf, ray.geomID is set to 0.
     *  NOTE(review): this listing contains no brace-only lines and no
     *  #else/#endif lines, and it ends right after the geomID assignment.
     *  Declarations that appear twice are presumably alternative preprocessor
     *  branches. Confirm against the complete file. */
    void BVH4Intersector1<PrimitiveIntersector>::occluded(const BVH4* bvh, Ray& ray)
      /*! stack state */
      NodeRef stack[stackSize];  //!< stack of nodes that still need to get traversed
      NodeRef* stackPtr = stack+1;        //!< current stack pointer
      NodeRef* stackEnd = stack+stackSize;
      /*! the root is the initial stack entry; stackPtr already points one past it */
      stack[0] = bvh->root;
      /*! offsets to select the side that becomes the lower or upper bound */
      /* byte offsets into the node, in units of sizeof(ssef): slots 0/1 for x, 2/3 for y, 4/5 for z */
      const size_t nearX = ray.dir.x >= 0 ? 0*sizeof(ssef) : 1*sizeof(ssef);
      const size_t nearY = ray.dir.y >= 0 ? 2*sizeof(ssef) : 3*sizeof(ssef);
      const size_t nearZ = ray.dir.z >= 0 ? 4*sizeof(ssef) : 5*sizeof(ssef);
#if 0 // FIXME: why is this slower
      /*! load the ray */
      Vec3fa ray_org = ray.org;
      Vec3fa ray_dir = ray.dir;
      ssef ray_near  = max(ray.tnear,FLT_MIN); // we do not support negative tnear values in this kernel due to integer optimizations
      ssef ray_far   = ray.tfar; 
#if defined(__FIX_RAYS__)
      const float float_range = 0.1f*FLT_MAX;
      ray_org = clamp(ray_org,Vec3fa(-float_range),Vec3fa(+float_range));
      ray_dir = clamp(ray_dir,Vec3fa(-float_range),Vec3fa(+float_range));
      ray_far = min(ray_far,float(inf)); 
      const Vec3fa ray_rdir = rcp_safe(ray_dir);
      const sse3f org(ray_org), dir(ray_dir);
      const sse3f norg(-ray_org), rdir(ray_rdir), org_rdir(ray_org*ray_rdir);
      /*! load the ray into SIMD registers */
      /* norg = -origin, rdir = rcp_safe(direction), org_rdir = origin*rdir, each broadcast per component */
      const sse3f norg(-ray.org.x,-ray.org.y,-ray.org.z);
      const Vec3fa ray_rdir = rcp_safe(ray.dir);
      const sse3f rdir(ray_rdir.x,ray_rdir.y,ray_rdir.z);
      const Vec3fa ray_org_rdir = ray.org*ray_rdir;
      const sse3f org_rdir(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
      const ssef  ray_near(ray.tnear);
      ssef ray_far(ray.tfar);
      /* pop loop */
      while (true) pop:
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        /* NOTE(review): no decrement of stackPtr is visible between the emptiness test and this dereference */
        NodeRef cur = (NodeRef) *stackPtr;
        /* downtraversal loop */
        while (true)
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          /*! single ray intersection with 4 boxes */
          const Node* node = cur.node();
          /* XOR with 16 switches to the other slot of the same axis (assumes sizeof(ssef) == 16 - TODO confirm) */
          const size_t farX  = nearX ^ 16, farY  = nearY ^ 16, farZ  = nearZ ^ 16;
          /* distances to the near/far box planes of all four children; msub presumably computes a*b-c - TODO confirm */
#if defined (__AVX2__)
          const ssef tNearX = msub(load4f((const char*)node+nearX), rdir.x, org_rdir.x);
          const ssef tNearY = msub(load4f((const char*)node+nearY), rdir.y, org_rdir.y);
          const ssef tNearZ = msub(load4f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const ssef tFarX  = msub(load4f((const char*)node+farX ), rdir.x, org_rdir.x);
          const ssef tFarY  = msub(load4f((const char*)node+farY ), rdir.y, org_rdir.y);
          const ssef tFarZ  = msub(load4f((const char*)node+farZ ), rdir.z, org_rdir.z);
          /* same quantities computed as (bound - org) * rdir */
          const ssef tNearX = (norg.x + load4f((const char*)node+nearX)) * rdir.x;
          const ssef tNearY = (norg.y + load4f((const char*)node+nearY)) * rdir.y;
          const ssef tNearZ = (norg.z + load4f((const char*)node+nearZ)) * rdir.z;
          const ssef tFarX  = (norg.x + load4f((const char*)node+farX )) * rdir.x;
          const ssef tFarY  = (norg.y + load4f((const char*)node+farY )) * rdir.y;
          const ssef tFarZ  = (norg.z + load4f((const char*)node+farZ )) * rdir.z;
          /* mask gets one bit per child whose interval [tNear,tFar] is non-empty */
#if defined(__SSE4_1__)
          /* this variant compares after cast(); the mask of tNear > tFar is inverted with ^0xf */
          const ssef tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray_near));
          const ssef tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray_far ));
          const sseb vmask = cast(tNear) > cast(tFar);
          size_t mask = movemask(vmask)^0xf;
          /* this variant tests tNear <= tFar directly */
          const ssef tNear = max(tNearX,tNearY,tNearZ,ray_near);
          const ssef tFar  = min(tFarX ,tFarY ,tFarZ ,ray_far);
          const sseb vmask = tNear <= tFar;
          size_t mask = movemask(vmask);
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          /*! one child is hit, continue with that child */
          /* __bscf presumably returns the lowest set bit index and clears it in mask - TODO confirm */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r);
            assert(cur != BVH4::emptyNode);
          /*! two children are hit, push far child, and continue with closer child */
          /* d0/d1 are the raw bits of the tNear lanes, compared below as unsigned integers */
          NodeRef c0 = node->child(r); const unsigned int d0 = ((unsigned int*)&tNear)[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); const unsigned int d1 = ((unsigned int*)&tNear)[r];
          assert(c0 != BVH4::emptyNode);
          assert(c1 != BVH4::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd);
            if (d0 < d1) { *stackPtr = c1; stackPtr++; cur = c0; continue; }
            else         { *stackPtr = c0; stackPtr++; cur = c1; continue; }
          /* more than two children hit: push the first two without ordering them */
          assert(stackPtr < stackEnd);
          *stackPtr = c0; stackPtr++;
          assert(stackPtr < stackEnd);
          *stackPtr = c1; stackPtr++;
          /*! three children are hit */
          r = __bscf(mask);
          cur = node->child(r); 
          assert(cur != BVH4::emptyNode);
          if (likely(mask == 0)) continue;
          assert(stackPtr < stackEnd);
          *stackPtr = cur; stackPtr++;
          /*! four children are hit */
          /* with all four mask bits set, the one remaining child is index 3 */
          cur = node->child(3);
          assert(cur != BVH4::emptyNode);
        /*! this is a leaf node */
        size_t num; Primitive* prim = (Primitive*) cur.leaf(num);
        /* on an occluding primitive, mark the ray by setting geomID to 0 */
        if (PrimitiveIntersector::occluded(ray,prim,num,bvh->geometry)) {
          ray.geomID = 0;
    /*! Occlusion traversal for the single ray in lane k of a Ray4 packet,
     *  starting at node root. Lane k of ray_org, ray_rdir, ray_tnear and
     *  ray_tfar is broadcast to all lanes; ray_dir is only used for the
     *  near/far side selection. Returns true (after setting ray.geomID[k]
     *  to 0) when PrimitiveIntersector4::occluded reports a hit, and false
     *  once the stack is exhausted.
     *  NOTE(review): this listing contains no brace-only lines and no
     *  #else/#endif lines; declarations that appear twice are presumably
     *  alternative preprocessor branches. Confirm against the complete file. */
    __forceinline bool BVH4Intersector4Hybrid<PrimitiveIntersector4>::occluded1(const BVH4* bvh, NodeRef root, size_t k, Ray4& ray, 
                                                                                const sse3f& ray_org, const sse3f& ray_dir, const sse3f& ray_rdir, 
                                                                                const ssef& ray_tnear, const ssef& ray_tfar)
      /*! stack state */
      NodeRef stack[stackSizeSingle];  //!< stack of nodes that still need to get traversed
      NodeRef* stackPtr = stack+1;        //!< current stack pointer
      NodeRef* stackEnd = stack+stackSizeSingle;
      /*! the root is the initial stack entry; stackPtr already points one past it */
      stack[0]  = root;
      /*! offsets to select the side that becomes the lower or upper bound */
      /* byte offsets into the node, in units of sizeof(ssef): slots 0/1 for x, 2/3 for y, 4/5 for z */
      const size_t nearX = ray_dir.x[k] >= 0.0f ? 0*sizeof(ssef) : 1*sizeof(ssef);
      const size_t nearY = ray_dir.y[k] >= 0.0f ? 2*sizeof(ssef) : 3*sizeof(ssef);
      const size_t nearZ = ray_dir.z[k] >= 0.0f ? 4*sizeof(ssef) : 5*sizeof(ssef);
      /*! load the ray into SIMD registers */
      /* lane k of the packet is broadcast into every lane */
      const sse3f org (ray_org .x[k],ray_org .y[k],ray_org .z[k]);
      const sse3f rdir(ray_rdir.x[k],ray_rdir.y[k],ray_rdir.z[k]);
      const sse3f norg = -org, org_rdir(org*rdir);
      const ssef rayNear(ray_tnear[k]), rayFar(ray_tfar[k]); 
      /* pop loop */
      while (true) pop:
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        /* NOTE(review): no decrement of stackPtr is visible between the emptiness test and this dereference */
        NodeRef cur = (NodeRef) *stackPtr;
        /* downtraversal loop */
        while (true)
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          /*! single ray intersection with 4 boxes */
          const Node* node = cur.node();
          /* XOR with 16 switches to the other slot of the same axis (assumes sizeof(ssef) == 16 - TODO confirm) */
          const size_t farX  = nearX ^ 16, farY  = nearY ^ 16, farZ  = nearZ ^ 16;
          /* distances to the near/far box planes of all four children; msub presumably computes a*b-c - TODO confirm */
#if defined (__AVX2__)
          const ssef tNearX = msub(load4f((const char*)node+nearX), rdir.x, org_rdir.x);
          const ssef tNearY = msub(load4f((const char*)node+nearY), rdir.y, org_rdir.y);
          const ssef tNearZ = msub(load4f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const ssef tFarX  = msub(load4f((const char*)node+farX ), rdir.x, org_rdir.x);
          const ssef tFarY  = msub(load4f((const char*)node+farY ), rdir.y, org_rdir.y);
          const ssef tFarZ  = msub(load4f((const char*)node+farZ ), rdir.z, org_rdir.z);
          /* same quantities computed as (bound - org) * rdir */
          const ssef tNearX = (norg.x + load4f((const char*)node+nearX)) * rdir.x;
          const ssef tNearY = (norg.y + load4f((const char*)node+nearY)) * rdir.y;
          const ssef tNearZ = (norg.z + load4f((const char*)node+nearZ)) * rdir.z;
          const ssef tFarX  = (norg.x + load4f((const char*)node+farX )) * rdir.x;
          const ssef tFarY  = (norg.y + load4f((const char*)node+farY )) * rdir.y;
          const ssef tFarZ  = (norg.z + load4f((const char*)node+farZ )) * rdir.z;
          /* mask gets one bit per child whose interval [tNear,tFar] is non-empty */
#if defined(__SSE4_1__)
          /* this variant compares after cast(); the mask of tNear > tFar is inverted with ^0xf */
          const ssef tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,rayNear));
          const ssef tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,rayFar ));
          const sseb vmask = cast(tNear) > cast(tFar);
          size_t mask = movemask(vmask)^0xf;
          /* this variant tests tNear <= tFar directly */
          const ssef tNear = max(tNearX,tNearY,tNearZ,rayNear);
          const ssef tFar  = min(tFarX ,tFarY ,tFarZ ,rayFar);
          const sseb vmask = tNear <= tFar;
          size_t mask = movemask(vmask);
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          /*! one child is hit, continue with that child */
          /* __bscf presumably returns the lowest set bit index and clears it in mask - TODO confirm */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r);
            assert(cur != BVH4::emptyNode);
          /*! two children are hit, push far child, and continue with closer child */
          /* here the entry distances are compared as floats */
          NodeRef c0 = node->child(r); const float d0 = tNear[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); const float d1 = tNear[r];
          assert(c0 != BVH4::emptyNode);
          assert(c1 != BVH4::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd);
            if (d0 < d1) { *stackPtr = c1; stackPtr++; cur = c0; continue; }
            else         { *stackPtr = c0; stackPtr++; cur = c1; continue; }
          /* more than two children hit: push the first two without ordering them */
          assert(stackPtr < stackEnd);
          *stackPtr = c0; stackPtr++;
          assert(stackPtr < stackEnd);
          *stackPtr = c1; stackPtr++;
          /*! three children are hit */
          r = __bscf(mask);
          cur = node->child(r); 
          assert(cur != BVH4::emptyNode);
          if (likely(mask == 0)) continue;
          assert(stackPtr < stackEnd);
          *stackPtr = cur; stackPtr++;
          /*! four children are hit */
          /* with all four mask bits set, the one remaining child is index 3 */
          cur = node->child(3);
          assert(cur != BVH4::emptyNode);
        /*! this is a leaf node */
        size_t num; Primitive* prim = (Primitive*) cur.leaf(num);
        /* on an occluding primitive, mark lane k and report occlusion */
        if (PrimitiveIntersector4::occluded(ray,k,prim,num,bvh->geometry)) {
          ray.geomID[k] = 0;
          return true;
      /* stack exhausted without finding an occluder */
      return false;
    /*! Intersection traversal for the single ray in lane k of a Ray4 packet,
     *  starting at node root, using a stack of (node, distance) items and
     *  descending into the closer hit child first. Lane k of ray_org,
     *  ray_rdir, ray_tnear and ray_tfar is broadcast to all lanes; ray_dir is
     *  only used for the near/far side selection.
     *  NOTE(review): this listing contains no brace-only lines and no
     *  #else/#endif lines, and some statements are not visible (for example
     *  the statement controlled by the "too far" test). Declarations that
     *  appear twice are presumably alternative preprocessor branches.
     *  Confirm against the complete file. */
    __forceinline void BVH4Intersector4Hybrid<PrimitiveIntersector4>::intersect1(const BVH4* bvh, NodeRef root, size_t k, Ray4& ray, 
                                                                                 const sse3f& ray_org, const sse3f& ray_dir, const sse3f& ray_rdir, 
                                                                                 const ssef& ray_tnear, const ssef& ray_tfar)
      /*! stack state */
      StackItem stack[stackSizeSingle];  //!< stack of nodes 
      StackItem* stackPtr = stack+1;        //!< current stack pointer
      StackItem* stackEnd = stack+stackSizeSingle;
      /*! the root is the initial stack entry; stackPtr already points one past it */
      stack[0].ptr = root;
      stack[0].dist = neg_inf;
      /*! offsets to select the side that becomes the lower or upper bound */
      /* byte offsets into the node, in units of sizeof(ssef): slots 0/1 for x, 2/3 for y, 4/5 for z */
      const size_t nearX = ray_dir.x[k] >= 0.0f ? 0*sizeof(ssef) : 1*sizeof(ssef);
      const size_t nearY = ray_dir.y[k] >= 0.0f ? 2*sizeof(ssef) : 3*sizeof(ssef);
      const size_t nearZ = ray_dir.z[k] >= 0.0f ? 4*sizeof(ssef) : 5*sizeof(ssef);
      /*! load the ray into SIMD registers */
      /* lane k of the packet is broadcast into every lane; rayFar is mutable because it is refreshed after each leaf */
      const sse3f org (ray_org .x[k],ray_org .y[k],ray_org .z[k]);
      const sse3f rdir(ray_rdir.x[k],ray_rdir.y[k],ray_rdir.z[k]);
      const sse3f norg = -org, org_rdir(org*rdir);
      ssef rayNear(ray_tnear[k]), rayFar(ray_tfar[k]); 
      /* pop loop */
      while (true) pop:
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        /* NOTE(review): no decrement of stackPtr is visible between the emptiness test and this dereference */
        NodeRef cur = NodeRef(stackPtr->ptr);
        /*! if popped node is too far, pop next one */
        /* NOTE(review): the statement controlled by this test is not visible in this listing */
        if (unlikely(stackPtr->dist > ray.tfar[k]))
        /* downtraversal loop */
        while (true)
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          /*! single ray intersection with 4 boxes */
          const Node* node = cur.node();
          /* XOR with 16 switches to the other slot of the same axis (assumes sizeof(ssef) == 16 - TODO confirm) */
          const size_t farX  = nearX ^ 16, farY  = nearY ^ 16, farZ  = nearZ ^ 16;
          /* distances to the near/far box planes of all four children; msub presumably computes a*b-c - TODO confirm */
#if defined (__AVX2__)
          const ssef tNearX = msub(load4f((const char*)node+nearX), rdir.x, org_rdir.x);
          const ssef tNearY = msub(load4f((const char*)node+nearY), rdir.y, org_rdir.y);
          const ssef tNearZ = msub(load4f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const ssef tFarX  = msub(load4f((const char*)node+farX ), rdir.x, org_rdir.x);
          const ssef tFarY  = msub(load4f((const char*)node+farY ), rdir.y, org_rdir.y);
          const ssef tFarZ  = msub(load4f((const char*)node+farZ ), rdir.z, org_rdir.z);
          /* same quantities computed as (bound - org) * rdir */
          const ssef tNearX = (norg.x + load4f((const char*)node+nearX)) * rdir.x;
          const ssef tNearY = (norg.y + load4f((const char*)node+nearY)) * rdir.y;
          const ssef tNearZ = (norg.z + load4f((const char*)node+nearZ)) * rdir.z;
          const ssef tFarX  = (norg.x + load4f((const char*)node+farX )) * rdir.x;
          const ssef tFarY  = (norg.y + load4f((const char*)node+farY )) * rdir.y;
          const ssef tFarZ  = (norg.z + load4f((const char*)node+farZ )) * rdir.z;

          /* mask gets one bit per child whose interval [tNear,tFar] is non-empty */
#if defined(__SSE4_1__)
          /* this variant compares after cast(); the mask of tNear > tFar is inverted with ^0xf */
          const ssef tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,rayNear));
          const ssef tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,rayFar ));
          const sseb vmask = cast(tNear) > cast(tFar);
          size_t mask = movemask(vmask)^0xf;
          /* this variant tests tNear <= tFar directly */
          const ssef tNear = max(tNearX,tNearY,tNearZ,rayNear);
          const ssef tFar  = min(tFarX ,tFarY ,tFarZ ,rayFar);
          const sseb vmask = tNear <= tFar;
          size_t mask = movemask(vmask);
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          /*! one child is hit, continue with that child */
          /* __bscf presumably returns the lowest set bit index and clears it in mask - TODO confirm */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r);
            assert(cur != BVH4::emptyNode);
          /*! two children are hit, push far child, and continue with closer child */
          /* here the entry distances are kept and compared as floats */
          NodeRef c0 = node->child(r); const float d0 = tNear[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); const float d1 = tNear[r];
          assert(c0 != BVH4::emptyNode);
          assert(c1 != BVH4::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd); 
            if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; continue; }
            else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; continue; }
          /*! Here starts the slow path for 3 or 4 hit children. We push
           *  all nodes onto the stack to sort them there. */
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
          /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
          /* NOTE(review): the sort mentioned above is not visible in this listing */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          NodeRef c = node->child(r); float d = tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          assert(c != BVH4::emptyNode);
          if (likely(mask == 0)) {
            /* continue with the item on top of the stack */
            cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
          /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          c = node->child(r); d = tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          assert(c != BVH4::emptyNode);
          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
        /*! this is a leaf node */
        size_t num; Primitive* prim = (Primitive*) cur.leaf(num);
        /* NOTE(review): the primitive intersection call is not visible here; rayFar is re-read from ray.tfar[k] afterwards */
        rayFar = ray.tfar[k];
// Build the node coordinates and the per-triangle node-index table of a
// system of N triangles, each with a P-, a Q- and an R-node (p is fixed at 2).
//   q, r     system parameters: each Q-node is shared by q consecutively
//            numbered triangles, and r bounds the walk around an R-node.
//   xyz_in   coordinates of the P-, Q- and R-node of triangle 0.
//   nodes    output coordinates; Q-nodes use indices 0 .. N/q-1, P- and
//            R-nodes are appended from index N/q in order of discovery.
//   indices  output, six node indices for each of the N triangles.
// The code uses the `index [array]` spelling of subscripting, e.g. n [Q]
// means Q [n] and n [Q] [Qi] means Qi [Q [n]].
// NOTE(review): several opening braces below have no visible closing brace
// in this listing; confirm the scopes against the complete file.
void make_system (unsigned q, unsigned r, const float (& xyz_in) [3] [4],
  float (* nodes) [4], std::uint8_t (* indices) [6])
  std::uint8_t * P, * Q, * R; // Permutations taking triangles around nodes.
  std::uint8_t * Qi;          // Inverse of the permutation Q.
  std::uint8_t * Px, * Rx;    // Map a triangle to its P- or R-node.

  // One 360-byte buffer is carved into six tables of 60 entries each, so
  // this presumably relies on N <= 60 - TODO confirm.
  std::uint8_t memory [360];
  std::uint8_t * memp = memory;
  P = memp; memp += 60;
  Q = memp; memp += 60;
  R = memp; memp += 60;
  Qi = memp; memp += 60;
  Px = memp; memp += 60;
  Rx = memp; // memp += 60;

  const std::uint8_t undef = 0xff; // Marks a table entry that is not known yet.
  // N is the number of triangles (the bound of every per-triangle loop below).
  const unsigned p = 2, N = 2 * p * q * r / (q * r + r * p + p * q - p * q * r);
  for (unsigned n = 0; n != sizeof memory; ++ n) memory [n] = undef;
  // Q cycles through each group of q consecutive triangles; Qi is its inverse.
  for (unsigned n = 0; n != N; ++ n) {
    n [Q] = n - n % q + (n + 1) % q;
    n [Q] [Qi] = n;

  // Node indices below N / q are reserved for the Q-nodes.
  unsigned next_node = N / q;

  // We are given the coordinates of the P-, Q- and R-nodes in triangle 0.
  store4f (nodes [Px [0] = next_node ++], load4f (xyz_in [0]));
  store4f (nodes [0], load4f (xyz_in [1]));
  store4f (nodes [Rx [0] = next_node ++], load4f (xyz_in [2]));

  float two_pi = 0x1.921fb6P+002f; // Hexadecimal float literal for 2*pi.
  float A = two_pi / ui2f (p);     // Rotation angle used about a P-node.
  float B = two_pi / ui2f (q);     // Rotation angle used about a Q-node.

  // n0 scans for a triangle whose P entry is still undefined; m0 is the
  // first triangle of the Q-node group currently being processed.
  unsigned n0 = 0, m0 = 0;

  // The loop condition is an immediately-invoked lambda, so it runs before
  // every iteration, including the first. It advances m0 by q and stops the
  // loop once m0 reaches N.
  while ([& m0, Px, Rx, q, r, nodes, & next_node, N, B] () -> bool {
    // Calculate coordinates of any unknown P- and R-nodes around Q-node m0.
    // Each unknown node is produced by applying Y_rotate to the node of the
    // previous triangle in the group.
    ALIGNED16 rotor_t Y_rotate (nodes [m0 / q], B);
    for (unsigned n = m0 + 1; n != m0 + q; ++ n) {
      if (Px [n] == undef) {
        Px [n] = next_node;
        Y_rotate (nodes [Px [n - 1]], nodes [next_node]);
        ++ next_node;
      if (Rx [n] == undef) {
        Rx [n] = next_node;
        Y_rotate (nodes [Rx [n - 1]], nodes [next_node]);
        ++ next_node;
    m0 += q;
    return m0 != N;
  } ()) {
    // Find the first triangle whose P entry is still undefined.
    while (n0 [P] != undef) ++ n0;

    // Attach triangle m0 to triangle n0's dangling P-node.
    // At this point we learn the coordinates of the next Q-node.
    Px [m0] = Px [n0];
    ALIGNED16 rotor_t X_rotate (nodes [Px [n0]], A);
    X_rotate (nodes [n0 / q], nodes [m0 / q]);

    // Work out the consequences of attaching the new triangle.
    // Invariant: n [P] = m if and only if m [Q] [R] = n, for all m, n < N.
    unsigned n = n0, m = m0;
    do {
      n [P] = m;
      m = m [Q];
      m [R] = n;
      // undef is 0xff, so the AND keeps whichever R-node index is already
      // known (if both are known they are presumably equal).
      Rx [m] = Rx [n] = Rx [m] & Rx [n];
      // Count the triangles already linked around this R-node, walking from
      // both ends of the chain.
      unsigned d = 1;
      while (d != r && n [R] != undef) {
        n = n [R];
        ++ d;
      while (d != r && m [P] != undef) {
        m = m [P] [Q];
        ++ d;
      // One link short of r: join the two ends of the chain.
      if (d == r - 1) {
        n [R] = m;
        n = n [Qi];
        m [P] = n;
        // Same 0xff trick as for Rx above, applied to the shared P-node.
        Px [m] = Px [n] = Px [m] & Px [n];
      if (n [P] != undef) {
        n = m0;
        m = n0;
    } while (n [P] == undef);

  // Emit six node indices per triangle, taken from triangle i = n and its
  // neighbours j = i [R] and k = j [P].
  for (unsigned n = 0; n != N; ++ n) {
    unsigned i = n;
    unsigned j = i [R];
    unsigned k = j [P];
    indices [n] [0] = Px [j];
    indices [n] [1] = j / q;
    indices [n] [2] = Rx [i];
    indices [n] [3] = Px [i];
    indices [n] [4] = k / q;
    indices [n] [5] = Rx [k];
    /*! Single-ray BVH4 traversal in which every child box is enlarged by a
     *  radius computed from ray.org.w and ray.dir.w before the slab test.
     *  Uses a stack of (node, distance) items and descends into the closer
     *  hit child first.
     *  NOTE(review): this listing contains no brace-only lines and some
     *  statements are not visible (for example the statement controlled by
     *  the "too far" test); the listing also ends at the last visible line,
     *  so the function may continue. Confirm against the complete file. */
    void BVH4Intersector1Bezier<PrimitiveIntersector>::intersect(const BVH4* bvh, Ray& ray)
      /*! perform per ray precalculations required by the primitive intersector */
      Precalculations pre(ray);

      /*! stack state */
      StackItemInt32<NodeRef> stack[stackSize];  //!< stack of nodes 
      StackItemInt32<NodeRef>* stackPtr = stack+1;        //!< current stack pointer
      StackItemInt32<NodeRef>* stackEnd = stack+stackSize;
      /*! the root is the initial stack entry; stackPtr already points one past it */
      stack[0].ptr = bvh->root;
      stack[0].dist = neg_inf;
      /*! offsets to select the side that becomes the lower or upper bound */
      /* byte offsets into the node, in units of sizeof(ssef): slots 0/1 for x, 2/3 for y, 4/5 for z */
      const size_t nearX = ray.dir.x >= 0.0f ? 0*sizeof(ssef) : 1*sizeof(ssef);
      const size_t nearY = ray.dir.y >= 0.0f ? 2*sizeof(ssef) : 3*sizeof(ssef);
      const size_t nearZ = ray.dir.z >= 0.0f ? 4*sizeof(ssef) : 5*sizeof(ssef);
      /*! load the ray into SIMD registers */
      /* norg = -origin, rdir = rcp_safe(direction), org_rdir = origin*rdir, each broadcast per component */
      const sse3f norg(-ray.org.x,-ray.org.y,-ray.org.z);
      const Vec3fa ray_rdir = rcp_safe(ray.dir);
      const sse3f rdir(ray_rdir.x,ray_rdir.y,ray_rdir.z);
      const Vec3fa ray_org_rdir = ray.org*ray_rdir;
      const sse3f org_rdir(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
      const ssef  ray_near(ray.tnear);
      ssef ray_far(ray.tfar);

      /* pop loop */
      while (true) pop:
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        /* NOTE(review): no decrement of stackPtr is visible between the emptiness test and this dereference */
        NodeRef cur = NodeRef(stackPtr->ptr);
        /*! if popped node is too far, pop next one */
        /* the stored distance holds float bits and is read back as a float for this comparison */
        /* NOTE(review): the statement controlled by this test is not visible in this listing */
        if (unlikely(*(float*)&stackPtr->dist > ray.tfar))
        /* downtraversal loop */
        while (true)
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          /*! single ray intersection with 4 boxes */
          const Node* node = cur.node();
          /* XOR with 16 switches to the other slot of the same axis (assumes sizeof(ssef) == 16 - TODO confirm) */
          const size_t farX  = nearX ^ 16, farY  = nearY ^ 16, farZ  = nearZ ^ 16;

          /* absolute distance to the far planes of the unexpanded boxes, minimum over the three axes */
          const ssef tFarX0  = abs((norg.x + load4f((const char*)node+farX )) * rdir.x);
          const ssef tFarY0  = abs((norg.y + load4f((const char*)node+farY )) * rdir.y);
          const ssef tFarZ0  = abs((norg.z + load4f((const char*)node+farZ )) * rdir.z);
          const ssef tFar0  = min(tFarX0 ,tFarY0 ,tFarZ0);
          /* per-child radius at that distance; ray.org.w / ray.dir.w presumably hold a base radius and its growth per unit distance - TODO confirm */
          const ssef radius = abs(ssef(ray.org.w) + tFar0 * ssef(ray.dir.w));
          //const ssef radius = zero;

          /* slab distances for boxes grown by radius on both sides */
          const ssef tLowerX = (norg.x + node->lower_x - radius) * rdir.x;
          const ssef tLowerY = (norg.y + node->lower_y - radius) * rdir.y;
          const ssef tLowerZ = (norg.z + node->lower_z - radius) * rdir.z;

          const ssef tUpperX = (norg.x + node->upper_x + radius) * rdir.x;
          const ssef tUpperY = (norg.y + node->upper_y + radius) * rdir.y;
          const ssef tUpperZ = (norg.z + node->upper_z + radius) * rdir.z;

          /* near/far are chosen by min/max here rather than through the nearX/farX offsets */
          const ssef tNearX = min(tLowerX,tUpperX);
          const ssef tNearY = min(tLowerY,tUpperY);
          const ssef tNearZ = min(tLowerZ,tUpperZ);

          const ssef tFarX = max(tLowerX,tUpperX);
          const ssef tFarY = max(tLowerY,tUpperY);
          const ssef tFarZ = max(tLowerZ,tUpperZ);

          /* mask gets one bit per child whose interval [tNear,tFar] is non-empty */
          const ssef tNear = max(tNearX,tNearY,tNearZ,ray_near);
          const ssef tFar  = min(tFarX ,tFarY ,tFarZ ,ray_far);
          const sseb vmask = tNear <= tFar;
          size_t mask = movemask(vmask);
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          /*! one child is hit, continue with that child */
          /* __bscf presumably returns the lowest set bit index and clears it in mask - TODO confirm */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r);
            //assert(cur != BVH4::emptyNode); // FIXME: enable these assertions again, currently traversing empty children
          /*! two children are hit, push far child, and continue with closer child */
          /* d0/d1 are the raw bits of the tNear lanes, compared below as unsigned integers */
          NodeRef c0 = node->child(r); const unsigned int d0 = ((unsigned int*)&tNear)[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); const unsigned int d1 = ((unsigned int*)&tNear)[r];
          //assert(c0 != BVH4::emptyNode);
          //assert(c1 != BVH4::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd); 
            if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; continue; }
            else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; continue; }
          /*! Here starts the slow path for 3 or 4 hit children. We push
           *  all nodes onto the stack to sort them there. */
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
          /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
          /* NOTE(review): the sort mentioned above is not visible in this listing */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          NodeRef c = node->child(r); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          //assert(c != BVH4::emptyNode);
          if (likely(mask == 0)) {
            /* continue with the item on top of the stack */
            cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
          /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          c = node->child(r); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          //assert(c != BVH4::emptyNode);
          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
        /*! this is a leaf node */
        size_t num; Primitive* prim = (Primitive*) cur.leaf(num);
        /* NOTE(review): the primitive intersection call is not visible here; ray_far is re-read from ray.tfar afterwards */
        ray_far = ray.tfar;