/* Hybrid traversal of a BVH4mb for a packet of 16 rays: nodes are tested
 * against the whole packet, and traversal switches to a per-ray loop when
 * only a few lanes are still interested in the current node.
 *
 * valid_i : per-lane activity; a lane is active when its entry is non-zero.
 * bvh     : supplies the root reference, the node array (nodePtr) and the
 *           triangle array (triPtr).
 * ray16   : org, dir, tnear, tfar and time are read here. tfar is re-read
 *           after the leaf steps, so the leaf intersector is presumably
 *           expected to shorten it on a hit - TODO confirm.
 *
 * NOTE(review): in this copy of the code the braces are missing, some call
 * argument lists are cut off (e.g. LeafIntersector::intersect) and no
 * stack-pointer decrements are visible. The comments below describe only the
 * statements that are present; check the complete file before relying on the
 * exact control flow.
 */
void BVH4mbIntersector16Hybrid<LeafIntersector>::intersect(int16* valid_i, BVH4mb* bvh, Ray16& ray16)
      /* near and node stack */
      __aligned(64) float16   stack_dist[3*BVH4i::maxDepth+1];
      __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1];
      /* separate node stack used only by the single-ray loop below */
      __aligned(64) NodeRef stack_node_single[3*BVH4i::maxDepth+1]; 

      /* load ray */
      const bool16 valid0     = *(int16*)valid_i != int16(0);
      const Vec3f16 rdir16     = rcp_safe(ray16.dir);
      const Vec3f16 org_rdir16 = ray16.org * rdir16;
      /* inactive lanes get tnear = +inf and tfar = -inf, so the
         "ray_tfar > curDist" and near <= far tests below never pass for them */
      float16 ray_tnear        = select(valid0,ray16.tnear,pos_inf);
      float16 ray_tfar         = select(valid0,ray16.tfar ,neg_inf);
      const float16 inf        = float16(pos_inf);
      /* allocate stack and push root node */
      /* slot 0 holds the invalidNode sentinel that ends the main loop,
         slot 1 holds the root with the rays' tnear as entry distance */
      stack_node[0] = BVH4i::invalidNode;
      stack_dist[0] = inf;
      stack_node[1] = bvh->root;
      stack_dist[1] = ray_tnear; 
      /* both stack pointers start one past the root entry */
      NodeRef* __restrict__ sptr_node = stack_node + 2;
      float16*   __restrict__ sptr_dist = stack_dist + 2;
      const Node      * __restrict__ nodes = (Node     *)bvh->nodePtr();
      const BVH4mb::Triangle01 * __restrict__ accel = (BVH4mb::Triangle01 *)bvh->triPtr();

      /* main loop; the "pop" label is also the target of the goto that is
         taken further down when packet utilization gets too low */
      while (1) pop:
        /* pop next node from stack */
        NodeRef curNode = *(sptr_node-1);
        float16 curDist   = *(sptr_dist-1);
	/* lanes whose current tfar still lies beyond the node's entry distance */
	const bool16 m_stackDist = ray_tfar > curDist;

	/* stack empty ? */
        if (unlikely(curNode == BVH4i::invalidNode))  break;
        /* cull node if behind closest hit point */
        if (unlikely(none(m_stackDist))) continue;

	/* switch to single ray mode */
	/* taken when the number of interested lanes is at or below the threshold */
        if (unlikely(countbits(m_stackDist) <= BVH4i::hybridSIMDUtilSwitchThreshold)) 
	    /* views the packet distance stack from sptr_dist on as plain floats;
	       its use is not visible in this excerpt */
	    float   *__restrict__ stack_dist_single = (float*)sptr_dist;

	    /* traverse single ray */
	    /* presumably bitscan64(prev,mask) yields the next set bit after prev,
	       so this visits every lane set in m_stackDist - TODO confirm */
	    long rayIndex = -1;
	    while((rayIndex = bitscan64(rayIndex,m_stackDist)) != BITSCAN_NO_BIT_SET_64) 
		/* per-ray stack: sentinel in slot 0, current node in slot 1 */
		stack_node_single[0] = BVH4i::invalidNode;
		stack_node_single[1] = curNode;
		size_t sindex = 2;

		/* pull the data of lane rayIndex out of the packet; presumably
		   loadAOS4to16f replicates x,y,z into every 4-float group - TODO confirm */
		const float16 org_xyz      = loadAOS4to16f(rayIndex,ray16.org.x,ray16.org.y,ray16.org.z);
		const float16 dir_xyz      = loadAOS4to16f(rayIndex,ray16.dir.x,ray16.dir.y,ray16.dir.z);
		const float16 rdir_xyz     = loadAOS4to16f(rayIndex,rdir16.x,rdir16.y,rdir16.z);
		const float16 org_rdir_xyz = org_xyz * rdir_xyz;
		const float16 min_dist_xyz = broadcast1to16f(&ray16.tnear[rayIndex]);
		float16       max_dist_xyz = broadcast1to16f(&ray16.tfar[rayIndex]);
		const float16 time         = broadcast1to16f(&ray16.time[rayIndex]);

		const unsigned int leaf_mask = BVH4I_LEAF_MASK;

		while (1) 
		    NodeRef curNode = stack_node_single[sindex-1];

		    /* return if stack is empty */
		    if (unlikely(curNode == BVH4i::invalidNode)) break;

		    /* intersect one ray against four triangles */
		    const bool hit = LeafIntersector::intersect(curNode,
		    if (hit)

	    /* reload the packet tfar from the ray structure after the per-ray work */
	    ray_tfar = select(valid0,ray16.tfar ,neg_inf);


	const unsigned int leaf_mask = BVH4I_LEAF_MASK;

	/* blend weights for the two sets of node bounds: (1-time) and time */
	const float16 time     = ray16.time;
	const float16 one_time = (float16::one() - time);

        /* packet descent: runs until curNode is a leaf (or utilization drops) */
        while (1)
          /* test if this is a leaf node */
          if (unlikely(curNode.isLeaf(leaf_mask))) break;
          STAT3(normal.trav_nodes,1,popcnt(ray_tfar > curDist),16);
          const Node* __restrict__ const node = curNode.node(nodes);
          /* the same memory viewed as a BVH4mb node to reach lower_t1/upper_t1 */
          const BVH4mb::Node* __restrict__ const nodeMB = (BVH4mb::Node*)node;

          /* pop of next node */
          /* NOTE(review): reads the slot at sptr_*, and no decrement is visible
             in this excerpt - confirm against the full file */
          curNode = *sptr_node; 
          curDist = *sptr_dist;
	  /* prefetch four consecutive 64-byte blocks starting at the node */
	  prefetch<PFHINT_L1>((char*)node + 0*64); 
	  prefetch<PFHINT_L1>((char*)node + 1*64); 
	  prefetch<PFHINT_L1>((char*)node + 2*64); 
	  prefetch<PFHINT_L1>((char*)node + 3*64); 

#pragma unroll(4)
          for (unsigned int i=0; i<4; i++)
	    const NodeRef child = node->lower[i].child;

	    /* bounds of child i, blended per lane between the two stored boxes */
	    const float16 lower_x =  one_time * nodeMB->lower[i].x + time * nodeMB->lower_t1[i].x;
	    const float16 lower_y =  one_time * nodeMB->lower[i].y + time * nodeMB->lower_t1[i].y;
	    const float16 lower_z =  one_time * nodeMB->lower[i].z + time * nodeMB->lower_t1[i].z;
	    const float16 upper_x =  one_time * nodeMB->upper[i].x + time * nodeMB->upper_t1[i].x;
	    const float16 upper_y =  one_time * nodeMB->upper[i].y + time * nodeMB->upper_t1[i].y;
	    const float16 upper_z =  one_time * nodeMB->upper[i].z + time * nodeMB->upper_t1[i].z;

	    /* children 0 and 1 are always processed; from the third child on an
	       invalidNode reference ends the loop */
	    if (unlikely(i >=2 && child == BVH4i::invalidNode)) break;

            /* slab distances per axis; presumably msub(a,b,c) is a*b-c, i.e.
               (bound - org) * rdir with org*rdir precomputed - TODO confirm */
            const float16 lclipMinX = msub(lower_x,rdir16.x,org_rdir16.x);
            const float16 lclipMinY = msub(lower_y,rdir16.y,org_rdir16.y);
            const float16 lclipMinZ = msub(lower_z,rdir16.z,org_rdir16.z);
            const float16 lclipMaxX = msub(upper_x,rdir16.x,org_rdir16.x);
            const float16 lclipMaxY = msub(upper_y,rdir16.y,org_rdir16.y);
            const float16 lclipMaxZ = msub(upper_z,rdir16.z,org_rdir16.z);
            /* entry distance = largest per-axis minimum, exit = smallest per-axis maximum */
            const float16 lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
            const float16 lfarP  = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
            /* a lane hits when the box interval overlaps its [tnear,tfar] */
            const bool16 lhit   = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar);   
	    /* missing lanes get +inf so they never look closer than anything */
	    const float16 childDist = select(lhit,lnearP,inf);
            const bool16 m_child_dist = childDist < curDist;
            /* if we hit the child we choose to continue with that child if it 
               is closer than the current next child, or we push it onto the stack */
            if (likely(any(lhit)))
              /* push cur node onto stack and continue with hit child */
              if (any(m_child_dist))
                *(sptr_node-1) = curNode;
                *(sptr_dist-1) = curDist; 
                curDist = childDist;
                curNode = child;
              /* push hit child onto stack*/
		  *(sptr_node-1) = child;
		  *(sptr_dist-1) = childDist; 
              /* NOTE(review): the bound checked here is maxDepth while the stack
                 arrays hold 3*maxDepth+1 entries - confirm which is intended */
              assert(sptr_node - stack_node < BVH4i::maxDepth);
	  /* utilization check: with too few interested lanes, put the current
	     node back on the stack and restart at "pop", where the single-ray
	     switch is evaluated */
	  const bool16 curUtil = ray_tfar > curDist;
	  if (unlikely(countbits(curUtil) <= BVH4i::hybridSIMDUtilSwitchThreshold))
	      *sptr_node++ = curNode;
	      *sptr_dist++ = curDist; 
	      goto pop;

        /* return if stack is empty */
        if (unlikely(curNode == BVH4i::invalidNode)) break;

        /* intersect leaf */
        /* only lanes whose tfar lies beyond the leaf's entry distance take part */
        const bool16 m_valid_leaf = ray_tfar > curDist;


        /* refresh tfar from the ray structure for the lanes that visited the leaf */
        ray_tfar = select(m_valid_leaf,ray16.tfar,ray_tfar);
    /* Hybrid traversal of a BVH4i for a packet of 16 rays: nodes are tested
     * against the whole packet, and traversal switches to a per-ray loop when
     * only a few lanes are still interested in the current node.
     *
     * valid_i : per-lane activity; a lane is active when its entry is non-zero.
     * bvh     : supplies the root reference, the node array (nodePtr) and the
     *           triangle array (triPtr).
     * ray16   : org, dir, tnear and tfar are read here. tfar is re-read after
     *           the leaf steps, so the leaf intersector is presumably expected
     *           to shorten it on a hit - TODO confirm.
     *
     * NOTE(review): in this copy of the code the braces are missing, some call
     * argument lists are cut off (e.g. LeafIntersector::intersect) and no
     * stack-pointer decrements are visible. The comments below describe only
     * the statements that are present; check the complete file before relying
     * on the exact control flow.
     */
    void BVH4iIntersector16Hybrid<LeafIntersector,ENABLE_COMPRESSED_BVH4I_NODES>::intersect(mic_i* valid_i, BVH4i* bvh, Ray16& ray16)
      /* near and node stack */
      __aligned(64) mic_f   stack_dist[3*BVH4i::maxDepth+1];
      __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1];
      /* separate node stack used only by the single-ray loop below */
      __aligned(64) NodeRef stack_node_single[3*BVH4i::maxDepth+1]; 

      /* load ray */
      const mic_m valid0     = *(mic_i*)valid_i != mic_i(0);
      const mic3f rdir16     = rcp_safe(ray16.dir);
      const mic3f org_rdir16 = ray16.org * rdir16;
      /* inactive lanes get tnear = +inf and tfar = -inf, so the
         "ray_tfar > curDist" and near <= far tests below never pass for them */
      mic_f ray_tnear        = select(valid0,ray16.tnear,pos_inf);
      mic_f ray_tfar         = select(valid0,ray16.tfar ,neg_inf);
      const mic_f inf        = mic_f(pos_inf);
      /* allocate stack and push root node */
      /* slot 0 holds the invalidNode sentinel that ends the main loop,
         slot 1 holds the root with the rays' tnear as entry distance */
      stack_node[0] = BVH4i::invalidNode;
      stack_dist[0] = inf;
      stack_node[1] = bvh->root;
      stack_dist[1] = ray_tnear; 
      /* both stack pointers start one past the root entry */
      NodeRef* __restrict__ sptr_node = stack_node + 2;
      mic_f*   __restrict__ sptr_dist = stack_dist + 2;
      const Node      * __restrict__ nodes = (Node     *)bvh->nodePtr();
      const Triangle1 * __restrict__ accel = (Triangle1*)bvh->triPtr();

      /* main loop; the "pop" label is also the target of the goto that is
         taken further down when packet utilization gets too low */
      while (1) pop:
        /* pop next node from stack */
        NodeRef curNode = *(sptr_node-1);
        mic_f curDist   = *(sptr_dist-1);
	/* lanes whose current tfar still lies beyond the node's entry distance */
	const mic_m m_stackDist = ray_tfar > curDist;

	/* stack empty ? */
        if (unlikely(curNode == BVH4i::invalidNode))  break;
        /* cull node if behind closest hit point */
        if (unlikely(none(m_stackDist))) continue;

	/* switch to single ray mode */
	/* taken when the number of interested lanes is at or below the threshold */
        if (unlikely(countbits(m_stackDist) <= BVH4i::hybridSIMDUtilSwitchThreshold)) 
	    /* views the packet distance stack from sptr_dist on as plain floats;
	       its use is not visible in this excerpt */
	    float   *__restrict__ stack_dist_single = (float*)sptr_dist;

	    /* traverse single ray */
	    /* presumably bitscan64(prev,mask) yields the next set bit after prev,
	       so this visits every lane set in m_stackDist - TODO confirm */
	    long rayIndex = -1;
	    while((rayIndex = bitscan64(rayIndex,m_stackDist)) != BITSCAN_NO_BIT_SET_64) 
		/* per-ray stack: sentinel in slot 0, current node in slot 1 */
		stack_node_single[0] = BVH4i::invalidNode;
		stack_node_single[1] = curNode;
		size_t sindex = 2;

		/* pull the data of lane rayIndex out of the packet; presumably
		   loadAOS4to16f replicates x,y,z into every 4-float group - TODO confirm */
		const mic_f org_xyz      = loadAOS4to16f(rayIndex,ray16.org.x,ray16.org.y,ray16.org.z);
		const mic_f dir_xyz      = loadAOS4to16f(rayIndex,ray16.dir.x,ray16.dir.y,ray16.dir.z);
		const mic_f rdir_xyz     = loadAOS4to16f(rayIndex,rdir16.x,rdir16.y,rdir16.z);
		const mic_f org_rdir_xyz = org_xyz * rdir_xyz;
		const mic_f min_dist_xyz = broadcast1to16f(&ray16.tnear[rayIndex]);
		mic_f       max_dist_xyz = broadcast1to16f(&ray16.tfar[rayIndex]);

		const unsigned int leaf_mask = BVH4I_LEAF_MASK;

		while (1) 
		    NodeRef curNode = stack_node_single[sindex-1];

		    /* return if stack is empty */
		    if (unlikely(curNode == BVH4i::invalidNode)) break;

		    /* intersect one ray against four triangles */

		    const bool hit = LeafIntersector::intersect(curNode,

		    if (hit)
	    /* reload the packet tfar from the ray structure after the per-ray work */
	    ray_tfar = select(valid0,ray16.tfar ,neg_inf);


	const unsigned int leaf_mask = BVH4I_LEAF_MASK;

	/* local copies of origin and direction; not used in the visible lines */
	const mic3f org = ray16.org;
	const mic3f dir = ray16.dir;

        /* packet descent: runs until curNode is a leaf (or utilization drops) */
        while (1)
          /* test if this is a leaf node */
          if (unlikely(curNode.isLeaf(leaf_mask))) break;
          STAT3(normal.trav_nodes,1,popcnt(ray_tfar > curDist),16);
          const Node* __restrict__ const node = curNode.node(nodes);

	  /* prefetch the first two mic_f-sized blocks of the node */
	  prefetch<PFHINT_L1>((mic_f*)node + 0); 
	  prefetch<PFHINT_L1>((mic_f*)node + 1); 
          /* pop of next node */
          /* NOTE(review): reads the slot at sptr_*, and no decrement is visible
             in this excerpt - confirm against the full file */
          curNode = *sptr_node; 
          curDist = *sptr_dist;

#pragma unroll(4)
          for (unsigned int i=0; i<4; i++)
	    BVH4i::NodeRef child;
	    mic_f lclipMinX,lclipMinY,lclipMinZ;
	    mic_f lclipMaxX,lclipMaxY,lclipMaxZ;

		/* Variant 1: uncompressed node, bounds read directly from
		   lower[i]/upper[i]. NOTE(review): this variant and the quantized
		   one below appear back to back here; presumably the template
		   parameter ENABLE_COMPRESSED_BVH4I_NODES selects one of them in
		   the full file - TODO confirm. Presumably msub(a,b,c) is a*b-c. */
		child = node->lower[i].child;

		lclipMinX = msub(node->lower[i].x,rdir16.x,org_rdir16.x);
		lclipMinY = msub(node->lower[i].y,rdir16.y,org_rdir16.y);
		lclipMinZ = msub(node->lower[i].z,rdir16.z,org_rdir16.z);
		lclipMaxX = msub(node->upper[i].x,rdir16.x,org_rdir16.x);
		lclipMaxY = msub(node->upper[i].y,rdir16.y,org_rdir16.y);
		lclipMaxZ = msub(node->upper[i].z,rdir16.z,org_rdir16.z);
		/* Variant 2: the same memory viewed as a QuantizedNode; bounds are
		   decompressed first and child i's x,y,z are taken from elements
		   4*i+0, 4*i+1 and 4*i+2 */
		BVH4i::QuantizedNode* __restrict__ const compressed_node = (BVH4i::QuantizedNode*)node;
		child = compressed_node->child(i);

		const mic_f startXYZ = compressed_node->decompress_startXYZ();
		const mic_f diffXYZ  = compressed_node->decompress_diffXYZ();
		const mic_f clower   = compressed_node->decompress_lowerXYZ(startXYZ,diffXYZ);
		const mic_f cupper   = compressed_node->decompress_upperXYZ(startXYZ,diffXYZ);

		lclipMinX = msub(mic_f(clower[4*i+0]),rdir16.x,org_rdir16.x);
		lclipMinY = msub(mic_f(clower[4*i+1]),rdir16.y,org_rdir16.y);
		lclipMinZ = msub(mic_f(clower[4*i+2]),rdir16.z,org_rdir16.z);
		lclipMaxX = msub(mic_f(cupper[4*i+0]),rdir16.x,org_rdir16.x);
		lclipMaxY = msub(mic_f(cupper[4*i+1]),rdir16.y,org_rdir16.y);
		lclipMaxZ = msub(mic_f(cupper[4*i+2]),rdir16.z,org_rdir16.z);		
	    /* children 0 and 1 are always processed; from the third child on an
	       invalidNode reference ends the loop */
	    if (unlikely(i >=2 && child == BVH4i::invalidNode)) break;

            /* entry distance = largest per-axis minimum, exit = smallest per-axis maximum */
            const mic_f lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
            const mic_f lfarP  = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
            /* a lane hits when the box interval overlaps its [tnear,tfar] */
            const mic_m lhit   = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar);   
	    /* missing lanes get +inf so they never look closer than anything */
	    const mic_f childDist = select(lhit,lnearP,inf);
            const mic_m m_child_dist = childDist < curDist;

            /* if we hit the child we choose to continue with that child if it 
               is closer than the current next child, or we push it onto the stack */
            if (likely(any(lhit)))

              /* push cur node onto stack and continue with hit child */
              if (any(m_child_dist))
                *(sptr_node-1) = curNode;
                *(sptr_dist-1) = curDist; 
                curDist = childDist;
                curNode = child;
              /* push hit child onto stack*/
		  *(sptr_node-1) = child;
		  *(sptr_dist-1) = childDist; 

		  /* warm L2 with the first 128 bytes of the child that was pushed */
		  const char* __restrict__ const pnode = (char*)child.node(nodes);             
		  prefetch<PFHINT_L2>(pnode + 0);
		  prefetch<PFHINT_L2>(pnode + 64);
              /* NOTE(review): the bound checked here is maxDepth while the stack
                 arrays hold 3*maxDepth+1 entries - confirm which is intended */
              assert(sptr_node - stack_node < BVH4i::maxDepth);
	  /* utilization check: with too few interested lanes, put the current
	     node back on the stack and restart at "pop", where the single-ray
	     switch is evaluated */
	  const mic_m curUtil = ray_tfar > curDist;
	  if (unlikely(countbits(curUtil) <= BVH4i::hybridSIMDUtilSwitchThreshold))
	      *sptr_node++ = curNode;
	      *sptr_dist++ = curDist; 
	      goto pop;
        /* return if stack is empty */
        if (unlikely(curNode == BVH4i::invalidNode)) break;

        /* intersect leaf */
        /* only lanes whose tfar lies beyond the leaf's entry distance take part */
        const mic_m m_valid_leaf = ray_tfar > curDist;


        /* refresh tfar from the ray structure for the lanes that visited the leaf */
        ray_tfar = select(m_valid_leaf,ray16.tfar,ray_tfar);
/* Occlusion query for one ray against a BVH4i.
 *
 * The four child boxes of a node are tested together in one 16-wide vector
 * (see the 0x7777 / 0x8888 lane masks below), and a leaf is treated as four
 * triangles tested together.
 *
 * bvh : supplies the root reference, node array, triangle array and, for the
 *       occlusion-filter path, the scene pointer (bvh->geometry).
 * ray : org, dir, tnear, tfar (and mask under __USE_RAY_MASK__) are read;
 *       ray.geomID is set to 0 when a triangle candidate survives all tests.
 *       No other write to the ray is visible in this excerpt.
 *
 * NOTE(review): in this copy of the code the braces and the #else/#endif
 * lines are missing, the gather_4f_zlc argument lists are cut off and no
 * stack-index decrements are visible. The comments below describe only the
 * statements that are present; check the complete file before relying on the
 * exact control flow.
 */
void BVH4iIntersector1::occluded(BVH4i* bvh, Ray& ray)
    /* near and node stack */
    __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1];

    /* setup */
    const mic3f rdir16      = rcp_safe(mic3f(ray.dir.x,ray.dir.y,ray.dir.z));
    const mic_f inf         = mic_f(pos_inf);
    const mic_f zero        = mic_f::zero();

    const Node      * __restrict__ nodes = (Node     *)bvh->nodePtr();
    const Triangle1 * __restrict__ accel = (Triangle1*)bvh->triPtr();

    /* slot 0 holds the invalidNode sentinel that ends the outer loop,
       slot 1 the root; sindex is the index one past the top entry */
    stack_node[0] = BVH4i::invalidNode;
    stack_node[1] = bvh->root;
    size_t sindex = 2;

    /* ray data spread over the vector; presumably loadAOS4to16f replicates
       x,y,z into every 4-float group - TODO confirm */
    const mic_f org_xyz      = loadAOS4to16f(ray.org.x,ray.org.y,ray.org.z);
    const mic_f dir_xyz      = loadAOS4to16f(ray.dir.x,ray.dir.y,ray.dir.z);
    const mic_f rdir_xyz     = loadAOS4to16f(rdir16.x[0],rdir16.y[0],rdir16.z[0]);
    const mic_f org_rdir_xyz = org_xyz * rdir_xyz;
    const mic_f min_dist_xyz = broadcast1to16f(&ray.tnear);
    const mic_f max_dist_xyz = broadcast1to16f(&ray.tfar);

    const unsigned int leaf_mask = BVH4I_LEAF_MASK;

    /* outer loop: one iteration per node taken from the stack */
    while (1)
        NodeRef curNode = stack_node[sindex-1];

        /* inner loop: descend until curNode is a leaf */
        while (1)
            /* test if this is a leaf node */
            if (unlikely(curNode.isLeaf(leaf_mask))) break;

            const Node* __restrict__ const node = curNode.node(nodes);
            const float* __restrict const plower = (float*)node->lower;
            const float* __restrict const pupper = (float*)node->upper;

            prefetch<PFHINT_L1>((char*)node + 0);
            prefetch<PFHINT_L1>((char*)node + 64);

            /* intersect single ray with 4 bounding boxes */
            const mic_f tLowerXYZ = load16f(plower) * rdir_xyz - org_rdir_xyz;
            const mic_f tUpperXYZ = load16f(pupper) * rdir_xyz - org_rdir_xyz;
            /* mask 0x7777 covers lanes 0..2 (x,y,z) of each 4-lane group;
               presumably the remaining lane keeps min_dist/max_dist - TODO confirm */
            const mic_f tLower = mask_min(0x7777,min_dist_xyz,tLowerXYZ,tUpperXYZ);
            const mic_f tUpper = mask_max(0x7777,max_dist_xyz,tLowerXYZ,tUpperXYZ);

            /* early read of the next stack entry, used if no child is hit */
            curNode = stack_node[sindex];

            const Node* __restrict__ const next = curNode.node(nodes);
            prefetch<PFHINT_L2>((char*)next + 0);
            prefetch<PFHINT_L2>((char*)next + 64);

            /* reduce within each 4-lane group, then compare in lane 3 of each
               group (mask 0x8888), giving at most one hit bit per child */
            const mic_f tNear = vreduce_max4(tLower);
            const mic_f tFar  = vreduce_min4(tUpper);
            const mic_m hitm = le(0x8888,tNear,tFar);
            const mic_f tNear_pos = select(hitm,tNear,inf);

            /* if no child is hit, continue with early popped child */
            if (unlikely(none(hitm))) continue;

            const unsigned long hiti = toInt(hitm);
            const unsigned long pos_first = bitscan64(hiti);
            const unsigned long num_hitm = countbits(hiti);

            /* if a single child is hit, continue with that child */
            /* the child reference is read from the lower-bounds array at the
               lane index of the hit bit */
            curNode = ((unsigned int *)plower)[pos_first];
            if (likely(num_hitm == 1)) continue;

            /* if two children are hit, push in correct order */
            const unsigned long pos_second = bitscan64(pos_first,hiti);
            if (likely(num_hitm == 2))
                /* distances are compared through their raw 32-bit patterns.
                   NOTE(review): that matches float ordering only for
                   non-negative values - confirm tNear cannot be negative here */
                const unsigned int dist_first  = ((unsigned int*)&tNear)[pos_first];
                const unsigned int dist_second = ((unsigned int*)&tNear)[pos_second];
                const unsigned int node_first  = curNode;
                const unsigned int node_second = ((unsigned int*)plower)[pos_second];

                if (dist_first <= dist_second)
                    stack_node[sindex] = node_second;
                    assert(sindex < 3*BVH4i::maxDepth+1);
                    stack_node[sindex] = curNode;
                    curNode = node_second;
                    assert(sindex < 3*BVH4i::maxDepth+1);

            /* continue with closest child and push all others */
            const mic_f min_dist = set_min_lanes(tNear_pos);
            /* old_sindex, m_pos and plower_node are not used in the visible lines */
            const unsigned old_sindex = sindex;
            /* reserve one stack slot per hit child except the one continued with */
            sindex += countbits(hiti) - 1;
            assert(sindex < 3*BVH4i::maxDepth+1);

            const mic_m closest_child = eq(hitm,min_dist,tNear);
            const unsigned long closest_child_pos = bitscan64(closest_child);
            const mic_m m_pos = andn(hitm,andn(closest_child,(mic_m)((unsigned int)closest_child - 1)));
            const mic_i plower_node = load16i((int*)plower);
            curNode = ((unsigned int*)plower)[closest_child_pos];

        /* return if stack is empty */
        if (unlikely(curNode == BVH4i::invalidNode)) break;

        /* intersect one ray against four triangles */


        const Triangle1* tptr  = (Triangle1*) curNode.leaf(accel);
        prefetch<PFHINT_L1>(tptr + 3);
        prefetch<PFHINT_L1>(tptr + 2);
        prefetch<PFHINT_L1>(tptr + 1);
        prefetch<PFHINT_L1>(tptr + 0);

        const mic_i and_mask = broadcast4to16i(zlc4);

        /* vertex gathers; the argument lists are cut off in this excerpt */
        const mic_f v0 = gather_4f_zlc(and_mask,

        const mic_f v1 = gather_4f_zlc(and_mask,

        const mic_f v2 = gather_4f_zlc(and_mask,

        /* edge vectors, their cross product, and the terms from which the
           candidate coordinates u = uu/den and v = vv/den are formed */
        const mic_f e1 = v1 - v0;
        const mic_f e2 = v0 - v2;
        const mic_f normal = lcross_zxy(e1,e2);
        const mic_f org = v0 - org_xyz;
        const mic_f odzxy = msubr231(org * swizzle(dir_xyz,_MM_SWIZ_REG_DACB), dir_xyz, swizzle(org,_MM_SWIZ_REG_DACB));
        const mic_f den = ldot3_zxy(dir_xyz,normal);
        const mic_f rcp_den = rcp(den);
        const mic_f uu = ldot3_zxy(e2,odzxy);
        const mic_f vv = ldot3_zxy(e1,odzxy);
        const mic_f u = uu * rcp_den;
        const mic_f v = vv * rcp_den;

        /* NOTE(review): m_init is defined twice below; presumably an #else and
           #endif belong between and after the two lines. Mask 0x1111 is lane 0
           of each 4-lane group; the culling variant also requires den > 0. */
#if defined(__BACKFACE_CULLING__)
        const mic_m m_init = (mic_m)0x1111 & (den > zero);
        const mic_m m_init = 0x1111;
        /* chained compares, each starting from the previous mask: u >= 0,
           v >= 0 and u+v <= 1 (presumably masked compares - TODO confirm) */
        const mic_m valid_u = ge(m_init,u,zero);
        const mic_m valid_v = ge(valid_u,v,zero);
        const mic_m m_aperture = le(valid_v,u+v,mic_f::one());

        const mic_f nom = ldot3_zxy(org,normal);
        const mic_f t = rcp_den*nom;

        /* no candidate in this leaf: go on with the next stack entry */
        if (unlikely(none(m_aperture))) continue;

        /* keep candidates whose distance t lies between tnear and tfar */
        mic_m m_final  = lt(lt(m_aperture,min_dist_xyz,t),t,max_dist_xyz);

#if defined(__USE_RAY_MASK__)
        /* drop triangles whose mask shares no bit with the ray's mask */
        const mic_i rayMask(ray.mask);
        const mic_i triMask = swDDDD(gather16i_4i_align(&tptr[0].v2,&tptr[1].v2,&tptr[2].v2,&tptr[3].v2));
        const mic_m m_ray_mask = (rayMask & triMask) != mic_i::zero();
        m_final &= m_ray_mask;

#if defined(__INTERSECTION_FILTER__)

        /* did the ray hit one of the four triangles? */
        /* candidates are visited nearest first; each pass either stops the
           loop or clears the candidate's bit from m_final */
        while (any(m_final))
            const mic_f temp_t  = select(m_final,t,max_dist_xyz);
            const mic_f min_dist = vreduce_min(temp_t);
            const mic_m m_dist = eq(min_dist,temp_t);
            /* four lanes per triangle, so lane index / 4 is the triangle index */
            const size_t vecIndex = bitscan(toInt(m_dist));
            const size_t triIndex = vecIndex >> 2;
            const Triangle1  *__restrict__ tri_ptr = tptr + triIndex;
            /* keeps only the lowest set bit of m_dist */
            const mic_m m_tri = m_dist^(m_dist & (mic_m)((unsigned int)m_dist - 1));
            const mic_f gnormalx = mic_f(tri_ptr->Ng.x);
            const mic_f gnormaly = mic_f(tri_ptr->Ng.y);
            const mic_f gnormalz = mic_f(tri_ptr->Ng.z);
            const int geomID = tri_ptr->geomID();
            const int primID = tri_ptr->primID();
            Geometry* geom = ((Scene*)bvh->geometry)->get(geomID);

            /* a geometry without an occlusion filter ends the loop with this
               candidate's bit still set in m_final */
            if (likely(!geom->hasOcclusionFilter1())) break;

            /* the statement controlled by this if is not visible in this excerpt */
            if (runOcclusionFilter1(geom,ray,u,v,min_dist,gnormalx,gnormaly,gnormalz,m_tri,geomID,primID))

            m_final ^= m_tri; /* clear bit */

        /* any surviving candidate marks the ray by writing geomID = 0 */
        if (unlikely(any(m_final)))
            ray.geomID = 0;

    /* Occlusion query for a packet of 16 rays against a BVH4mb, processed one
     * ray at a time.
     *
     * For each ray, node and triangle data stored for two time steps are
     * blended with the ray's time value before testing.
     *
     * valid_i : per-lane activity; a lane is active when its entry is non-zero.
     * bvh     : supplies the root reference, node array and triangle array.
     * ray16   : org, dir, tnear, tfar and time (and mask under
     *           __USE_RAY_MASK__) are read; at the end the value 0 is stored
     *           to ray16.geomID under the mask "valid and terminated"
     *           (presumably a masked store - TODO confirm store16i semantics).
     *
     * NOTE(review): in this copy of the code the braces and the #else/#endif
     * lines are missing, the gather_4f_zlc argument lists are cut off and no
     * stack-index decrements are visible. The comments below describe only
     * the statements that are present; check the complete file before relying
     * on the exact control flow.
     */
    void BVH4mbIntersector16Single::occluded(mic_i* valid_i, BVH4mb* bvh, Ray16& ray16)
      /* near and node stack */
      __align(64) NodeRef stack_node[3*BVH4i::maxDepth+1];

      /* setup */
      const mic_m m_valid     = *(mic_i*)valid_i != mic_i(0);
      const mic3f rdir16      = rcp_safe(ray16.dir);
      /* bit set of finished lanes; inactive lanes start out as finished */
      unsigned int terminated = toInt(!m_valid);
      const mic_f inf         = mic_f(pos_inf);
      const mic_f zero        = mic_f::zero();

      const Node               * __restrict__ nodes = (Node     *)bvh->nodePtr();
      const BVH4mb::Triangle01 * __restrict__ accel = (BVH4mb::Triangle01 *)bvh->triPtr();

      /* slot 0 holds the invalidNode sentinel; it is written once for all rays */
      stack_node[0] = BVH4i::invalidNode;

      /* presumably bitscan64(prev,mask) yields the next set bit after prev,
         so this visits every active lane - TODO confirm */
      long rayIndex = -1;
      while((rayIndex = bitscan64(rayIndex,toInt(m_valid))) != BITSCAN_NO_BIT_SET_64)	    
	  /* restart the stack at the root for this ray */
	  stack_node[1] = bvh->root;
	  size_t sindex = 2;

	  /* pull the data of lane rayIndex out of the packet; presumably
	     loadAOS4to16f replicates x,y,z into every 4-float group - TODO confirm */
	  const mic_f org_xyz      = loadAOS4to16f(rayIndex,ray16.org.x,ray16.org.y,ray16.org.z);
	  const mic_f dir_xyz      = loadAOS4to16f(rayIndex,ray16.dir.x,ray16.dir.y,ray16.dir.z);
	  const mic_f rdir_xyz     = loadAOS4to16f(rayIndex,rdir16.x,rdir16.y,rdir16.z);
	  const mic_f org_rdir_xyz = org_xyz * rdir_xyz;
	  const mic_f min_dist_xyz = broadcast1to16f(&ray16.tnear[rayIndex]);
	  const mic_f max_dist_xyz = broadcast1to16f(&ray16.tfar[rayIndex]);
	  const mic_f time         = broadcast1to16f(&ray16.time[rayIndex]);

	  const unsigned int leaf_mask = BVH4I_LEAF_MASK;

	  /* outer loop: one iteration per node taken from the stack */
	  while (1)
	      NodeRef curNode = stack_node[sindex-1];

	      /* weight of the first time step; "time" weights the second */
	      const mic_f one_time = (mic_f::one() - time);
	      /* inner loop: descend until curNode is a leaf */
	      while (1) 
		  /* test if this is a leaf node */
		  if (unlikely(curNode.isLeaf(leaf_mask))) break;
		  const Node* __restrict__ const node = curNode.node(nodes);
		  const float* __restrict const plower = (float*)node->lower;
		  const float* __restrict const pupper = (float*)node->upper;

		  /* prefetch four consecutive 64-byte blocks starting at the node */
		  prefetch<PFHINT_L1>((char*)node + 0*64);
		  prefetch<PFHINT_L1>((char*)node + 1*64);
		  prefetch<PFHINT_L1>((char*)node + 2*64);
		  prefetch<PFHINT_L1>((char*)node + 3*64);

		  /* the same memory viewed as a BVH4mb node; bounds of all four
		     children are blended between the two stored boxes */
		  const BVH4mb::Node* __restrict__ const nodeMB = (BVH4mb::Node*)node;
		  const mic_f lower = one_time  * load16f((float*)nodeMB->lower) + time * load16f((float*)nodeMB->lower_t1);
		  const mic_f upper = one_time  * load16f((float*)nodeMB->upper) + time * load16f((float*)nodeMB->upper_t1);
		  /* intersect single ray with 4 bounding boxes */
		  const mic_f tLowerXYZ = lower * rdir_xyz - org_rdir_xyz;
		  const mic_f tUpperXYZ = upper * rdir_xyz - org_rdir_xyz;

		  /* mask 0x7777 covers lanes 0..2 (x,y,z) of each 4-lane group;
		     presumably the remaining lane keeps min_dist/max_dist - TODO confirm */
		  const mic_f tLower = mask_min(0x7777,min_dist_xyz,tLowerXYZ,tUpperXYZ);
		  const mic_f tUpper = mask_max(0x7777,max_dist_xyz,tLowerXYZ,tUpperXYZ);

		  /* NOTE(review): curNode has not been replaced by the next stack
		     entry yet (that happens a few lines below), so "next" is the
		     node currently being tested - confirm whether the L2 prefetch
		     was meant to follow the early pop */
		  const Node* __restrict__ const next = curNode.node(nodes);
		  prefetch<PFHINT_L2>((char*)next + 0);
		  prefetch<PFHINT_L2>((char*)next + 64);

		  /* reduce within each 4-lane group, then compare in lane 3 of
		     each group (mask 0x8888), giving at most one hit bit per child */
		  const mic_f tNear = vreduce_max4(tLower);
		  const mic_f tFar  = vreduce_min4(tUpper);  
		  const mic_m hitm = le(0x8888,tNear,tFar);
		  const mic_f tNear_pos = select(hitm,tNear,inf);

		  curNode = stack_node[sindex]; // early pop of next node

		  /* if no child is hit, continue with early popped child */
		  if (unlikely(none(hitm))) continue;
		  const unsigned long hiti = toInt(hitm);
		  const unsigned long pos_first = bitscan64(hiti);
		  const unsigned long num_hitm = countbits(hiti); 
		  /* if a single child is hit, continue with that child */
		  /* the child reference is read from the lower-bounds array at the
		     lane index of the hit bit */
		  curNode = ((unsigned int *)plower)[pos_first];
		  if (likely(num_hitm == 1)) continue;
		  /* if two children are hit, push in correct order */
		  const unsigned long pos_second = bitscan64(pos_first,hiti);
		  if (likely(num_hitm == 2))
		      /* distances are compared through their raw 32-bit patterns.
		         NOTE(review): that matches float ordering only for
		         non-negative values - confirm tNear cannot be negative here */
		      const unsigned int dist_first  = ((unsigned int*)&tNear)[pos_first];
		      const unsigned int dist_second = ((unsigned int*)&tNear)[pos_second];
		      const unsigned int node_first  = curNode;
		      const unsigned int node_second = ((unsigned int*)plower)[pos_second];
		      if (dist_first <= dist_second)
			  stack_node[sindex] = node_second;
			  assert(sindex < 3*BVH4i::maxDepth+1);
			  stack_node[sindex] = curNode;
			  curNode = node_second;
			  assert(sindex < 3*BVH4i::maxDepth+1);
		  /* continue with closest child and push all others */
		  const mic_f min_dist = set_min_lanes(tNear_pos);
		  /* old_sindex, m_pos and plower_node are not used in the visible lines */
		  const unsigned int old_sindex = sindex;
		  /* reserve one stack slot per hit child except the one continued with */
		  sindex += countbits(hiti) - 1;
		  assert(sindex < 3*BVH4i::maxDepth+1);
		  const mic_m closest_child = eq(hitm,min_dist,tNear);
		  const unsigned long closest_child_pos = bitscan64(closest_child);
		  const mic_m m_pos = andn(hitm,andn(closest_child,(mic_m)((unsigned int)closest_child - 1)));
		  const mic_i plower_node = load16i((int*)plower);
		  curNode = ((unsigned int*)plower)[closest_child_pos];

	      /* return if stack is empty */
	      if (unlikely(curNode == BVH4i::invalidNode)) break;

	      /* intersect one ray against four triangles */


	      const BVH4mb::Triangle01* tptr  = (BVH4mb::Triangle01*) curNode.leaf(accel);

	      prefetch<PFHINT_L1>((mic_f*)tptr +  0); 
	      prefetch<PFHINT_L1>((mic_f*)tptr +  1); 
	      prefetch<PFHINT_L1>((mic_f*)tptr +  2); 
	      prefetch<PFHINT_L1>((mic_f*)tptr +  3); 

	      /* vertex gathers for the first time step; the argument lists are
	         cut off in this excerpt */
	      const mic_i and_mask = broadcast4to16i(zlc4);
	      const mic_f v0_t0 = gather_4f_zlc(and_mask,
	      const mic_f v1_t0 = gather_4f_zlc(and_mask,
	      const mic_f v2_t0 = gather_4f_zlc(and_mask,

	      prefetch<PFHINT_L2>((mic_f*)tptr +  4); 
	      prefetch<PFHINT_L2>((mic_f*)tptr +  5); 
	      prefetch<PFHINT_L2>((mic_f*)tptr +  6); 
	      prefetch<PFHINT_L2>((mic_f*)tptr +  7); 

	      /* vertex gathers for the second time step (also cut off) */
	      const mic_f v0_t1 = gather_4f_zlc(and_mask,
	      const mic_f v1_t1 = gather_4f_zlc(and_mask,
	      const mic_f v2_t1 = gather_4f_zlc(and_mask,

	      /* blend each vertex between the two time steps */
	      const mic_f v0 = v0_t0 * one_time + time * v0_t1;
	      const mic_f v1 = v1_t0 * one_time + time * v1_t1;
	      const mic_f v2 = v2_t0 * one_time + time * v2_t1;

	      /* edge vectors, their cross product, and the terms from which the
	         candidate coordinates u = uu/den and v = vv/den are formed */
	      const mic_f e1 = v1 - v0;
	      const mic_f e2 = v0 - v2;	     
	      const mic_f normal = lcross_zxy(e1,e2);
	      const mic_f org = v0 - org_xyz;
	      const mic_f odzxy = msubr231(org * swizzle(dir_xyz,_MM_SWIZ_REG_DACB), dir_xyz, swizzle(org,_MM_SWIZ_REG_DACB));
	      const mic_f den = ldot3_zxy(dir_xyz,normal);	      
	      const mic_f rcp_den = rcp(den);
	      const mic_f uu = ldot3_zxy(e2,odzxy); 
	      const mic_f vv = ldot3_zxy(e1,odzxy); 
	      const mic_f u = uu * rcp_den;
	      const mic_f v = vv * rcp_den;

	      /* NOTE(review): m_init is defined twice below; presumably an #else
	         and #endif belong between and after the two lines. Mask 0x1111 is
	         lane 0 of each 4-lane group; the culling variant also requires
	         den > 0. */
#if defined(__BACKFACE_CULLING__)
	      const mic_m m_init = (mic_m)0x1111 & (den > zero);
	      const mic_m m_init = 0x1111;

	      /* chained compares, each starting from the previous mask: u >= 0,
	         v >= 0 and u+v <= 1 (presumably masked compares - TODO confirm) */
	      const mic_m valid_u = ge((mic_m)m_init,u,zero);
	      const mic_m valid_v = ge(valid_u,v,zero);
	      const mic_m m_aperture = le(valid_v,u+v,mic_f::one()); 

	      const mic_f nom = ldot3_zxy(org,normal);
	      const mic_f t = rcp_den*nom;
	      /* no candidate in this leaf: go on with the next stack entry */
	      if (unlikely(none(m_aperture))) continue;

	      /* keep candidates whose distance t lies between tnear and tfar */
	      mic_m m_final  = lt(lt(m_aperture,min_dist_xyz,t),t,max_dist_xyz);

#if defined(__USE_RAY_MASK__)
	      /* drop triangles whose mask shares no bit with the ray's mask */
	      const mic_i rayMask(ray16.mask[rayIndex]);
	      const mic_i triMask = swDDDD(gather16i_4i_align(&tptr[0].t0.v2,&tptr[1].t0.v2,&tptr[2].t0.v2,&tptr[3].t0.v2));
	      const mic_m m_ray_mask = (rayMask & triMask) != mic_i::zero();
	      m_final &= m_ray_mask;	      

	      /* any surviving candidate marks this ray as finished; presumably
	         mic_m::shift1[i] is the mask with only bit i set - TODO confirm */
	      if (unlikely(any(m_final)))
		  terminated |= mic_m::shift1[rayIndex];


	  /* stop visiting further rays once every lane is marked finished */
	  if (unlikely(all(toMask(terminated)))) break;

      /* store 0 to geomID for lanes that are both active and finished */
      store16i(m_valid & toMask(terminated),&ray16.geomID,0);