Example #1
0
/*! Occlusion (any-hit) query of a single ray against a BVH4i.
 *
 *  Traverses the hierarchy with an explicit node stack. Each inner node is
 *  tested against the ray with one 16-wide box test covering its four
 *  children; each leaf is tested as a group of four Triangle1 primitives.
 *  As soon as one triangle is accepted the function sets ray.geomID to 0 and
 *  returns. If the stack runs empty without an accepted hit the function
 *  returns without writing ray.geomID.
 *
 *  \param bvh  hierarchy to traverse; supplies the node array, the triangle
 *              array, the root reference and (when intersection filters are
 *              compiled in) the scene used to look up geometries
 *  \param ray  ray to test; org, dir, tnear and tfar are read, geomID is
 *              written on occlusion. NOTE(review): runOcclusionFilter1 also
 *              receives the ray and may modify it - confirm against its
 *              definition.
 */
void BVH4iIntersector1::occluded(BVH4i* bvh, Ray& ray)
{
    /* node stack (only node references are stored, no per-entry distances) */
    __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1];

    /* setup */
    const mic3f rdir16      = rcp_safe(mic3f(ray.dir.x,ray.dir.y,ray.dir.z));
    const mic_f inf         = mic_f(pos_inf);
    const mic_f zero        = mic_f::zero();

    const Node      * __restrict__ nodes = (Node     *)bvh->nodePtr();
    const Triangle1 * __restrict__ accel = (Triangle1*)bvh->triPtr();

    /* slot 0 holds a sentinel that terminates the traversal once it is popped,
       slot 1 holds the root, so the stack starts with two entries */
    stack_node[0] = BVH4i::invalidNode;
    stack_node[1] = bvh->root;
    size_t sindex = 2;

    /* ray data replicated across the 16 lanes; presumably laid out as x,y,z
       per group of four lanes so it lines up with the node bounds - TODO confirm
       against loadAOS4to16f */
    const mic_f org_xyz      = loadAOS4to16f(ray.org.x,ray.org.y,ray.org.z);
    const mic_f dir_xyz      = loadAOS4to16f(ray.dir.x,ray.dir.y,ray.dir.z);
    const mic_f rdir_xyz     = loadAOS4to16f(rdir16.x[0],rdir16.y[0],rdir16.z[0]);
    /* precomputed so that the slab test below is one multiply and one subtract:
       (bound - org) * rdir == bound * rdir - org * rdir */
    const mic_f org_rdir_xyz = org_xyz * rdir_xyz;
    const mic_f min_dist_xyz = broadcast1to16f(&ray.tnear);
    const mic_f max_dist_xyz = broadcast1to16f(&ray.tfar);

    const unsigned int leaf_mask = BVH4I_LEAF_MASK;

    /* outer loop: one iteration per popped stack entry */
    while (1)
    {
        /* pop the next node to process */
        NodeRef curNode = stack_node[sindex-1];
        sindex--;

        /* inner loop: descend through inner nodes until a leaf (or the
           sentinel) is reached */
        while (1)
        {
            /* test if this is a leaf node */
            /* NOTE(review): the sentinel must also satisfy isLeaf() for this
               loop to terminate on an empty stack - confirm */
            if (unlikely(curNode.isLeaf(leaf_mask))) break;

            const Node* __restrict__ const node = curNode.node(nodes);
            const float* __restrict const plower = (float*)node->lower;
            const float* __restrict const pupper = (float*)node->upper;

            prefetch<PFHINT_L1>((char*)node + 0);
            prefetch<PFHINT_L1>((char*)node + 64);

            /* intersect single ray with 4 bounding boxes */
            /* mask 0x7777 selects the low three lanes of every 4-lane group;
               presumably the remaining lanes keep min_dist_xyz / max_dist_xyz
               - TODO confirm mask_min / mask_max semantics */
            const mic_f tLowerXYZ = load16f(plower) * rdir_xyz - org_rdir_xyz;
            const mic_f tUpperXYZ = load16f(pupper) * rdir_xyz - org_rdir_xyz;
            const mic_f tLower = mask_min(0x7777,min_dist_xyz,tLowerXYZ,tUpperXYZ);
            const mic_f tUpper = mask_max(0x7777,max_dist_xyz,tLowerXYZ,tUpperXYZ);

            /* speculatively pop the next stack entry: it becomes the current
               node if none of the four children is hit */
            sindex--;
            curNode = stack_node[sindex];

            /* start fetching the speculatively popped node */
            const Node* __restrict__ const next = curNode.node(nodes);
            prefetch<PFHINT_L2>((char*)next + 0);
            prefetch<PFHINT_L2>((char*)next + 64);

            /* per-box entry and exit distance; the hit mask only uses the top
               lane of each 4-lane group (0x8888), i.e. one bit per child */
            const mic_f tNear = vreduce_max4(tLower);
            const mic_f tFar  = vreduce_min4(tUpper);
            const mic_m hitm = le(0x8888,tNear,tFar);
            const mic_f tNear_pos = select(hitm,tNear,inf);


            /* if no child is hit, continue with early popped child */
            if (unlikely(none(hitm))) continue;
            /* at least one child is hit: undo the speculative pop */
            sindex++;

            const unsigned long hiti = toInt(hitm);
            const unsigned long pos_first = bitscan64(hiti);
            const unsigned long num_hitm = countbits(hiti);

            /* if a single child is hit, continue with that child */
            /* the child reference is read through plower as an unsigned int at
               the lane index of the hit bit, i.e. it shares the 4th lane of
               that child's lower bound */
            curNode = ((unsigned int *)plower)[pos_first];
            if (likely(num_hitm == 1)) continue;

            /* if two children are hit, push in correct order */
            const unsigned long pos_second = bitscan64(pos_first,hiti);
            if (likely(num_hitm == 2))
            {
                /* distances are compared through their raw bit patterns as
                   unsigned ints. NOTE(review): this matches float ordering
                   only for non-negative values - confirm tNear cannot be
                   negative here */
                const unsigned int dist_first  = ((unsigned int*)&tNear)[pos_first];
                const unsigned int dist_second = ((unsigned int*)&tNear)[pos_second];
                const unsigned int node_first  = curNode;
                const unsigned int node_second = ((unsigned int*)plower)[pos_second];

                if (dist_first <= dist_second)
                {
                    /* first child is closer: descend into it, defer the second */
                    stack_node[sindex] = node_second;
                    sindex++;
                    assert(sindex < 3*BVH4i::maxDepth+1);
                    continue;
                }
                else
                {
                    /* second child is closer: descend into it, defer the first */
                    stack_node[sindex] = curNode;
                    curNode = node_second;
                    sindex++;
                    assert(sindex < 3*BVH4i::maxDepth+1);
                    continue;
                }
            }

            /* continue with closest child and push all others */
            const mic_f min_dist = set_min_lanes(tNear_pos);
            const unsigned old_sindex = sindex;
            /* all hit children except the one traversed next go onto the stack */
            sindex += countbits(hiti) - 1;
            assert(sindex < 3*BVH4i::maxDepth+1);

            const mic_m closest_child = eq(hitm,min_dist,tNear);
            const unsigned long closest_child_pos = bitscan64(closest_child);
            /* m_pos appears to be the hit mask with the lowest set bit of
               closest_child removed - TODO confirm the operand order of andn */
            const mic_m m_pos = andn(hitm,andn(closest_child,(mic_m)((unsigned int)closest_child - 1)));
            const mic_i plower_node = load16i((int*)plower);
            curNode = ((unsigned int*)plower)[closest_child_pos];
            /* presumably writes the lanes selected by m_pos contiguously,
               starting at the old top of stack - TODO confirm compactustore16i */
            compactustore16i(m_pos,&stack_node[old_sindex],plower_node);
        }



        /* return if stack is empty */
        if (unlikely(curNode == BVH4i::invalidNode)) break;


        /* intersect one ray against four triangles */

        //////////////////////////////////////////////////////////////////////////////////////////////////

        /* the leaf reference resolves to the first of four consecutive triangles */
        const Triangle1* tptr  = (Triangle1*) curNode.leaf(accel);
        prefetch<PFHINT_L1>(tptr + 3);
        prefetch<PFHINT_L1>(tptr + 2);
        prefetch<PFHINT_L1>(tptr + 1);
        prefetch<PFHINT_L1>(tptr + 0);

        const mic_i and_mask = broadcast4to16i(zlc4);

        /* gather the three vertices of the four triangles, one triangle per
           4-lane group; and_mask is handed to the gather - presumably to clear
           the unused 4th lane, TODO confirm gather_4f_zlc */
        const mic_f v0 = gather_4f_zlc(and_mask,
                                       (float*)&tptr[0].v0,
                                       (float*)&tptr[1].v0,
                                       (float*)&tptr[2].v0,
                                       (float*)&tptr[3].v0);

        const mic_f v1 = gather_4f_zlc(and_mask,
                                       (float*)&tptr[0].v1,
                                       (float*)&tptr[1].v1,
                                       (float*)&tptr[2].v1,
                                       (float*)&tptr[3].v1);

        const mic_f v2 = gather_4f_zlc(and_mask,
                                       (float*)&tptr[0].v2,
                                       (float*)&tptr[1].v2,
                                       (float*)&tptr[2].v2,
                                       (float*)&tptr[3].v2);

        /* edge vectors, geometric normal and the barycentric numerators;
           u and v are the numerators scaled by the reciprocal of den */
        const mic_f e1 = v1 - v0;
        const mic_f e2 = v0 - v2;
        const mic_f normal = lcross_zxy(e1,e2);
        const mic_f org = v0 - org_xyz;
        const mic_f odzxy = msubr231(org * swizzle(dir_xyz,_MM_SWIZ_REG_DACB), dir_xyz, swizzle(org,_MM_SWIZ_REG_DACB));
        const mic_f den = ldot3_zxy(dir_xyz,normal);
        const mic_f rcp_den = rcp(den);
        const mic_f uu = ldot3_zxy(e2,odzxy);
        const mic_f vv = ldot3_zxy(e1,odzxy);
        const mic_f u = uu * rcp_den;
        const mic_f v = vv * rcp_den;

        /* start with one candidate lane per 4-lane group (0x1111); with
           backface culling only lanes with a positive denominator remain */
#if defined(__BACKFACE_CULLING__)
        const mic_m m_init = (mic_m)0x1111 & (den > zero);
#else
        const mic_m m_init = 0x1111;
#endif
        /* successively narrow the mask: u >= 0, v >= 0, u + v <= 1 */
        const mic_m valid_u = ge(m_init,u,zero);
        const mic_m valid_v = ge(valid_u,v,zero);
        const mic_m m_aperture = le(valid_v,u+v,mic_f::one());

        const mic_f nom = ldot3_zxy(org,normal);
        const mic_f t = rcp_den*nom;

        /* no triangle passed the barycentric test: pop the next stack entry */
        if (unlikely(none(m_aperture))) continue;

        /* keep only hits whose distance lies between tnear and tfar
           (presumably both comparisons are strict - confirm lt semantics) */
        mic_m m_final  = lt(lt(m_aperture,min_dist_xyz,t),t,max_dist_xyz);

#if defined(__USE_RAY_MASK__)
        /* drop triangles whose mask has no bit in common with the ray mask;
           the triangle mask is taken from the 4th lane of the v2 data */
        const mic_i rayMask(ray.mask);
        const mic_i triMask = swDDDD(gather16i_4i_align(&tptr[0].v2,&tptr[1].v2,&tptr[2].v2,&tptr[3].v2));
        const mic_m m_ray_mask = (rayMask & triMask) != mic_i::zero();
        m_final &= m_ray_mask;
#endif

#if defined(__INTERSECTION_FILTER__)

        /* did the ray hit one of the four triangles? */
        /* visit the remaining candidates from smallest t upwards until one is
           accepted or none is left */
        while (any(m_final))
        {
            /* find the lane(s) with the smallest t among the candidates */
            const mic_f temp_t  = select(m_final,t,max_dist_xyz);
            const mic_f min_dist = vreduce_min(temp_t);
            const mic_m m_dist = eq(min_dist,temp_t);
            const size_t vecIndex = bitscan(toInt(m_dist));
            /* four lanes per triangle, so lane index / 4 is the triangle index */
            const size_t triIndex = vecIndex >> 2;
            const Triangle1  *__restrict__ tri_ptr = tptr + triIndex;
            /* isolate the lowest set bit of m_dist */
            const mic_m m_tri = m_dist^(m_dist & (mic_m)((unsigned int)m_dist - 1));
            const mic_f gnormalx = mic_f(tri_ptr->Ng.x);
            const mic_f gnormaly = mic_f(tri_ptr->Ng.y);
            const mic_f gnormalz = mic_f(tri_ptr->Ng.z);
            const int geomID = tri_ptr->geomID();
            const int primID = tri_ptr->primID();
            Geometry* geom = ((Scene*)bvh->geometry)->get(geomID);

            /* geometry without a filter: the hit counts, m_final stays non-empty */
            if (likely(!geom->hasOcclusionFilter1())) break;

            /* filter returned true: leave the loop with m_final non-empty, so
               the ray is reported as occluded below */
            if (runOcclusionFilter1(geom,ray,u,v,min_dist,gnormalx,gnormaly,gnormalz,m_tri,geomID,primID))
                break;

            /* filter returned false: discard this candidate and try the next */
            m_final ^= m_tri; /* clear bit */
        }
#endif

        /* any surviving candidate means the ray is occluded: flag it by
           setting geomID to 0 and stop traversing */
        if (unlikely(any(m_final)))
        {
            ray.geomID = 0;
            return;
        }
        //////////////////////////////////////////////////////////////////////////////////////////////////

    }
}
Example #2
0
  /*! Recursively applies tree rotations below \p nodeID.
   *
   *  All four children are rotated first. Then every swap of a direct
   *  child ("other") with a grandchild slot ("target") of a different,
   *  non-leaf child is evaluated, and the swap that shrinks that child's
   *  half area the most is performed. Swaps that would violate the depth
   *  limit are not considered.
   *
   *  \param nodeID  root of the subtree to optimize
   *  \param depth   depth of \p nodeID in the tree
   *  \return 0 for a leaf, otherwise a conservative depth of the subtree
   */
  size_t BVH4MB::rotate(Base* nodeID, size_t depth)
  {
    /* a leaf has nothing below it that could be rotated */
    if (nodeID->isLeaf())
      return 0;

    Node* parent = nodeID->node();

    /* optimize the four subtrees first and remember how deep each one is */
    ssei cdepth;
    for (size_t i=0; i<4; i++)
      cdepth[i] = (int)rotate(parent->child[i],depth+1);

    /* half surface area of every child box */
    ssef extentX = parent->upper_x-parent->lower_x;
    ssef extentY = parent->upper_y-parent->lower_y;
    ssef extentZ = parent->upper_z-parent->lower_z;
    ssef childArea = extentX*(extentY + extentZ) + extentY*extentZ;

    /* turn the per-axis bounds of the parent into one box per child */
    ssef pl0,pl1,pl2,pl3;
    transpose(parent->lower_x,parent->lower_y,parent->lower_z,ssef(zero),pl0,pl1,pl2,pl3);
    ssef pu0,pu1,pu2,pu3;
    transpose(parent->upper_x,parent->upper_y,parent->upper_z,ssef(zero),pu0,pu1,pu2,pu3);
    BBox<ssef> other0(pl0,pu0);
    BBox<ssef> other1(pl1,pu1);
    BBox<ssef> other2(pl2,pu2);
    BBox<ssef> other3(pl3,pu3);

    /* children that may be pushed one level down without exceeding the depth
       limit; this does not change while the candidates are scanned */
    sseb depthOk = ssei(int(depth+1))+cdepth <= ssei(maxDepth);

    /* scan all (child, other, target) combinations for the best swap */
    float bestCost = pos_inf;
    int bestChild = -1;
    int bestTarget = -1;
    int bestOther = -1;
    for (size_t c=0; c<4; c++)
    {
      /* only inner nodes have grandchild slots to swap into */
      if (parent->child[c]->isLeaf())
        continue;
      Node* inner = parent->child[c]->node();

      /* one box per grandchild */
      ssef cl0,cl1,cl2,cl3;
      transpose(inner->lower_x,inner->lower_y,inner->lower_z,ssef(zero),cl0,cl1,cl2,cl3);
      ssef cu0,cu1,cu2,cu3;
      transpose(inner->upper_x,inner->upper_y,inner->upper_z,ssef(zero),cu0,cu1,cu2,cu3);
      BBox<ssef> target0(cl0,cu0);
      BBox<ssef> target1(cl1,cu1);
      BBox<ssef> target2(cl2,cu2);
      BBox<ssef> target3(cl3,cu3);

      /* resulting half area when other0 replaces each grandchild in turn */
      ssef costs0 = ssef(halfArea3f(merge(other0 ,target1,target2,target3)),
                         halfArea3f(merge(target0,other0 ,target2,target3)),
                         halfArea3f(merge(target0,target1,other0 ,target3)),
                         halfArea3f(merge(target0,target1,target2,other0 )));
      ssef least0 = vreduce_min(costs0);

      /* same for other1 */
      ssef costs1 = ssef(halfArea3f(merge(other1 ,target1,target2,target3)),
                         halfArea3f(merge(target0,other1 ,target2,target3)),
                         halfArea3f(merge(target0,target1,other1 ,target3)),
                         halfArea3f(merge(target0,target1,target2,other1 )));
      ssef least1 = vreduce_min(costs1);

      /* same for other2 */
      ssef costs2 = ssef(halfArea3f(merge(other2 ,target1,target2,target3)),
                         halfArea3f(merge(target0,other2 ,target2,target3)),
                         halfArea3f(merge(target0,target1,other2 ,target3)),
                         halfArea3f(merge(target0,target1,target2,other2 )));
      ssef least2 = vreduce_min(costs2);

      /* same for other3 */
      ssef costs3 = ssef(halfArea3f(merge(other3 ,target1,target2,target3)),
                         halfArea3f(merge(target0,other3 ,target2,target3)),
                         halfArea3f(merge(target0,target1,other3 ,target3)),
                         halfArea3f(merge(target0,target1,target2,other3 )));
      ssef least3 = vreduce_min(costs3);

      /* cheapest target slot for each of the four candidates */
      int slot[4] = {
        (int)__bsf(movemask(least0 == costs0)),
        (int)__bsf(movemask(least1 == costs1)),
        (int)__bsf(movemask(least2 == costs2)),
        (int)__bsf(movemask(least3 == costs3))
      };

      /* pick the cheapest candidate among those allowed by the depth limit */
      ssef otherCost = ssef(extract<0>(least0),extract<0>(least1),extract<0>(least2),extract<0>(least3));
      if (none(depthOk))
        continue;

      size_t n = select_min(depthOk,otherCost);

      /* change of the child's half area: negative values are improvements */
      float delta = otherCost[n]-childArea[c];

      /* never swap a node with itself, and keep only the best swap so far */
      if (n == c || !(delta < bestCost))
        continue;

      bestCost = delta;
      bestChild = (int)c;
      bestOther = (int)n;
      bestTarget = slot[n];
    }

    /* no swap lowers the cost: leave the node as it is */
    if (bestCost >= 0)
      return 1+reduce_max(cdepth);

    /* apply the winning swap and refit the parent's bounds of the affected child */
    Node* pivot = parent->child[bestChild]->node();
    swap(parent,bestOther,pivot,bestTarget);

    parent->lower_x[bestChild] = reduce_min(pivot->lower_x);
    parent->lower_y[bestChild] = reduce_min(pivot->lower_y);
    parent->lower_z[bestChild] = reduce_min(pivot->lower_z);
    parent->upper_x[bestChild] = reduce_max(pivot->upper_x);
    parent->upper_y[bestChild] = reduce_max(pivot->upper_y);
    parent->upper_z[bestChild] = reduce_max(pivot->upper_z);

    parent->lower_dx[bestChild] = reduce_min(pivot->lower_dx);
    parent->lower_dy[bestChild] = reduce_min(pivot->lower_dy);
    parent->lower_dz[bestChild] = reduce_min(pivot->lower_dz);
    parent->upper_dx[bestChild] = reduce_max(pivot->upper_dx);
    parent->upper_dy[bestChild] = reduce_max(pivot->upper_dy);
    parent->upper_dz[bestChild] = reduce_max(pivot->upper_dz);

    /* bestOther moved one level down. The result is conservative because the
       grandchild that moved up may have been on the deepest path. */
    cdepth[bestOther]++;
    return 1+reduce_max(cdepth);
  }
Example #3
0
  void BVH4HairBuilder::parallelBinningGlobal(const size_t threadID, const size_t numThreads)
  {
    BuildRecord &current = global_sharedData.rec;

    const unsigned int items = current.items();
    const unsigned int startID = current.begin + ((threadID+0)*items/numThreads);
    const unsigned int endID   = current.begin + ((threadID+1)*items/numThreads);

    const mic_f centroidMin = broadcast4to16f(&current.bounds.centroid2.lower);
    const mic_f centroidMax = broadcast4to16f(&current.bounds.centroid2.upper);

    const mic_f centroidBoundsMin_2 = centroidMin;
    const mic_f centroidDiagonal_2  = centroidMax-centroidMin;
    const mic_f scale = select(centroidDiagonal_2 != 0.0f,rcp(centroidDiagonal_2) * mic_f(16.0f * 0.99f),mic_f::zero());

    Bezier1i  *__restrict__ const tmp_prims = (Bezier1i*)accel;

    fastbin_copy<Bezier1i,false>(prims,tmp_prims,startID,endID,centroidBoundsMin_2,scale,global_bin16[threadID]);    

    LockStepTaskScheduler::syncThreadsWithReduction( threadID, numThreads, reduceBinsParallel, global_bin16 );
    
    if (threadID == 0)
      {
	const float voxelArea = area(current.bounds.geometry);

	global_sharedData.split.cost = items * voxelArea * INTERSECTION_COST;;
	
	const Bin16 &bin16 = global_bin16[0];

	for (size_t dim=0;dim<3;dim++)
	  {
	    if (unlikely(centroidDiagonal_2[dim] == 0.0f)) continue;

	    const mic_f rArea = prefix_area_rl(bin16.min_x[dim],bin16.min_y[dim],bin16.min_z[dim],
					       bin16.max_x[dim],bin16.max_y[dim],bin16.max_z[dim]);
	    const mic_f lArea = prefix_area_lr(bin16.min_x[dim],bin16.min_y[dim],bin16.min_z[dim],
					       bin16.max_x[dim],bin16.max_y[dim],bin16.max_z[dim]);
	    const mic_i lnum  = prefix_count(bin16.count[dim]);

	    const mic_i rnum    = mic_i(items) - lnum;
	    const mic_i lblocks = (lnum + mic_i(3)) >> 2;
	    const mic_i rblocks = (rnum + mic_i(3)) >> 2;
	    const mic_m m_lnum  = lnum == 0;
	    const mic_m m_rnum  = rnum == 0;
	    const mic_f cost    = select(m_lnum|m_rnum,mic_f::inf(),lArea * mic_f(lblocks) + rArea * mic_f(rblocks) + voxelArea );

	    if (lt(cost,mic_f(global_sharedData.split.cost)))
	      {

		const mic_f min_cost    = vreduce_min(cost); 
		const mic_m m_pos       = min_cost == cost;
		const unsigned long pos = bitscan64(m_pos);	    
		
		assert(pos < 15);
		if (pos < 15)
		  {
		    global_sharedData.split.cost    = cost[pos];
		    global_sharedData.split.pos     = pos+1;
		    global_sharedData.split.dim     = dim;	    
		    global_sharedData.split.numLeft = lnum[pos];
		  }
	      }
	  }
      }
  }
Example #4
0
    /*! Recursively applies tree rotations below \p parentRef.
     *
     *  After rotating all four children, every swap of a direct child
     *  (child1) with a sub-child slot (child2child) of a different, inner
     *  child (child2) is evaluated. The swap that reduces the half area of
     *  child2 the most is performed; swaps that would exceed the maximal
     *  build depth are not considered.
     *
     *  \param parentRef  root of the subtree to optimize
     *  \param depth      depth of \p parentRef in the tree
     *  \return 0 for barriers and leaves, otherwise a conservative depth of
     *          the subtree
     */
    size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth)
    {
      /* barriers and leaves have nothing that could be rotated */
      if (parentRef.isBarrier() || parentRef.isLeaf())
        return 0;

      Node* parent = parentRef.node();

      /* optimize the four subtrees first and remember how deep each one is */
      vint4 cdepth;
      for (size_t i=0; i<4; i++)
        cdepth[i] = (int)rotate(parent->child(i),depth+1);

      /* half surface area of every child box */
      vfloat4 extentX = parent->upper_x-parent->lower_x;
      vfloat4 extentY = parent->upper_y-parent->lower_y;
      vfloat4 extentZ = parent->upper_z-parent->lower_z;
      vfloat4 childArea = extentX*(extentY + extentZ) + extentY*extentZ;

      /* one box per direct child */
      BBox<vfloat4> child1_0,child1_1,child1_2,child1_3;
      parent->bounds(child1_0,child1_1,child1_2,child1_3);

      /* children that may be pushed one level down without exceeding the
         depth limit; this does not change while the candidates are scanned */
      const size_t depthLimit = BVH4::maxBuildDepth;
      vbool4 depthOk = vint4(int(depth+1))+cdepth <= vint4(depthLimit);

      /* scan all (child1, child2, child2child) combinations for the best swap;
         only swaps with a negative area change are accepted */
      float bestArea = 0;
      size_t bestChild1 = -1;
      size_t bestChild2 = -1;
      size_t bestChild2Child = -1;
      for (size_t c2=0; c2<4; c2++)
      {
        /* only inner nodes have sub-child slots to swap into */
        if (parent->child(c2).isBarrier() || parent->child(c2).isLeaf())
          continue;
        Node* inner = parent->child(c2).node();

        /* one box per sub-child of child2 */
        BBox<vfloat4> child2c0,child2c1,child2c2,child2c3;
        inner->bounds(child2c0,child2c1,child2c2,child2c3);

        /* resulting half area when child1_0 replaces each sub-child in turn */
        vfloat4 costs0 = vfloat4(halfArea3f(merge(child1_0,child2c1,child2c2,child2c3)),
                                 halfArea3f(merge(child2c0,child1_0,child2c2,child2c3)),
                                 halfArea3f(merge(child2c0,child2c1,child1_0,child2c3)),
                                 halfArea3f(merge(child2c0,child2c1,child2c2,child1_0)));
        vfloat4 least0 = vreduce_min(costs0);

        /* same for child1_1 */
        vfloat4 costs1 = vfloat4(halfArea3f(merge(child1_1,child2c1,child2c2,child2c3)),
                                 halfArea3f(merge(child2c0,child1_1,child2c2,child2c3)),
                                 halfArea3f(merge(child2c0,child2c1,child1_1,child2c3)),
                                 halfArea3f(merge(child2c0,child2c1,child2c2,child1_1)));
        vfloat4 least1 = vreduce_min(costs1);

        /* same for child1_2 */
        vfloat4 costs2 = vfloat4(halfArea3f(merge(child1_2,child2c1,child2c2,child2c3)),
                                 halfArea3f(merge(child2c0,child1_2,child2c2,child2c3)),
                                 halfArea3f(merge(child2c0,child2c1,child1_2,child2c3)),
                                 halfArea3f(merge(child2c0,child2c1,child2c2,child1_2)));
        vfloat4 least2 = vreduce_min(costs2);

        /* same for child1_3 */
        vfloat4 costs3 = vfloat4(halfArea3f(merge(child1_3,child2c1,child2c2,child2c3)),
                                 halfArea3f(merge(child2c0,child1_3,child2c2,child2c3)),
                                 halfArea3f(merge(child2c0,child2c1,child1_3,child2c3)),
                                 halfArea3f(merge(child2c0,child2c1,child2c2,child1_3)));
        vfloat4 least3 = vreduce_min(costs3);

        /* cheapest sub-child slot for each of the four candidates */
        int slot[4] = {
          (int)__bsf(movemask(least0 == costs0)),
          (int)__bsf(movemask(least1 == costs1)),
          (int)__bsf(movemask(least2 == costs2)),
          (int)__bsf(movemask(least3 == costs3))
        };

        /* change of child2's half area per candidate: negative is an improvement */
        vfloat4 areaDelta = vfloat4(extract<0>(least0),extract<0>(least1),extract<0>(least2),extract<0>(least3)) - vfloat4(childArea[c2]);

        /* candidates must respect the depth limit and must not be child2 itself */
        vbool4 valid = depthOk;
        valid &= vint4(c2) != vint4(step);
        if (none(valid))
          continue;

        size_t c1 = select_min(valid,areaDelta);
        float delta = areaDelta[c1];

        /* c1 == c2 can still be selected when the bounds are NaNs; otherwise
           keep the candidate only if it beats the best swap so far */
        if (c1 == c2 || !(delta < bestArea))
          continue;

        bestArea = delta;
        bestChild1 = c1;
        bestChild2 = c2;
        bestChild2Child = slot[c1];
      }

      /* no swap lowers the cost: leave the node as it is */
      if (bestChild1 == size_t(-1))
        return 1+reduce_max(cdepth);

      /* apply the winning swap, refit child2 inside the parent and compact both nodes */
      Node* pivot = parent->child(bestChild2).node();
      BVH4::swap(parent,bestChild1,pivot,bestChild2Child);
      parent->set(bestChild2,pivot->bounds());
      BVH4::compact(parent);
      BVH4::compact(pivot);

      /* bestChild1 moved one level down. The result is conservative because
         the sub-child that moved up may have been on the deepest path. */
      cdepth[bestChild1]++;
      return 1+reduce_max(cdepth);
    }