void BVH4iIntersector1::occluded(BVH4i* bvh, Ray& ray) { /* near and node stack */ __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1]; /* setup */ const mic3f rdir16 = rcp_safe(mic3f(ray.dir.x,ray.dir.y,ray.dir.z)); const mic_f inf = mic_f(pos_inf); const mic_f zero = mic_f::zero(); const Node * __restrict__ nodes = (Node *)bvh->nodePtr(); const Triangle1 * __restrict__ accel = (Triangle1*)bvh->triPtr(); stack_node[0] = BVH4i::invalidNode; stack_node[1] = bvh->root; size_t sindex = 2; const mic_f org_xyz = loadAOS4to16f(ray.org.x,ray.org.y,ray.org.z); const mic_f dir_xyz = loadAOS4to16f(ray.dir.x,ray.dir.y,ray.dir.z); const mic_f rdir_xyz = loadAOS4to16f(rdir16.x[0],rdir16.y[0],rdir16.z[0]); const mic_f org_rdir_xyz = org_xyz * rdir_xyz; const mic_f min_dist_xyz = broadcast1to16f(&ray.tnear); const mic_f max_dist_xyz = broadcast1to16f(&ray.tfar); const unsigned int leaf_mask = BVH4I_LEAF_MASK; while (1) { NodeRef curNode = stack_node[sindex-1]; sindex--; while (1) { /* test if this is a leaf node */ if (unlikely(curNode.isLeaf(leaf_mask))) break; const Node* __restrict__ const node = curNode.node(nodes); const float* __restrict const plower = (float*)node->lower; const float* __restrict const pupper = (float*)node->upper; prefetch<PFHINT_L1>((char*)node + 0); prefetch<PFHINT_L1>((char*)node + 64); /* intersect single ray with 4 bounding boxes */ const mic_f tLowerXYZ = load16f(plower) * rdir_xyz - org_rdir_xyz; const mic_f tUpperXYZ = load16f(pupper) * rdir_xyz - org_rdir_xyz; const mic_f tLower = mask_min(0x7777,min_dist_xyz,tLowerXYZ,tUpperXYZ); const mic_f tUpper = mask_max(0x7777,max_dist_xyz,tLowerXYZ,tUpperXYZ); sindex--; curNode = stack_node[sindex]; const Node* __restrict__ const next = curNode.node(nodes); prefetch<PFHINT_L2>((char*)next + 0); prefetch<PFHINT_L2>((char*)next + 64); const mic_f tNear = vreduce_max4(tLower); const mic_f tFar = vreduce_min4(tUpper); const mic_m hitm = le(0x8888,tNear,tFar); const mic_f tNear_pos = select(hitm,tNear,inf); 
/* if no child is hit, continue with early popped child */ if (unlikely(none(hitm))) continue; sindex++; const unsigned long hiti = toInt(hitm); const unsigned long pos_first = bitscan64(hiti); const unsigned long num_hitm = countbits(hiti); /* if a single child is hit, continue with that child */ curNode = ((unsigned int *)plower)[pos_first]; if (likely(num_hitm == 1)) continue; /* if two children are hit, push in correct order */ const unsigned long pos_second = bitscan64(pos_first,hiti); if (likely(num_hitm == 2)) { const unsigned int dist_first = ((unsigned int*)&tNear)[pos_first]; const unsigned int dist_second = ((unsigned int*)&tNear)[pos_second]; const unsigned int node_first = curNode; const unsigned int node_second = ((unsigned int*)plower)[pos_second]; if (dist_first <= dist_second) { stack_node[sindex] = node_second; sindex++; assert(sindex < 3*BVH4i::maxDepth+1); continue; } else { stack_node[sindex] = curNode; curNode = node_second; sindex++; assert(sindex < 3*BVH4i::maxDepth+1); continue; } } /* continue with closest child and push all others */ const mic_f min_dist = set_min_lanes(tNear_pos); const unsigned old_sindex = sindex; sindex += countbits(hiti) - 1; assert(sindex < 3*BVH4i::maxDepth+1); const mic_m closest_child = eq(hitm,min_dist,tNear); const unsigned long closest_child_pos = bitscan64(closest_child); const mic_m m_pos = andn(hitm,andn(closest_child,(mic_m)((unsigned int)closest_child - 1))); const mic_i plower_node = load16i((int*)plower); curNode = ((unsigned int*)plower)[closest_child_pos]; compactustore16i(m_pos,&stack_node[old_sindex],plower_node); } /* return if stack is empty */ if (unlikely(curNode == BVH4i::invalidNode)) break; /* intersect one ray against four triangles */ ////////////////////////////////////////////////////////////////////////////////////////////////// const Triangle1* tptr = (Triangle1*) curNode.leaf(accel); prefetch<PFHINT_L1>(tptr + 3); prefetch<PFHINT_L1>(tptr + 2); prefetch<PFHINT_L1>(tptr + 1); 
prefetch<PFHINT_L1>(tptr + 0); const mic_i and_mask = broadcast4to16i(zlc4); const mic_f v0 = gather_4f_zlc(and_mask, (float*)&tptr[0].v0, (float*)&tptr[1].v0, (float*)&tptr[2].v0, (float*)&tptr[3].v0); const mic_f v1 = gather_4f_zlc(and_mask, (float*)&tptr[0].v1, (float*)&tptr[1].v1, (float*)&tptr[2].v1, (float*)&tptr[3].v1); const mic_f v2 = gather_4f_zlc(and_mask, (float*)&tptr[0].v2, (float*)&tptr[1].v2, (float*)&tptr[2].v2, (float*)&tptr[3].v2); const mic_f e1 = v1 - v0; const mic_f e2 = v0 - v2; const mic_f normal = lcross_zxy(e1,e2); const mic_f org = v0 - org_xyz; const mic_f odzxy = msubr231(org * swizzle(dir_xyz,_MM_SWIZ_REG_DACB), dir_xyz, swizzle(org,_MM_SWIZ_REG_DACB)); const mic_f den = ldot3_zxy(dir_xyz,normal); const mic_f rcp_den = rcp(den); const mic_f uu = ldot3_zxy(e2,odzxy); const mic_f vv = ldot3_zxy(e1,odzxy); const mic_f u = uu * rcp_den; const mic_f v = vv * rcp_den; #if defined(__BACKFACE_CULLING__) const mic_m m_init = (mic_m)0x1111 & (den > zero); #else const mic_m m_init = 0x1111; #endif const mic_m valid_u = ge(m_init,u,zero); const mic_m valid_v = ge(valid_u,v,zero); const mic_m m_aperture = le(valid_v,u+v,mic_f::one()); const mic_f nom = ldot3_zxy(org,normal); const mic_f t = rcp_den*nom; if (unlikely(none(m_aperture))) continue; mic_m m_final = lt(lt(m_aperture,min_dist_xyz,t),t,max_dist_xyz); #if defined(__USE_RAY_MASK__) const mic_i rayMask(ray.mask); const mic_i triMask = swDDDD(gather16i_4i_align(&tptr[0].v2,&tptr[1].v2,&tptr[2].v2,&tptr[3].v2)); const mic_m m_ray_mask = (rayMask & triMask) != mic_i::zero(); m_final &= m_ray_mask; #endif #if defined(__INTERSECTION_FILTER__) /* did the ray hit one of the four triangles? 
*/ while (any(m_final)) { const mic_f temp_t = select(m_final,t,max_dist_xyz); const mic_f min_dist = vreduce_min(temp_t); const mic_m m_dist = eq(min_dist,temp_t); const size_t vecIndex = bitscan(toInt(m_dist)); const size_t triIndex = vecIndex >> 2; const Triangle1 *__restrict__ tri_ptr = tptr + triIndex; const mic_m m_tri = m_dist^(m_dist & (mic_m)((unsigned int)m_dist - 1)); const mic_f gnormalx = mic_f(tri_ptr->Ng.x); const mic_f gnormaly = mic_f(tri_ptr->Ng.y); const mic_f gnormalz = mic_f(tri_ptr->Ng.z); const int geomID = tri_ptr->geomID(); const int primID = tri_ptr->primID(); Geometry* geom = ((Scene*)bvh->geometry)->get(geomID); if (likely(!geom->hasOcclusionFilter1())) break; if (runOcclusionFilter1(geom,ray,u,v,min_dist,gnormalx,gnormaly,gnormalz,m_tri,geomID,primID)) break; m_final ^= m_tri; /* clear bit */ } #endif if (unlikely(any(m_final))) { ray.geomID = 0; return; } ////////////////////////////////////////////////////////////////////////////////////////////////// } }
/*! Recursively applies tree rotations below the given node.
 *
 *  After the four children have been processed, every non-leaf child is
 *  considered as a rotation target: each of the parent's children is
 *  tentatively placed at each of the target's four slots and the resulting
 *  half area of the target is compared with its current one. The single
 *  best improving swap (if any) is performed.
 *
 *  \param nodeID  node to process
 *  \param depth   depth of that node, used for the maxDepth constraint
 *  \return 0 for a leaf, otherwise 1 + the (conservative) maximum child depth
 */
size_t BVH4MB::rotate(Base* nodeID, size_t depth)
{
  /*! a leaf has nothing below it that could be rotated */
  if (nodeID->isLeaf())
    return 0;

  Node* parent = nodeID->node();

  /*! process the four children first and keep the depth each one reports */
  ssei cdepth;
  for (size_t i=0; i<4; i++)
    cdepth[i] = (int)rotate(parent->child[i],depth+1);

  /* current area term of every child box */
  ssef extentX = parent->upper_x-parent->lower_x;
  ssef extentY = parent->upper_y-parent->lower_y;
  ssef extentZ = parent->upper_z-parent->lower_z;
  ssef childArea = extentX*(extentY + extentZ) + extentY*extentZ;

  /*! transpose the parent's bounds into one box per child */
  ssef parentLo[4], parentUp[4];
  transpose(parent->lower_x,parent->lower_y,parent->lower_z,ssef(zero),parentLo[0],parentLo[1],parentLo[2],parentLo[3]);
  transpose(parent->upper_x,parent->upper_y,parent->upper_z,ssef(zero),parentUp[0],parentUp[1],parentUp[2],parentUp[3]);
  BBox<ssef> other[4] = {
    BBox<ssef>(parentLo[0],parentUp[0]),
    BBox<ssef>(parentLo[1],parentUp[1]),
    BBox<ssef>(parentLo[2],parentUp[2]),
    BBox<ssef>(parentLo[3],parentUp[3])
  };

  /*! Search for the best rotation: a slot (target) inside one child is
      exchanged with another child of the parent. Only the best such
      exchange is remembered. */
  float bestCost = pos_inf;
  int bestChild = -1, bestTarget = -1, bestOther = -1;

  for (size_t c=0; c<4; c++)
  {
    /*! leaves cannot be descended into, so they offer no target slots */
    if (parent->child[c]->isLeaf())
      continue;

    Node* child = parent->child[c]->node();

    /*! transpose the child's bounds into one box per slot */
    ssef childLo[4], childUp[4];
    transpose(child->lower_x,child->lower_y,child->lower_z,ssef(zero),childLo[0],childLo[1],childLo[2],childLo[3]);
    transpose(child->upper_x,child->upper_y,child->upper_z,ssef(zero),childUp[0],childUp[1],childUp[2],childUp[3]);
    BBox<ssef> target[4] = {
      BBox<ssef>(childLo[0],childUp[0]),
      BBox<ssef>(childLo[1],childUp[1]),
      BBox<ssef>(childLo[2],childUp[2]),
      BBox<ssef>(childLo[3],childUp[3])
    };

    /*! for every parent child: cheapest slot of this target and its cost */
    float slotMinCost[4];
    int pos[4];
    for (size_t o=0; o<4; o++)
    {
      const float atSlot0 = halfArea3f(merge(other[o] ,target[1],target[2],target[3]));
      const float atSlot1 = halfArea3f(merge(target[0],other[o] ,target[2],target[3]));
      const float atSlot2 = halfArea3f(merge(target[0],target[1],other[o] ,target[3]));
      const float atSlot3 = halfArea3f(merge(target[0],target[1],target[2],other[o] ));
      ssef slotCost = ssef(atSlot0,atSlot1,atSlot2,atSlot3);
      ssef slotMin  = vreduce_min(slotCost);
      pos[o]         = (int)__bsf(movemask(slotMin == slotCost));
      slotMinCost[o] = extract<0>(slotMin);
    }

    /*! pick the best other child among those allowed by the depth limit */
    ssef otherCost = ssef(slotMinCost[0],slotMinCost[1],slotMinCost[2],slotMinCost[3]);
    sseb valid = ssei(int(depth+1))+cdepth <= ssei(maxDepth);
    if (none(valid))
      continue;

    size_t n = select_min(valid,otherCost);

    /* growing the original child bound is bad, shrinking it is good */
    float cost = otherCost[n]-childArea[c];

    /*! a node is never swapped with itself */
    if (n == c)
      continue;

    /*! only a strictly cheaper candidate replaces the current best */
    if (!(cost < bestCost))
      continue;

    bestCost   = cost;
    bestChild  = (int)c;
    bestOther  = (int)n;
    bestTarget = pos[n];
  }

  /*! no swap lowers the cost: leave the tree untouched */
  if (bestCost >= 0)
    return 1+reduce_max(cdepth);

  /*! carry out the best rotation found */
  Node* child = parent->child[bestChild]->node();
  swap(parent,bestOther,child,bestTarget);

  /*! refit the parent's entry for the modified child */
  parent->lower_x[bestChild] = reduce_min(child->lower_x);
  parent->lower_y[bestChild] = reduce_min(child->lower_y);
  parent->lower_z[bestChild] = reduce_min(child->lower_z);
  parent->upper_x[bestChild] = reduce_max(child->upper_x);
  parent->upper_y[bestChild] = reduce_max(child->upper_y);
  parent->upper_z[bestChild] = reduce_max(child->upper_z);

  parent->lower_dx[bestChild] = reduce_min(child->lower_dx);
  parent->lower_dy[bestChild] = reduce_min(child->lower_dy);
  parent->lower_dz[bestChild] = reduce_min(child->lower_dz);
  parent->upper_dx[bestChild] = reduce_max(child->upper_dx);
  parent->upper_dy[bestChild] = reduce_max(child->upper_dy);
  parent->upper_dz[bestChild] = reduce_max(child->upper_dz);

  /*! The depth reported here is conservative: the subtree that moved up
   *  may have been the one on the critical path. */
  cdepth[bestOther]++; // bestOther now sits one level deeper
  return 1+reduce_max(cdepth);
}
void BVH4HairBuilder::parallelBinningGlobal(const size_t threadID, const size_t numThreads) { BuildRecord ¤t = global_sharedData.rec; const unsigned int items = current.items(); const unsigned int startID = current.begin + ((threadID+0)*items/numThreads); const unsigned int endID = current.begin + ((threadID+1)*items/numThreads); const mic_f centroidMin = broadcast4to16f(¤t.bounds.centroid2.lower); const mic_f centroidMax = broadcast4to16f(¤t.bounds.centroid2.upper); const mic_f centroidBoundsMin_2 = centroidMin; const mic_f centroidDiagonal_2 = centroidMax-centroidMin; const mic_f scale = select(centroidDiagonal_2 != 0.0f,rcp(centroidDiagonal_2) * mic_f(16.0f * 0.99f),mic_f::zero()); Bezier1i *__restrict__ const tmp_prims = (Bezier1i*)accel; fastbin_copy<Bezier1i,false>(prims,tmp_prims,startID,endID,centroidBoundsMin_2,scale,global_bin16[threadID]); LockStepTaskScheduler::syncThreadsWithReduction( threadID, numThreads, reduceBinsParallel, global_bin16 ); if (threadID == 0) { const float voxelArea = area(current.bounds.geometry); global_sharedData.split.cost = items * voxelArea * INTERSECTION_COST;; const Bin16 &bin16 = global_bin16[0]; for (size_t dim=0;dim<3;dim++) { if (unlikely(centroidDiagonal_2[dim] == 0.0f)) continue; const mic_f rArea = prefix_area_rl(bin16.min_x[dim],bin16.min_y[dim],bin16.min_z[dim], bin16.max_x[dim],bin16.max_y[dim],bin16.max_z[dim]); const mic_f lArea = prefix_area_lr(bin16.min_x[dim],bin16.min_y[dim],bin16.min_z[dim], bin16.max_x[dim],bin16.max_y[dim],bin16.max_z[dim]); const mic_i lnum = prefix_count(bin16.count[dim]); const mic_i rnum = mic_i(items) - lnum; const mic_i lblocks = (lnum + mic_i(3)) >> 2; const mic_i rblocks = (rnum + mic_i(3)) >> 2; const mic_m m_lnum = lnum == 0; const mic_m m_rnum = rnum == 0; const mic_f cost = select(m_lnum|m_rnum,mic_f::inf(),lArea * mic_f(lblocks) + rArea * mic_f(rblocks) + voxelArea ); if (lt(cost,mic_f(global_sharedData.split.cost))) { const mic_f min_cost = vreduce_min(cost); const mic_m m_pos 
= min_cost == cost; const unsigned long pos = bitscan64(m_pos); assert(pos < 15); if (pos < 15) { global_sharedData.split.cost = cost[pos]; global_sharedData.split.pos = pos+1; global_sharedData.split.dim = dim; global_sharedData.split.numLeft = lnum[pos]; } } } } }
/*! Recursively applies tree rotations below the given node reference.
 *
 *  Once the four children have been processed, every inner child (child2)
 *  is examined: each other child of the parent (child1) is tentatively
 *  placed at each of child2's four slots, and the change of child2's half
 *  area is measured. The single swap with the largest reduction, if any,
 *  is carried out.
 *
 *  \param parentRef  node reference to process
 *  \param depth      depth of that node, used for the build-depth constraint
 *  \return 0 for barriers and leaves, otherwise 1 + the (conservative)
 *          maximum child depth
 */
size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth)
{
  /*! barriers and leaves have nothing to rotate */
  if (parentRef.isBarrier() || parentRef.isLeaf())
    return 0;

  Node* parent = parentRef.node();

  /*! process the four children first and keep the depth each one reports */
  vint4 cdepth;
  for (size_t i=0; i<4; i++)
    cdepth[i] = (int)rotate(parent->child(i),depth+1);

  /* current area term of every child box */
  vfloat4 extentX = parent->upper_x-parent->lower_x;
  vfloat4 extentY = parent->upper_y-parent->lower_y;
  vfloat4 extentZ = parent->upper_z-parent->lower_z;
  vfloat4 childArea = extentX*(extentY + extentZ) + extentY*extentZ;

  /*! bounds of the parent's four children */
  BBox<vfloat4> child1[4];
  parent->bounds(child1[0],child1[1],child1[2],child1[3]);

  /*! Search for the best rotation: a first child (child1) is exchanged with
      a sub-child (child2child) of a different second child (child2). Only
      the best such exchange is remembered. */
  float bestArea = 0;
  size_t bestChild1 = -1, bestChild2 = -1, bestChild2Child = -1;

  for (size_t c2=0; c2<4; c2++)
  {
    /*! barriers and leaves cannot be descended into */
    if (parent->child(c2).isBarrier() || parent->child(c2).isLeaf())
      continue;

    Node* child2 = parent->child(c2).node();

    /*! bounds of child2's four sub-children */
    BBox<vfloat4> child2c[4];
    child2->bounds(child2c[0],child2c[1],child2c[2],child2c[3]);

    /*! for every child1 candidate: cheapest slot inside child2 and its cost */
    float slotMinCost[4];
    int pos[4];
    for (size_t k=0; k<4; k++)
    {
      const float atSlot0 = halfArea3f(merge(child1[k] ,child2c[1],child2c[2],child2c[3]));
      const float atSlot1 = halfArea3f(merge(child2c[0],child1[k] ,child2c[2],child2c[3]));
      const float atSlot2 = halfArea3f(merge(child2c[0],child2c[1],child1[k] ,child2c[3]));
      const float atSlot3 = halfArea3f(merge(child2c[0],child2c[1],child2c[2],child1[k] ));
      vfloat4 slotCost = vfloat4(atSlot0,atSlot1,atSlot2,atSlot3);
      vfloat4 slotMin  = vreduce_min(slotCost);
      pos[k]         = (int)__bsf(movemask(slotMin == slotCost));
      slotMinCost[k] = extract<0>(slotMin);
    }

    /*! area change of child2 for the best slot of every candidate */
    vfloat4 area0123 = vfloat4(slotMinCost[0],slotMinCost[1],slotMinCost[2],slotMinCost[3]) - vfloat4(childArea[c2]);

    /*! only candidates within the depth limit, and never child2 itself */
    const size_t mbd = BVH4::maxBuildDepth;
    vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd);
    valid &= vint4(c2) != vint4(step);
    if (none(valid))
      continue;

    size_t c1 = select_min(valid,area0123);
    if (c1 == c2)
      continue; // can happen if bounds are NANs

    /*! only a strictly better candidate replaces the current best */
    float area = area0123[c1];
    if (!(area < bestArea))
      continue;

    bestArea        = area;
    bestChild1      = c1;
    bestChild2      = c2;
    bestChild2Child = pos[c1];
  }

  /*! no swap improves the SAH: leave the tree untouched */
  if (bestChild1 == size_t(-1))
    return 1+reduce_max(cdepth);

  /*! carry out the best rotation found */
  Node* child2 = parent->child(bestChild2).node();
  BVH4::swap(parent,bestChild1,child2,bestChild2Child);
  parent->set(bestChild2,child2->bounds());
  BVH4::compact(parent);
  BVH4::compact(child2);

  /*! The depth reported here is conservative: the subtree that moved up
   *  may have been the one on the critical path. */
  cdepth[bestChild1]++; // bestChild1 now sits one level deeper
  return 1+reduce_max(cdepth);
}