namespace isa { static unsigned int BVH4I_LEAF_MASK = BVH4i::leaf_mask; // needed due to compiler efficiency bug template<typename LeafIntersector> void BVH4mbIntersector16Single<LeafIntersector>::intersect(mic_i* valid_i, BVH4mb* bvh, Ray16& ray16) { /* near and node stack */ __aligned(64) float stack_dist[3*BVH4i::maxDepth+1]; __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1]; /* setup */ const mic_m m_valid = *(mic_i*)valid_i != mic_i(0); const mic3f rdir16 = rcp_safe(ray16.dir); const mic_f inf = mic_f(pos_inf); const mic_f zero = mic_f::zero(); store16f(stack_dist,inf); const Node * __restrict__ nodes = (Node *)bvh->nodePtr(); const BVH4mb::Triangle01 * __restrict__ accel = (BVH4mb::Triangle01 *)bvh->triPtr(); stack_node[0] = BVH4i::invalidNode; long rayIndex = -1; while((rayIndex = bitscan64(rayIndex,toInt(m_valid))) != BITSCAN_NO_BIT_SET_64) { stack_node[1] = bvh->root; size_t sindex = 2; const mic_f org_xyz = loadAOS4to16f(rayIndex,ray16.org.x,ray16.org.y,ray16.org.z); const mic_f dir_xyz = loadAOS4to16f(rayIndex,ray16.dir.x,ray16.dir.y,ray16.dir.z);
void BVH4iIntersector1::occluded(BVH4i* bvh, Ray& ray) { /* near and node stack */ __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1]; /* setup */ const mic3f rdir16 = rcp_safe(mic3f(ray.dir.x,ray.dir.y,ray.dir.z)); const mic_f inf = mic_f(pos_inf); const mic_f zero = mic_f::zero(); const Node * __restrict__ nodes = (Node *)bvh->nodePtr(); const Triangle1 * __restrict__ accel = (Triangle1*)bvh->triPtr(); stack_node[0] = BVH4i::invalidNode; stack_node[1] = bvh->root; size_t sindex = 2; const mic_f org_xyz = loadAOS4to16f(ray.org.x,ray.org.y,ray.org.z); const mic_f dir_xyz = loadAOS4to16f(ray.dir.x,ray.dir.y,ray.dir.z); const mic_f rdir_xyz = loadAOS4to16f(rdir16.x[0],rdir16.y[0],rdir16.z[0]); const mic_f org_rdir_xyz = org_xyz * rdir_xyz; const mic_f min_dist_xyz = broadcast1to16f(&ray.tnear); const mic_f max_dist_xyz = broadcast1to16f(&ray.tfar); const unsigned int leaf_mask = BVH4I_LEAF_MASK; while (1) { NodeRef curNode = stack_node[sindex-1]; sindex--; while (1) { /* test if this is a leaf node */ if (unlikely(curNode.isLeaf(leaf_mask))) break; const Node* __restrict__ const node = curNode.node(nodes); const float* __restrict const plower = (float*)node->lower; const float* __restrict const pupper = (float*)node->upper; prefetch<PFHINT_L1>((char*)node + 0); prefetch<PFHINT_L1>((char*)node + 64); /* intersect single ray with 4 bounding boxes */ const mic_f tLowerXYZ = load16f(plower) * rdir_xyz - org_rdir_xyz; const mic_f tUpperXYZ = load16f(pupper) * rdir_xyz - org_rdir_xyz; const mic_f tLower = mask_min(0x7777,min_dist_xyz,tLowerXYZ,tUpperXYZ); const mic_f tUpper = mask_max(0x7777,max_dist_xyz,tLowerXYZ,tUpperXYZ); sindex--; curNode = stack_node[sindex]; const Node* __restrict__ const next = curNode.node(nodes); prefetch<PFHINT_L2>((char*)next + 0); prefetch<PFHINT_L2>((char*)next + 64); const mic_f tNear = vreduce_max4(tLower); const mic_f tFar = vreduce_min4(tUpper); const mic_m hitm = le(0x8888,tNear,tFar); const mic_f tNear_pos = select(hitm,tNear,inf); /* if no child is hit, continue with early popped child */ if (unlikely(none(hitm))) continue; sindex++; const unsigned long hiti = toInt(hitm); const unsigned long pos_first = bitscan64(hiti); const unsigned long num_hitm = countbits(hiti); /* if a single child is hit, continue with that child */ curNode = ((unsigned int *)plower)[pos_first]; if (likely(num_hitm == 1)) continue; /* if two children are hit, push in correct order */ const unsigned long pos_second = bitscan64(pos_first,hiti); if (likely(num_hitm == 2)) { const unsigned int dist_first = ((unsigned int*)&tNear)[pos_first]; const unsigned int dist_second = ((unsigned int*)&tNear)[pos_second]; const unsigned int node_first = curNode; const unsigned int node_second = ((unsigned int*)plower)[pos_second]; if (dist_first <= dist_second) { stack_node[sindex] = node_second; sindex++; assert(sindex < 3*BVH4i::maxDepth+1); continue; } else { stack_node[sindex] = curNode; curNode = node_second; sindex++; assert(sindex < 3*BVH4i::maxDepth+1); continue; } } /* continue with closest child and push all others */ const mic_f min_dist = set_min_lanes(tNear_pos); const unsigned old_sindex = sindex; sindex += countbits(hiti) - 1; assert(sindex < 3*BVH4i::maxDepth+1); const mic_m closest_child = eq(hitm,min_dist,tNear); const unsigned long closest_child_pos = bitscan64(closest_child); const mic_m m_pos = andn(hitm,andn(closest_child,(mic_m)((unsigned int)closest_child - 1))); const mic_i plower_node = load16i((int*)plower); curNode = ((unsigned int*)plower)[closest_child_pos]; compactustore16i(m_pos,&stack_node[old_sindex],plower_node); } /* return if stack is empty */ if (unlikely(curNode == BVH4i::invalidNode)) break; /* intersect one ray against four triangles */ ////////////////////////////////////////////////////////////////////////////////////////////////// const Triangle1* tptr = (Triangle1*) curNode.leaf(accel); prefetch<PFHINT_L1>(tptr + 3); prefetch<PFHINT_L1>(tptr + 2); prefetch<PFHINT_L1>(tptr + 1); prefetch<PFHINT_L1>(tptr + 0); const mic_i and_mask = broadcast4to16i(zlc4); const mic_f v0 = gather_4f_zlc(and_mask, (float*)&tptr[0].v0, (float*)&tptr[1].v0, (float*)&tptr[2].v0, (float*)&tptr[3].v0); const mic_f v1 = gather_4f_zlc(and_mask, (float*)&tptr[0].v1, (float*)&tptr[1].v1, (float*)&tptr[2].v1, (float*)&tptr[3].v1); const mic_f v2 = gather_4f_zlc(and_mask, (float*)&tptr[0].v2, (float*)&tptr[1].v2, (float*)&tptr[2].v2, (float*)&tptr[3].v2); const mic_f e1 = v1 - v0; const mic_f e2 = v0 - v2; const mic_f normal = lcross_zxy(e1,e2); const mic_f org = v0 - org_xyz; const mic_f odzxy = msubr231(org * swizzle(dir_xyz,_MM_SWIZ_REG_DACB), dir_xyz, swizzle(org,_MM_SWIZ_REG_DACB)); const mic_f den = ldot3_zxy(dir_xyz,normal); const mic_f rcp_den = rcp(den); const mic_f uu = ldot3_zxy(e2,odzxy); const mic_f vv = ldot3_zxy(e1,odzxy); const mic_f u = uu * rcp_den; const mic_f v = vv * rcp_den; #if defined(__BACKFACE_CULLING__) const mic_m m_init = (mic_m)0x1111 & (den > zero); #else const mic_m m_init = 0x1111; #endif const mic_m valid_u = ge(m_init,u,zero); const mic_m valid_v = ge(valid_u,v,zero); const mic_m m_aperture = le(valid_v,u+v,mic_f::one()); const mic_f nom = ldot3_zxy(org,normal); const mic_f t = rcp_den*nom; if (unlikely(none(m_aperture))) continue; mic_m m_final = lt(lt(m_aperture,min_dist_xyz,t),t,max_dist_xyz); #if defined(__USE_RAY_MASK__) const mic_i rayMask(ray.mask); const mic_i triMask = swDDDD(gather16i_4i_align(&tptr[0].v2,&tptr[1].v2,&tptr[2].v2,&tptr[3].v2)); const mic_m m_ray_mask = (rayMask & triMask) != mic_i::zero(); m_final &= m_ray_mask; #endif #if defined(__INTERSECTION_FILTER__) /* did the ray hit one of the four triangles? */ while (any(m_final)) { const mic_f temp_t = select(m_final,t,max_dist_xyz); const mic_f min_dist = vreduce_min(temp_t); const mic_m m_dist = eq(min_dist,temp_t); const size_t vecIndex = bitscan(toInt(m_dist)); const size_t triIndex = vecIndex >> 2; const Triangle1 *__restrict__ tri_ptr = tptr + triIndex; const mic_m m_tri = m_dist^(m_dist & (mic_m)((unsigned int)m_dist - 1)); const mic_f gnormalx = mic_f(tri_ptr->Ng.x); const mic_f gnormaly = mic_f(tri_ptr->Ng.y); const mic_f gnormalz = mic_f(tri_ptr->Ng.z); const int geomID = tri_ptr->geomID(); const int primID = tri_ptr->primID(); Geometry* geom = ((Scene*)bvh->geometry)->get(geomID); if (likely(!geom->hasOcclusionFilter1())) break; if (runOcclusionFilter1(geom,ray,u,v,min_dist,gnormalx,gnormaly,gnormalz,m_tri,geomID,primID)) break; m_final ^= m_tri; /* clear bit */ } #endif if (unlikely(any(m_final))) { ray.geomID = 0; return; } ////////////////////////////////////////////////////////////////////////////////////////////////// } }
void BVH4HairBuilder::parallelPartitioning(BuildRecord& current, Bezier1i * __restrict__ l_source, Bezier1i * __restrict__ r_source, Bezier1i * __restrict__ l_dest, Bezier1i * __restrict__ r_dest, const Split &split, Centroid_Scene_AABB &local_left, Centroid_Scene_AABB &local_right) { const mic_f centroidMin = broadcast4to16f(¤t.bounds.centroid2.lower); const mic_f centroidMax = broadcast4to16f(¤t.bounds.centroid2.upper); const mic_f centroidBoundsMin_2 = centroidMin; const mic_f centroidDiagonal_2 = centroidMax-centroidMin; const mic_f scale = select(centroidDiagonal_2 != 0.0f,rcp(centroidDiagonal_2) * mic_f(16.0f * 0.99f),mic_f::zero()); const unsigned int bestSplitDim = split.dim; const unsigned int bestSplit = split.pos; const mic_f c = mic_f(centroidBoundsMin_2[bestSplitDim]); const mic_f s = mic_f(scale[bestSplitDim]); mic_f leftSceneBoundsMin((float)pos_inf); mic_f leftSceneBoundsMax((float)neg_inf); mic_f leftCentroidBoundsMin((float)pos_inf); mic_f leftCentroidBoundsMax((float)neg_inf); mic_f rightSceneBoundsMin((float)pos_inf); mic_f rightSceneBoundsMax((float)neg_inf); mic_f rightCentroidBoundsMin((float)pos_inf);
void BVH4HairBuilder::parallelBinningGlobal(const size_t threadID, const size_t numThreads) { BuildRecord ¤t = global_sharedData.rec; const unsigned int items = current.items(); const unsigned int startID = current.begin + ((threadID+0)*items/numThreads); const unsigned int endID = current.begin + ((threadID+1)*items/numThreads); const mic_f centroidMin = broadcast4to16f(¤t.bounds.centroid2.lower); const mic_f centroidMax = broadcast4to16f(¤t.bounds.centroid2.upper); const mic_f centroidBoundsMin_2 = centroidMin; const mic_f centroidDiagonal_2 = centroidMax-centroidMin; const mic_f scale = select(centroidDiagonal_2 != 0.0f,rcp(centroidDiagonal_2) * mic_f(16.0f * 0.99f),mic_f::zero()); Bezier1i *__restrict__ const tmp_prims = (Bezier1i*)accel; fastbin_copy<Bezier1i,false>(prims,tmp_prims,startID,endID,centroidBoundsMin_2,scale,global_bin16[threadID]); LockStepTaskScheduler::syncThreadsWithReduction( threadID, numThreads, reduceBinsParallel, global_bin16 ); if (threadID == 0) { const float voxelArea = area(current.bounds.geometry); global_sharedData.split.cost = items * voxelArea * INTERSECTION_COST;; const Bin16 &bin16 = global_bin16[0]; for (size_t dim=0;dim<3;dim++) { if (unlikely(centroidDiagonal_2[dim] == 0.0f)) continue; const mic_f rArea = prefix_area_rl(bin16.min_x[dim],bin16.min_y[dim],bin16.min_z[dim], bin16.max_x[dim],bin16.max_y[dim],bin16.max_z[dim]); const mic_f lArea = prefix_area_lr(bin16.min_x[dim],bin16.min_y[dim],bin16.min_z[dim], bin16.max_x[dim],bin16.max_y[dim],bin16.max_z[dim]); const mic_i lnum = prefix_count(bin16.count[dim]); const mic_i rnum = mic_i(items) - lnum; const mic_i lblocks = (lnum + mic_i(3)) >> 2; const mic_i rblocks = (rnum + mic_i(3)) >> 2; const mic_m m_lnum = lnum == 0; const mic_m m_rnum = rnum == 0; const mic_f cost = select(m_lnum|m_rnum,mic_f::inf(),lArea * mic_f(lblocks) + rArea * mic_f(rblocks) + voxelArea ); if (lt(cost,mic_f(global_sharedData.split.cost))) { const mic_f min_cost = vreduce_min(cost); const mic_m m_pos = min_cost == cost; const unsigned long pos = bitscan64(m_pos); assert(pos < 15); if (pos < 15) { global_sharedData.split.cost = cost[pos]; global_sharedData.split.pos = pos+1; global_sharedData.split.dim = dim; global_sharedData.split.numLeft = lnum[pos]; } } } } }
void BVH4iIntersector16Chunk<LeafIntersector,ENABLE_COMPRESSED_BVH4I_NODES>::occluded(mic_i* valid_i, BVH4i* bvh, Ray16& ray) { /* allocate stack */ __aligned(64) mic_f stack_dist[3*BVH4i::maxDepth+1]; __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1]; /* load ray */ const mic_m valid = *(mic_i*)valid_i != mic_i(0); mic_m m_terminated = !valid; const mic3f rdir = rcp_safe(ray.dir); const mic3f org_rdir = ray.org * rdir; mic_f ray_tnear = select(valid,ray.tnear,pos_inf); mic_f ray_tfar = select(valid,ray.tfar ,neg_inf); const mic_f inf = mic_f(pos_inf); /* push root node */ stack_node[0] = BVH4i::invalidNode; stack_dist[0] = inf; stack_node[1] = bvh->root; stack_dist[1] = ray_tnear; NodeRef* __restrict__ sptr_node = stack_node + 2; mic_f* __restrict__ sptr_dist = stack_dist + 2; const Node * __restrict__ nodes = (Node *)bvh->nodePtr(); const Triangle1 * __restrict__ accel = (Triangle1*)bvh->triPtr(); const mic3f org = ray.org; const mic3f dir = ray.dir; while (1) { const mic_m m_active = !m_terminated; /* pop next node from stack */ NodeRef curNode = *(sptr_node-1); mic_f curDist = *(sptr_dist-1); sptr_node--; sptr_dist--; const mic_m m_stackDist = gt(m_active,ray_tfar,curDist); /* stack emppty ? */ if (unlikely(curNode == BVH4i::invalidNode)) break; /* cull node if behind closest hit point */ if (unlikely(none(m_stackDist))) { continue; } const unsigned int leaf_mask = BVH4I_LEAF_MASK; traverse_chunk_occluded<ENABLE_COMPRESSED_BVH4I_NODES>(curNode, curDist, rdir, org_rdir, ray_tnear, ray_tfar, m_active, sptr_node, sptr_dist, nodes, leaf_mask); /* return if stack is empty */ if (unlikely(curNode == BVH4i::invalidNode)) break; /* intersect leaf */ mic_m m_valid_leaf = gt(m_active,ray_tfar,curDist); STAT3(shadow.trav_leaves,1,popcnt(m_valid_leaf),16); LeafIntersector::occluded16(curNode,m_valid_leaf,dir,org,ray,m_terminated,accel,(Scene*)bvh->geometry); if (unlikely(all(m_terminated))) break; ray_tfar = select(m_terminated,neg_inf,ray_tfar); } store16i(valid & m_terminated,&ray.geomID,0); }
namespace embree { const mic_f BVH4AOS::initial_node = select(0x8888,cast_to_mic_f(mic_i(leaf_mask)),mic_f(1E10)); /*! intersector registration functions */ void BVH4AOSIntersector1Register(); void BVH4AOSIntersector16ChunkRegister(); void BVH4AOSTriangle1Intersector1Register(); void BVH4AOSTriangle1Intersector16SingleRegister(); void BVH4AOSTriangle1Intersector16ChunkRegister(); void BVH4AOSTriangle1Intersector16HybridRegister(); void BVH4AOSTriangle1Intersector1RefRegister(); void BVH4AOSTriangle1Intersector16ChunkRefRegister(); void BVH4AOSTriangle1Intersector16HybridRefRegister(); void BVH4AOSRegister () { /* register acceleration structure */ TriangleMesh::accels.add("bvh4aos",BVH4AOS::create); VirtualScene ::accels.add("bvh4aos",BVH4AOS::create); TriangleMesh::accels.setDefaultAccel("bvh4aos"); VirtualScene ::accels.setDefaultAccel("bvh4aos"); TriangleMesh::accels.setDefaultTriangle("bvh4aos","triangle1"); VirtualScene ::accels.setDefaultTriangle("bvh4aos","virtual"); //TriangleMesh::builders.setDefaultBuilder ("bvh4aos","objectsplit"); TriangleMesh::builders.setDefaultBuilder ("bvh4aos","bowpass"); VirtualScene ::builders.setDefaultBuilder ("bvh4aos","objectsplit"); /* register triangle mesh builders */ //TriangleMesh::builders.add("bvh4aos","triangle1i","objectsplit" ,BVHBuilderT<BVH4AOS, HeuristicBinning<0> >::create,1,inf); //TriangleMesh::builders.add("bvh4aos","triangle1v","objectsplit" ,BVHBuilderT<BVH4AOS, HeuristicBinning<0> >::create,4,4); //TriangleMesh::builders.add("bvh4aos","triangle4i","objectsplit" ,BVHBuilderT<BVH4AOS, HeuristicBinning<2> >::create,1,inf); //TriangleMesh::builders.add("bvh4aos","triangle4v","objectsplit" ,BVHBuilderT<BVH4AOS, HeuristicBinning<2> >::create,1,inf); TriangleMesh::builders.add("bvh4aos","triangle1","objectsplit" ,BVHBuilderT<BVH4AOS, HeuristicBinning<0> >::create,4,4); //TriangleMesh::builders.add("bvh4aos","triangle4" ,"objectsplit", BVHBuilderT<BVH4AOS, HeuristicBinning<2> >::create,1,inf); //TriangleMesh::builders.add("bvh4aos","triangle8" ,"objectsplit", BVHBuilderT<BVH4AOS, HeuristicBinning<3> >::create,1,inf); //TriangleMesh::builders.add("bvh4aos","triangle1i","spatialsplit" ,BVHBuilderT<BVH4AOS, HeuristicSpatial<0> >::create,1,inf); //TriangleMesh::builders.add("bvh4aos","triangle1v","spatialsplit" ,BVHBuilderT<BVH4AOS, HeuristicSpatial<0> >::create,4,4); //TriangleMesh::builders.add("bvh4aos","triangle4i","spatialsplit" ,BVHBuilderT<BVH4AOS, HeuristicSpatial<2> >::create,1,inf); //TriangleMesh::builders.add("bvh4aos","triangle4v","spatialsplit" ,BVHBuilderT<BVH4AOS, HeuristicSpatial<2> >::create,1,inf); TriangleMesh::builders.add("bvh4aos","triangle1","spatialsplit" ,BVHBuilderT<BVH4AOS, HeuristicSpatial<0> >::create,4,4); //TriangleMesh::builders.add("bvh4aos","triangle4" ,"spatialsplit", BVHBuilderT<BVH4AOS, HeuristicSpatial<2> >::create,1,inf); //TriangleMesh::builders.add("bvh4aos","triangle8" ,"spatialsplit", BVHBuilderT<BVH4AOS, HeuristicSpatial<3> >::create,1,inf); TriangleMesh::builders.add("bvh4aos","triangle1","bowpass" ,BVH4AOSBuilder::create,4,4); TriangleMesh::builders.add("bvh4aos","triangle1","bowpass_fast" ,BVH4AOSBuilder::create_fast,4,4); /* register virtual object builders */ VirtualScene::builders.add("bvh4aos","virtual","objectsplit", BVHBuilderT<BVH4AOS, HeuristicBinning<0> >::create,1,inf); /* register intersectors */ BVH4AOSIntersector1Register(); BVH4AOSIntersector16ChunkRegister(); BVH4AOSTriangle1Intersector1Register(); BVH4AOSTriangle1Intersector16SingleRegister(); BVH4AOSTriangle1Intersector16ChunkRegister(); BVH4AOSTriangle1Intersector16HybridRegister(); BVH4AOSTriangle1Intersector1RefRegister(); BVH4AOSTriangle1Intersector16ChunkRefRegister(); BVH4AOSTriangle1Intersector16HybridRefRegister(); } void BVH4AOS::init(size_t numNodes, size_t numTriangles) { root = 0; size_t bytesNode = max(size_t(1 << index_shift),sizeof(Node)); size_t bytesTri = max(size_t(1 << index_shift),trity.bytes); alloc_nodes->init(numNodes*bytesNode); alloc_tris ->init(numTriangles*bytesTri); } void BVH4AOS::resize (size_t bytesNodeArray, size_t bytesTriangleArray) { root = NULL; alloc_nodes->init(bytesNodeArray); alloc_tris ->init(bytesTriangleArray); } void BVH4AOS::print() { BVHStatisticsT<BVH4AOS>(this).print(); } }
void BVH4iIntersector16Hybrid<LeafIntersector,ENABLE_COMPRESSED_BVH4I_NODES>::intersect(mic_i* valid_i, BVH4i* bvh, Ray16& ray16) { /* near and node stack */ __aligned(64) mic_f stack_dist[3*BVH4i::maxDepth+1]; __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1]; __aligned(64) NodeRef stack_node_single[3*BVH4i::maxDepth+1]; /* load ray */ const mic_m valid0 = *(mic_i*)valid_i != mic_i(0); const mic3f rdir16 = rcp_safe(ray16.dir); const mic3f org_rdir16 = ray16.org * rdir16; mic_f ray_tnear = select(valid0,ray16.tnear,pos_inf); mic_f ray_tfar = select(valid0,ray16.tfar ,neg_inf); const mic_f inf = mic_f(pos_inf); /* allocate stack and push root node */ stack_node[0] = BVH4i::invalidNode; stack_dist[0] = inf; stack_node[1] = bvh->root; stack_dist[1] = ray_tnear; NodeRef* __restrict__ sptr_node = stack_node + 2; mic_f* __restrict__ sptr_dist = stack_dist + 2; const Node * __restrict__ nodes = (Node *)bvh->nodePtr(); const Triangle1 * __restrict__ accel = (Triangle1*)bvh->triPtr(); while (1) pop: { /* pop next node from stack */ NodeRef curNode = *(sptr_node-1); mic_f curDist = *(sptr_dist-1); sptr_node--; sptr_dist--; const mic_m m_stackDist = ray_tfar > curDist; /* stack emppty ? */ if (unlikely(curNode == BVH4i::invalidNode)) break; /* cull node if behind closest hit point */ if (unlikely(none(m_stackDist))) continue; /////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////// /* switch to single ray mode */ if (unlikely(countbits(m_stackDist) <= BVH4i::hybridSIMDUtilSwitchThreshold)) { float *__restrict__ stack_dist_single = (float*)sptr_dist; store16f(stack_dist_single,inf); /* traverse single ray */ long rayIndex = -1; while((rayIndex = bitscan64(rayIndex,m_stackDist)) != BITSCAN_NO_BIT_SET_64) { stack_node_single[0] = BVH4i::invalidNode; stack_node_single[1] = curNode; size_t sindex = 2; const mic_f org_xyz = loadAOS4to16f(rayIndex,ray16.org.x,ray16.org.y,ray16.org.z); const mic_f dir_xyz = loadAOS4to16f(rayIndex,ray16.dir.x,ray16.dir.y,ray16.dir.z); const mic_f rdir_xyz = loadAOS4to16f(rayIndex,rdir16.x,rdir16.y,rdir16.z); const mic_f org_rdir_xyz = org_xyz * rdir_xyz; const mic_f min_dist_xyz = broadcast1to16f(&ray16.tnear[rayIndex]); mic_f max_dist_xyz = broadcast1to16f(&ray16.tfar[rayIndex]); const unsigned int leaf_mask = BVH4I_LEAF_MASK; while (1) { NodeRef curNode = stack_node_single[sindex-1]; sindex--; traverse_single_intersect<ENABLE_COMPRESSED_BVH4I_NODES>(curNode, sindex, rdir_xyz, org_rdir_xyz, min_dist_xyz, max_dist_xyz, stack_node_single, stack_dist_single, nodes, leaf_mask); /* return if stack is empty */ if (unlikely(curNode == BVH4i::invalidNode)) break; /* intersect one ray against four triangles */ const bool hit = LeafIntersector::intersect(curNode, rayIndex, dir_xyz, org_xyz, min_dist_xyz, max_dist_xyz, ray16, accel, (Scene*)bvh->geometry); if (hit) compactStack(stack_node_single,stack_dist_single,sindex,max_dist_xyz); } } ray_tfar = select(valid0,ray16.tfar ,neg_inf); continue; } /////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////// /////////////////////////////////////////////////////////////////////////////////////////////////////////////// const unsigned int leaf_mask = BVH4I_LEAF_MASK; const mic3f org = ray16.org; const mic3f dir = ray16.dir; while (1) { /* test if this is a leaf node */ if (unlikely(curNode.isLeaf(leaf_mask))) break; STAT3(normal.trav_nodes,1,popcnt(ray_tfar > curDist),16); const Node* __restrict__ const node = curNode.node(nodes); prefetch<PFHINT_L1>((mic_f*)node + 0); prefetch<PFHINT_L1>((mic_f*)node + 1); /* pop of next node */ sptr_node--; sptr_dist--; curNode = *sptr_node; curDist = *sptr_dist; #pragma unroll(4) for (unsigned int i=0; i<4; i++) { BVH4i::NodeRef child; mic_f lclipMinX,lclipMinY,lclipMinZ; mic_f lclipMaxX,lclipMaxY,lclipMaxZ; if (!ENABLE_COMPRESSED_BVH4I_NODES) { child = node->lower[i].child; lclipMinX = msub(node->lower[i].x,rdir16.x,org_rdir16.x); lclipMinY = msub(node->lower[i].y,rdir16.y,org_rdir16.y); lclipMinZ = msub(node->lower[i].z,rdir16.z,org_rdir16.z); lclipMaxX = msub(node->upper[i].x,rdir16.x,org_rdir16.x); lclipMaxY = msub(node->upper[i].y,rdir16.y,org_rdir16.y); lclipMaxZ = msub(node->upper[i].z,rdir16.z,org_rdir16.z); } else { BVH4i::QuantizedNode* __restrict__ const compressed_node = (BVH4i::QuantizedNode*)node; child = compressed_node->child(i); const mic_f startXYZ = compressed_node->decompress_startXYZ(); const mic_f diffXYZ = compressed_node->decompress_diffXYZ(); const mic_f clower = compressed_node->decompress_lowerXYZ(startXYZ,diffXYZ); const mic_f cupper = compressed_node->decompress_upperXYZ(startXYZ,diffXYZ); lclipMinX = msub(mic_f(clower[4*i+0]),rdir16.x,org_rdir16.x); lclipMinY = msub(mic_f(clower[4*i+1]),rdir16.y,org_rdir16.y); lclipMinZ = msub(mic_f(clower[4*i+2]),rdir16.z,org_rdir16.z); lclipMaxX = msub(mic_f(cupper[4*i+0]),rdir16.x,org_rdir16.x); lclipMaxY = msub(mic_f(cupper[4*i+1]),rdir16.y,org_rdir16.y); lclipMaxZ = msub(mic_f(cupper[4*i+2]),rdir16.z,org_rdir16.z); } if (unlikely(i >=2 && child == BVH4i::invalidNode)) break; const mic_f lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ)); const mic_f lfarP = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ)); const mic_m lhit = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar); const mic_f childDist = select(lhit,lnearP,inf); const mic_m m_child_dist = childDist < curDist; /* if we hit the child we choose to continue with that child if it is closer than the current next child, or we push it onto the stack */ if (likely(any(lhit))) { sptr_node++; sptr_dist++; /* push cur node onto stack and continue with hit child */ if (any(m_child_dist)) { *(sptr_node-1) = curNode; *(sptr_dist-1) = curDist; curDist = childDist; curNode = child; } /* push hit child onto stack*/ else { *(sptr_node-1) = child; *(sptr_dist-1) = childDist; const char* __restrict__ const pnode = (char*)child.node(nodes); prefetch<PFHINT_L2>(pnode + 0); prefetch<PFHINT_L2>(pnode + 64); } assert(sptr_node - stack_node < BVH4i::maxDepth); } } #if SWITCH_ON_DOWN_TRAVERSAL == 1 const mic_m curUtil = ray_tfar > curDist; if (unlikely(countbits(curUtil) <= BVH4i::hybridSIMDUtilSwitchThreshold)) { *sptr_node++ = curNode; *sptr_dist++ = curDist; goto pop; } #endif } /* return if stack is empty */ if (unlikely(curNode == BVH4i::invalidNode)) break; /* intersect leaf */ const mic_m m_valid_leaf = ray_tfar > curDist; STAT3(normal.trav_leaves,1,popcnt(m_valid_leaf),16); LeafIntersector::intersect16(curNode,m_valid_leaf,dir,org,ray16,accel,(Scene*)bvh->geometry); ray_tfar = select(m_valid_leaf,ray16.tfar,ray_tfar); }
void BVH4mbIntersector16Single::occluded(mic_i* valid_i, BVH4mb* bvh, Ray16& ray16) { /* near and node stack */ __align(64) NodeRef stack_node[3*BVH4i::maxDepth+1]; /* setup */ const mic_m m_valid = *(mic_i*)valid_i != mic_i(0); const mic3f rdir16 = rcp_safe(ray16.dir); unsigned int terminated = toInt(!m_valid); const mic_f inf = mic_f(pos_inf); const mic_f zero = mic_f::zero(); const Node * __restrict__ nodes = (Node *)bvh->nodePtr(); const BVH4mb::Triangle01 * __restrict__ accel = (BVH4mb::Triangle01 *)bvh->triPtr(); stack_node[0] = BVH4i::invalidNode; long rayIndex = -1; while((rayIndex = bitscan64(rayIndex,toInt(m_valid))) != BITSCAN_NO_BIT_SET_64) { stack_node[1] = bvh->root; size_t sindex = 2; const mic_f org_xyz = loadAOS4to16f(rayIndex,ray16.org.x,ray16.org.y,ray16.org.z); const mic_f dir_xyz = loadAOS4to16f(rayIndex,ray16.dir.x,ray16.dir.y,ray16.dir.z); const mic_f rdir_xyz = loadAOS4to16f(rayIndex,rdir16.x,rdir16.y,rdir16.z); const mic_f org_rdir_xyz = org_xyz * rdir_xyz; const mic_f min_dist_xyz = broadcast1to16f(&ray16.tnear[rayIndex]); const mic_f max_dist_xyz = broadcast1to16f(&ray16.tfar[rayIndex]); const mic_f time = broadcast1to16f(&ray16.time[rayIndex]); const unsigned int leaf_mask = BVH4I_LEAF_MASK; while (1) { NodeRef curNode = stack_node[sindex-1]; sindex--; const mic_f one_time = (mic_f::one() - time); while (1) { /* test if this is a leaf node */ if (unlikely(curNode.isLeaf(leaf_mask))) break; const Node* __restrict__ const node = curNode.node(nodes); const float* __restrict const plower = (float*)node->lower; const float* __restrict const pupper = (float*)node->upper; prefetch<PFHINT_L1>((char*)node + 0*64); prefetch<PFHINT_L1>((char*)node + 1*64); prefetch<PFHINT_L1>((char*)node + 2*64); prefetch<PFHINT_L1>((char*)node + 3*64); const BVH4mb::Node* __restrict__ const nodeMB = (BVH4mb::Node*)node; const mic_f lower = one_time * load16f((float*)nodeMB->lower) + time * load16f((float*)nodeMB->lower_t1); const mic_f upper = one_time * load16f((float*)nodeMB->upper) + time * load16f((float*)nodeMB->upper_t1); /* intersect single ray with 4 bounding boxes */ const mic_f tLowerXYZ = lower * rdir_xyz - org_rdir_xyz; const mic_f tUpperXYZ = upper * rdir_xyz - org_rdir_xyz; const mic_f tLower = mask_min(0x7777,min_dist_xyz,tLowerXYZ,tUpperXYZ); const mic_f tUpper = mask_max(0x7777,max_dist_xyz,tLowerXYZ,tUpperXYZ); const Node* __restrict__ const next = curNode.node(nodes); prefetch<PFHINT_L2>((char*)next + 0); prefetch<PFHINT_L2>((char*)next + 64); sindex--; const mic_f tNear = vreduce_max4(tLower); const mic_f tFar = vreduce_min4(tUpper); const mic_m hitm = le(0x8888,tNear,tFar); const mic_f tNear_pos = select(hitm,tNear,inf); curNode = stack_node[sindex]; // early pop of next node /* if no child is hit, continue with early popped child */ if (unlikely(none(hitm))) continue; sindex++; const unsigned long hiti = toInt(hitm); const unsigned long pos_first = bitscan64(hiti); const unsigned long num_hitm = countbits(hiti); /* if a single child is hit, continue with that child */ curNode = ((unsigned int *)plower)[pos_first]; if (likely(num_hitm == 1)) continue; /* if two children are hit, push in correct order */ const unsigned long pos_second = bitscan64(pos_first,hiti); if (likely(num_hitm == 2)) { const unsigned int dist_first = ((unsigned int*)&tNear)[pos_first]; const unsigned int dist_second = ((unsigned int*)&tNear)[pos_second]; const unsigned int node_first = curNode; const unsigned int node_second = ((unsigned int*)plower)[pos_second]; if (dist_first <= dist_second) { stack_node[sindex] = node_second; sindex++; assert(sindex < 3*BVH4i::maxDepth+1); continue; } else { stack_node[sindex] = curNode; curNode = node_second; sindex++; assert(sindex < 3*BVH4i::maxDepth+1); continue; } } /* continue with closest child and push all others */ const mic_f min_dist = set_min_lanes(tNear_pos); const unsigned int old_sindex = sindex; sindex += countbits(hiti) - 1; assert(sindex < 3*BVH4i::maxDepth+1); const mic_m closest_child = eq(hitm,min_dist,tNear); const unsigned long closest_child_pos = bitscan64(closest_child); const mic_m m_pos = andn(hitm,andn(closest_child,(mic_m)((unsigned int)closest_child - 1))); const mic_i plower_node = load16i((int*)plower); curNode = ((unsigned int*)plower)[closest_child_pos]; compactustore16i(m_pos,&stack_node[old_sindex],plower_node); } /* return if stack is empty */ if (unlikely(curNode == BVH4i::invalidNode)) break; /* intersect one ray against four triangles */ ////////////////////////////////////////////////////////////////////////////////////////////////// const BVH4mb::Triangle01* tptr = (BVH4mb::Triangle01*) curNode.leaf(accel); prefetch<PFHINT_L1>((mic_f*)tptr + 0); prefetch<PFHINT_L1>((mic_f*)tptr + 1); prefetch<PFHINT_L1>((mic_f*)tptr + 2); prefetch<PFHINT_L1>((mic_f*)tptr + 3); const mic_i and_mask = broadcast4to16i(zlc4); const mic_f v0_t0 = gather_4f_zlc(and_mask, (float*)&tptr[0].t0.v0, (float*)&tptr[1].t0.v0, (float*)&tptr[2].t0.v0, (float*)&tptr[3].t0.v0); const mic_f v1_t0 = gather_4f_zlc(and_mask, (float*)&tptr[0].t0.v1, (float*)&tptr[1].t0.v1, (float*)&tptr[2].t0.v1, (float*)&tptr[3].t0.v1); const mic_f v2_t0 = gather_4f_zlc(and_mask, (float*)&tptr[0].t0.v2, (float*)&tptr[1].t0.v2, (float*)&tptr[2].t0.v2, (float*)&tptr[3].t0.v2); prefetch<PFHINT_L2>((mic_f*)tptr + 4); prefetch<PFHINT_L2>((mic_f*)tptr + 5); prefetch<PFHINT_L2>((mic_f*)tptr + 6); prefetch<PFHINT_L2>((mic_f*)tptr + 7); const mic_f v0_t1 = gather_4f_zlc(and_mask, (float*)&tptr[0].t1.v0, (float*)&tptr[1].t1.v0, (float*)&tptr[2].t1.v0, (float*)&tptr[3].t1.v0); const mic_f v1_t1 = gather_4f_zlc(and_mask, (float*)&tptr[0].t1.v1, (float*)&tptr[1].t1.v1, (float*)&tptr[2].t1.v1, (float*)&tptr[3].t1.v1); const mic_f v2_t1 = gather_4f_zlc(and_mask, (float*)&tptr[0].t1.v2, (float*)&tptr[1].t1.v2, (float*)&tptr[2].t1.v2, (float*)&tptr[3].t1.v2); const mic_f v0 = v0_t0 * one_time + time * v0_t1; const mic_f v1 = v1_t0 * one_time + time * v1_t1; const mic_f v2 = v2_t0 * one_time + time * v2_t1; const mic_f e1 = v1 - v0; const mic_f e2 = v0 - v2; const mic_f normal = lcross_zxy(e1,e2); const mic_f org = v0 - org_xyz; const mic_f odzxy = msubr231(org * swizzle(dir_xyz,_MM_SWIZ_REG_DACB), dir_xyz, swizzle(org,_MM_SWIZ_REG_DACB)); const mic_f den = ldot3_zxy(dir_xyz,normal); const mic_f rcp_den = rcp(den); const mic_f uu = ldot3_zxy(e2,odzxy); const mic_f vv = ldot3_zxy(e1,odzxy); const mic_f u = uu * rcp_den; const mic_f v = vv * rcp_den; #if defined(__BACKFACE_CULLING__) const mic_m m_init = (mic_m)0x1111 & (den > zero); #else const mic_m m_init = 0x1111; #endif const mic_m valid_u = ge((mic_m)m_init,u,zero); const mic_m valid_v = ge(valid_u,v,zero); const mic_m m_aperture = le(valid_v,u+v,mic_f::one()); const mic_f nom = ldot3_zxy(org,normal); const mic_f t = rcp_den*nom; if (unlikely(none(m_aperture))) continue; mic_m m_final = lt(lt(m_aperture,min_dist_xyz,t),t,max_dist_xyz); #if defined(__USE_RAY_MASK__) const mic_i rayMask(ray16.mask[rayIndex]); const mic_i triMask = swDDDD(gather16i_4i_align(&tptr[0].t0.v2,&tptr[1].t0.v2,&tptr[2].t0.v2,&tptr[3].t0.v2)); const mic_m m_ray_mask = (rayMask & triMask) != mic_i::zero(); m_final &= m_ray_mask; #endif if (unlikely(any(m_final))) { terminated |= mic_m::shift1[rayIndex]; break; } ////////////////////////////////////////////////////////////////////////////////////////////////// } if (unlikely(all(toMask(terminated)))) break; } store16i(m_valid & toMask(terminated),&ray16.geomID,0); }
void BVH4mbIntersector16Chunk<LeafIntersector>::intersect(mic_i* valid_i, BVH4mb* bvh, Ray16& ray) { /* near and node stack */ __aligned(64) mic_f stack_dist[3*BVH4i::maxDepth+1]; __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1]; /* load ray */ const mic_m valid0 = *(mic_i*)valid_i != mic_i(0); const mic3f rdir = rcp_safe(ray.dir); const mic3f org_rdir = ray.org * rdir; mic_f ray_tnear = select(valid0,ray.tnear,pos_inf); mic_f ray_tfar = select(valid0,ray.tfar ,neg_inf); const mic_f inf = mic_f(pos_inf); /* allocate stack and push root node */ stack_node[0] = BVH4i::invalidNode; stack_dist[0] = inf; stack_node[1] = bvh->root; stack_dist[1] = ray_tnear; NodeRef* __restrict__ sptr_node = stack_node + 2; mic_f* __restrict__ sptr_dist = stack_dist + 2; const Node * __restrict__ nodes = (Node *)bvh->nodePtr(); const BVH4mb::Triangle01 * __restrict__ accel = (BVH4mb::Triangle01 *)bvh->triPtr(); while (1) { /* pop next node from stack */ NodeRef curNode = *(sptr_node-1); mic_f curDist = *(sptr_dist-1); sptr_node--; sptr_dist--; const mic_m m_stackDist = ray_tfar > curDist; /* stack emppty ? */ if (unlikely(curNode == BVH4i::invalidNode)) break; /* cull node if behind closest hit point */ if (unlikely(none(m_stackDist))) { continue; } const unsigned int leaf_mask = BVH4I_LEAF_MASK; traverse_chunk_intersect(curNode, curDist, rdir, org_rdir, ray_tnear, ray_tfar, ray.time, sptr_node, sptr_dist, nodes, leaf_mask); /* return if stack is empty */ if (unlikely(curNode == BVH4i::invalidNode)) break; /* intersect leaf */ const mic_m m_valid_leaf = ray_tfar > curDist; STAT3(normal.trav_leaves,1,popcnt(m_valid_leaf),16); LeafIntersector::intersect16(curNode, m_valid_leaf, ray.dir, ray.org, ray, accel, (Scene*)bvh->geometry); ray_tfar = select(m_valid_leaf,ray.tfar,ray_tfar); } }