int volk_fec_rank_archs( const char *kern_name, //name of the kernel to rank const char *impl_names[], //list of implementations by name const int* impl_deps, //requirement mask per implementation const bool* alignment, //alignment status of each implementation size_t n_impls, //number of implementations available const bool align //if false, filter aligned implementations ){ size_t i; static volk_fec_arch_pref_t *volk_fec_arch_prefs; static size_t n_arch_prefs = 0; static int prefs_loaded = 0; if(!prefs_loaded) { n_arch_prefs = volk_fec_load_preferences(&volk_fec_arch_prefs); prefs_loaded = 1; } //now look for the function name in the prefs list for(i = 0; i < n_arch_prefs; i++) { if(!strncmp(kern_name, volk_fec_arch_prefs[i].name, sizeof(volk_fec_arch_prefs[i].name))) //found it { printf("%s ", volk_fec_arch_prefs[i].name); const char *impl_name = align? volk_fec_arch_prefs[i].impl_a : volk_fec_arch_prefs[i].impl_u; return volk_fec_get_index(impl_names, n_impls, impl_name); } } //return the best index with the largest deps size_t best_index_a = 0; size_t best_index_u = 0; int best_value_a = -1; int best_value_u = -1; for(i = 0; i < n_impls; i++) { const signed val = __popcnt(impl_deps[i]); if (alignment[i] && val > best_value_a) { best_index_a = i; best_value_a = val; } if (!alignment[i] && val > best_value_u) { best_index_u = i; best_value_u = val; } } //when align and we found a best aligned, use it if (align && best_value_a != -1) return best_index_a; //otherwise return the best unaligned return best_index_u; }
void FullScreenPass::init(const std::string& vsFile, const std::string& psFile, const Program::DefineList& programDefines, bool disableDepth, bool disableStencil, uint32_t viewportMask, bool enableSPS) { mpPipelineState = GraphicsState::create(); mpPipelineState->toggleSinglePassStereo(enableSPS); // create depth stencil state DepthStencilState::Desc dsDesc; dsDesc.setDepthTest(!disableDepth); dsDesc.setDepthWriteMask(!disableDepth); dsDesc.setDepthFunc(DepthStencilState::Func::LessEqual); // Equal is needed to allow overdraw when z is enabled (e.g., background pass etc.) dsDesc.setStencilTest(!disableStencil); dsDesc.setStencilWriteMask(!disableStencil); mpDepthStencilState = DepthStencilState::create(dsDesc); Program::DefineList defs = programDefines; std::string gs; if(viewportMask) { defs.add("_VIEWPORT_MASK", std::to_string(viewportMask)); if(checkForViewportArray2Support()) { defs.add("_USE_VP2_EXT"); } else { defs.add("_OUTPUT_VERTEX_COUNT", std::to_string(3 * __popcnt(viewportMask))); #ifdef FALCOR_VK gs = "Framework/Shaders/FullScreenPass.gs.glsl"; #else gs = "Framework/Shaders/FullScreenPass.gs.slang"; #endif } } const std::string vs(vsFile.empty() ? "Framework/Shaders/FullScreenPass.vs.slang" : vsFile); mpProgram = GraphicsProgram::createFromFile(vs, psFile, gs, "", "", defs); mpPipelineState->setProgram(mpProgram); if (FullScreenPass::spVertexBuffer == nullptr) { initStaticObjects(spVertexBuffer, spVao); } mpPipelineState->setVao(FullScreenPass::spVao); }
Size countBits(Size a) { #ifdef __GNUC__ #ifdef __X64__ return __builtin_popcountl(a); #else return __builtin_popcount(a); #endif #elif defined(_MSC_VER) #ifdef __X64__ return __popcnt64(a); #else return __popcnt(a); #endif #else //Very naive implementation. Size c = 0; while(a) { if(a & 1) c++; a >>= 1; } return c; #endif }
// reduction operations INLINE size_t popcnt(const avxb& a) {return __popcnt(_mm256_movemask_ps(a));}
void BVH8iIntersector8Hybrid<TriangleIntersector8>::occluded(avxb* valid_i, BVH8i* bvh, Ray8& ray) { /* load ray */ const avxb valid = *valid_i; avxb terminated = !valid; avx3f ray_org = ray.org, ray_dir = ray.dir; avxf ray_tnear = ray.tnear, ray_tfar = ray.tfar; #if defined(__FIX_RAYS__) const avxf float_range = 0.1f*FLT_MAX; ray_org = clamp(ray_org,avx3f(-float_range),avx3f(+float_range)); ray_dir = clamp(ray_dir,avx3f(-float_range),avx3f(+float_range)); ray_tnear = max(ray_tnear,FLT_MIN); ray_tfar = min(ray_tfar,float(inf)); #endif const avx3f rdir = rcp_safe(ray_dir); const avx3f org(ray_org), org_rdir = org * rdir; ray_tnear = select(valid,ray_tnear,avxf(pos_inf)); ray_tfar = select(valid,ray_tfar ,avxf(neg_inf)); const avxf inf = avxf(pos_inf); /* compute near/far per ray */ avx3i nearXYZ; nearXYZ.x = select(rdir.x >= 0.0f,avxi(0*(int)sizeof(avxf)),avxi(1*(int)sizeof(avxf))); nearXYZ.y = select(rdir.y >= 0.0f,avxi(2*(int)sizeof(avxf)),avxi(3*(int)sizeof(avxf))); nearXYZ.z = select(rdir.z >= 0.0f,avxi(4*(int)sizeof(avxf)),avxi(5*(int)sizeof(avxf))); /* allocate stack and push root node */ avxf stack_near[stackSizeChunk]; NodeRef stack_node[stackSizeChunk]; stack_node[0] = BVH4i::invalidNode; stack_near[0] = inf; stack_node[1] = bvh->root; stack_near[1] = ray_tnear; NodeRef* stackEnd = stack_node+stackSizeChunk; NodeRef* __restrict__ sptr_node = stack_node + 2; avxf* __restrict__ sptr_near = stack_near + 2; const Node * __restrict__ nodes = (Node *)bvh->nodePtr(); const Triangle * __restrict__ accel = (Triangle*)bvh->triPtr(); while (1) { /* pop next node from stack */ assert(sptr_node > stack_node); sptr_node--; sptr_near--; NodeRef curNode = *sptr_node; if (unlikely(curNode == BVH4i::invalidNode)) { assert(sptr_node == stack_node); break; } /* cull node if behind closest hit point */ avxf curDist = *sptr_near; const avxb active = curDist < ray_tfar; if (unlikely(none(active))) continue; /* switch to single ray traversal */ #if !defined(__WIN32__) || defined(__X86_64__) size_t bits = movemask(active); if (unlikely(__popcnt(bits) <= SWITCH_THRESHOLD)) { for (size_t i=__bsf(bits); bits!=0; bits=__btc(bits,i), i=__bsf(bits)) { if (occluded1(bvh,curNode,i,ray,ray_org,ray_dir,rdir,ray_tnear,ray_tfar,nearXYZ)) terminated[i] = -1; } if (all(terminated)) break; ray_tfar = select(terminated,avxf(neg_inf),ray_tfar); continue; } #endif while (1) { /* test if this is a leaf node */ if (unlikely(curNode.isLeaf())) break; const avxb valid_node = ray_tfar > curDist; STAT3(shadow.trav_nodes,1,popcnt(valid_node),8); const Node* __restrict__ const node = (Node*)curNode.node(nodes); /* pop of next node */ assert(sptr_node > stack_node); sptr_node--; sptr_near--; curNode = *sptr_node; curDist = *sptr_near; for (unsigned i=0; i<8; i++) { const NodeRef child = node->children[i]; if (unlikely(child == BVH4i::emptyNode)) break; #if defined(__AVX2__) const avxf lclipMinX = msub(node->lower_x[i],rdir.x,org_rdir.x); const avxf lclipMinY = msub(node->lower_y[i],rdir.y,org_rdir.y); const avxf lclipMinZ = msub(node->lower_z[i],rdir.z,org_rdir.z); const avxf lclipMaxX = msub(node->upper_x[i],rdir.x,org_rdir.x); const avxf lclipMaxY = msub(node->upper_y[i],rdir.y,org_rdir.y); const avxf lclipMaxZ = msub(node->upper_z[i],rdir.z,org_rdir.z); const avxf lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); const avxf lfarP = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); const avxb lhit = maxi(lnearP,ray_tnear) <= mini(lfarP,ray_tfar); #else const avxf lclipMinX = (node->lower_x[i] - org.x) * rdir.x; const avxf lclipMinY = (node->lower_y[i] - org.y) * rdir.y; const avxf lclipMinZ = (node->lower_z[i] - org.z) * rdir.z; const avxf lclipMaxX = (node->upper_x[i] - org.x) * rdir.x; const avxf lclipMaxY = (node->upper_y[i] - org.y) * rdir.y; const avxf lclipMaxZ = (node->upper_z[i] - org.z) * rdir.z; const avxf lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ)); const avxf lfarP = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ)); const avxb lhit = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar); #endif /* if we hit the child we choose to continue with that child if it is closer than the current next child, or we push it onto the stack */ if (likely(any(lhit))) { assert(sptr_node < stackEnd); assert(child != BVH4i::emptyNode); const avxf childDist = select(lhit,lnearP,inf); sptr_node++; sptr_near++; /* push cur node onto stack and continue with hit child */ if (any(childDist < curDist)) { *(sptr_node-1) = curNode; *(sptr_near-1) = curDist; curDist = childDist; curNode = child; } /* push hit child onto stack */ else { *(sptr_node-1) = child; *(sptr_near-1) = childDist; } } } } /* return if stack is empty */ if (unlikely(curNode == BVH4i::invalidNode)) { assert(sptr_node == stack_node); break; } /* intersect leaf */ const avxb valid_leaf = ray_tfar > curDist; STAT3(shadow.trav_leaves,1,popcnt(valid_leaf),8); size_t items; const Triangle* prim = (Triangle*) curNode.leaf(accel,items); terminated |= TriangleIntersector8::occluded(!terminated,ray,prim,items,bvh->geometry); if (all(terminated)) break; ray_tfar = select(terminated,avxf(neg_inf),ray_tfar); } store8i(valid & terminated,&ray.geomID,0); AVX_ZERO_UPPER(); }
void BVH4Intersector4Hybrid<types,robust,PrimitiveIntersector4>::intersect(bool4* valid_i, BVH4* bvh, Ray4& ray) { /* verify correct input */ bool4 valid0 = *valid_i; #if defined(RTCORE_IGNORE_INVALID_RAYS) valid0 &= ray.valid(); #endif assert(all(valid0,ray.tnear > -FLT_MIN)); assert(!(types & BVH4::FLAG_NODE_MB) || all(valid0,ray.time >= 0.0f & ray.time <= 1.0f)); /* load ray */ Vec3f4 ray_org = ray.org; Vec3f4 ray_dir = ray.dir; float4 ray_tnear = ray.tnear, ray_tfar = ray.tfar; const Vec3f4 rdir = rcp_safe(ray_dir); const Vec3f4 org(ray_org), org_rdir = org * rdir; ray_tnear = select(valid0,ray_tnear,float4(pos_inf)); ray_tfar = select(valid0,ray_tfar ,float4(neg_inf)); const float4 inf = float4(pos_inf); Precalculations pre(valid0,ray); /* compute near/far per ray */ Vec3i4 nearXYZ; nearXYZ.x = select(rdir.x >= 0.0f,int4(0*(int)sizeof(float4)),int4(1*(int)sizeof(float4))); nearXYZ.y = select(rdir.y >= 0.0f,int4(2*(int)sizeof(float4)),int4(3*(int)sizeof(float4))); nearXYZ.z = select(rdir.z >= 0.0f,int4(4*(int)sizeof(float4)),int4(5*(int)sizeof(float4))); /* allocate stack and push root node */ float4 stack_near[stackSizeChunk]; NodeRef stack_node[stackSizeChunk]; stack_node[0] = BVH4::invalidNode; stack_near[0] = inf; stack_node[1] = bvh->root; stack_near[1] = ray_tnear; NodeRef* stackEnd = stack_node+stackSizeChunk; NodeRef* __restrict__ sptr_node = stack_node + 2; float4* __restrict__ sptr_near = stack_near + 2; while (1) pop: { /* pop next node from stack */ assert(sptr_node > stack_node); sptr_node--; sptr_near--; NodeRef cur = *sptr_node; if (unlikely(cur == BVH4::invalidNode)) { assert(sptr_node == stack_node); break; } /* cull node if behind closest hit point */ float4 curDist = *sptr_near; const bool4 active = curDist < ray_tfar; if (unlikely(none(active))) continue; /* switch to single ray traversal */ #if !defined(__WIN32__) || defined(__X86_64__) size_t bits = movemask(active); if (unlikely(__popcnt(bits) <= SWITCH_THRESHOLD)) { for (size_t i=__bsf(bits); bits!=0; bits=__btc(bits,i), i=__bsf(bits)) { BVH4Intersector4Single<types,robust,PrimitiveIntersector4>::intersect1(bvh, cur, i, pre, ray, ray_org, ray_dir, rdir, ray_tnear, ray_tfar, nearXYZ); } ray_tfar = min(ray_tfar,ray.tfar); continue; } #endif while (1) { /* process normal nodes */ if (likely((types & 0x1) && cur.isNode())) { const bool4 valid_node = ray_tfar > curDist; STAT3(normal.trav_nodes,1,popcnt(valid_node),4); const Node* __restrict__ const node = cur.node(); /* pop of next node */ assert(sptr_node > stack_node); sptr_node--; sptr_near--; cur = *sptr_node; curDist = *sptr_near; #pragma unroll(4) for (unsigned i=0; i<BVH4::N; i++) { const NodeRef child = node->children[i]; if (unlikely(child == BVH4::emptyNode)) break; float4 lnearP; const bool4 lhit = intersect_node<robust>(node,i,org,rdir,org_rdir,ray_tnear,ray_tfar,lnearP); /* if we hit the child we choose to continue with that child if it is closer than the current next child, or we push it onto the stack */ if (likely(any(lhit))) { assert(sptr_node < stackEnd); assert(child != BVH4::emptyNode); const float4 childDist = select(lhit,lnearP,inf); sptr_node++; sptr_near++; /* push cur node onto stack and continue with hit child */ if (any(childDist < curDist)) { *(sptr_node-1) = cur; *(sptr_near-1) = curDist; curDist = childDist; cur = child; } /* push hit child onto stack */ else { *(sptr_node-1) = child; *(sptr_near-1) = childDist; } } } #if SWITCH_DURING_DOWN_TRAVERSAL == 1 // seems to be the best place for testing utilization if (unlikely(popcnt(ray_tfar > curDist) <= SWITCH_THRESHOLD)) { *sptr_node++ = cur; *sptr_near++ = curDist; goto pop; } #endif } /* process motion blur nodes */ else if (likely((types & 0x10) && cur.isNodeMB())) { const bool4 valid_node = ray_tfar > curDist; STAT3(normal.trav_nodes,1,popcnt(valid_node),4); const BVH4::NodeMB* __restrict__ const node = cur.nodeMB(); /* pop of next node */ assert(sptr_node > stack_node); sptr_node--; sptr_near--; cur = *sptr_node; curDist = *sptr_near; #pragma unroll(4) for (unsigned i=0; i<BVH4::N; i++) { const NodeRef child = node->child(i); if (unlikely(child == BVH4::emptyNode)) break; float4 lnearP; const bool4 lhit = intersect_node(node,i,org,rdir,org_rdir,ray_tnear,ray_tfar,ray.time,lnearP); /* if we hit the child we choose to continue with that child if it is closer than the current next child, or we push it onto the stack */ if (likely(any(lhit))) { assert(sptr_node < stackEnd); assert(child != BVH4::emptyNode); const float4 childDist = select(lhit,lnearP,inf); sptr_node++; sptr_near++; /* push cur node onto stack and continue with hit child */ if (any(childDist < curDist)) { *(sptr_node-1) = cur; *(sptr_near-1) = curDist; curDist = childDist; cur = child; } /* push hit child onto stack */ else { *(sptr_node-1) = child; *(sptr_near-1) = childDist; } } } #if SWITCH_DURING_DOWN_TRAVERSAL == 1 // seems to be the best place for testing utilization if (unlikely(popcnt(ray_tfar > curDist) <= SWITCH_THRESHOLD)) { *sptr_node++ = cur; *sptr_near++ = curDist; goto pop; } #endif } else break; }
void BVH8Intersector8Hybrid<PrimitiveIntersector8>::occluded(bool8* valid_i, BVH8* bvh, Ray8& ray) { /* load ray */ const bool8 valid = *valid_i; bool8 terminated = !valid; Vec3f8 ray_org = ray.org, ray_dir = ray.dir; float8 ray_tnear = ray.tnear, ray_tfar = ray.tfar; const Vec3f8 rdir = rcp_safe(ray_dir); const Vec3f8 org(ray_org), org_rdir = org * rdir; ray_tnear = select(valid,ray_tnear,float8(pos_inf)); ray_tfar = select(valid,ray_tfar ,float8(neg_inf)); const float8 inf = float8(pos_inf); Precalculations pre(valid,ray); /* compute near/far per ray */ Vec3i8 nearXYZ; nearXYZ.x = select(rdir.x >= 0.0f,int8(0*(int)sizeof(float8)),int8(1*(int)sizeof(float8))); nearXYZ.y = select(rdir.y >= 0.0f,int8(2*(int)sizeof(float8)),int8(3*(int)sizeof(float8))); nearXYZ.z = select(rdir.z >= 0.0f,int8(4*(int)sizeof(float8)),int8(5*(int)sizeof(float8))); /* allocate stack and push root node */ float8 stack_near[stackSizeChunk]; NodeRef stack_node[stackSizeChunk]; stack_node[0] = BVH8::invalidNode; stack_near[0] = inf; stack_node[1] = bvh->root; stack_near[1] = ray_tnear; NodeRef* stackEnd = stack_node+stackSizeChunk; NodeRef* __restrict__ sptr_node = stack_node + 2; float8* __restrict__ sptr_near = stack_near + 2; while (1) { /* pop next node from stack */ assert(sptr_node > stack_node); sptr_node--; sptr_near--; NodeRef cur = *sptr_node; if (unlikely(cur == BVH8::invalidNode)) { assert(sptr_node == stack_node); break; } /* cull node if behind closest hit point */ float8 curDist = *sptr_near; const bool8 active = curDist < ray_tfar; if (unlikely(none(active))) continue; /* switch to single ray traversal */ #if !defined(__WIN32__) || defined(__X86_64__) size_t bits = movemask(active); if (unlikely(__popcnt(bits) <= SWITCH_THRESHOLD)) { for (size_t i=__bsf(bits); bits!=0; bits=__btc(bits,i), i=__bsf(bits)) { if (occluded1(bvh,cur,i,pre,ray,ray_org,ray_dir,rdir,ray_tnear,ray_tfar,nearXYZ)) terminated[i] = -1; } if (all(terminated)) break; ray_tfar = select(terminated,float8(neg_inf),ray_tfar); continue; } #endif while (1) { /* test if this is a leaf node */ if (unlikely(cur.isLeaf())) break; const bool8 valid_node = ray_tfar > curDist; STAT3(shadow.trav_nodes,1,popcnt(valid_node),8); const Node* __restrict__ const node = (Node*)cur.node(); /* pop of next node */ assert(sptr_node > stack_node); sptr_node--; sptr_near--; cur = *sptr_node; curDist = *sptr_near; for (unsigned i=0; i<BVH8::N; i++) { const NodeRef child = node->children[i]; if (unlikely(child == BVH8::emptyNode)) break; #if defined(__AVX2__) const float8 lclipMinX = msub(node->lower_x[i],rdir.x,org_rdir.x); const float8 lclipMinY = msub(node->lower_y[i],rdir.y,org_rdir.y); const float8 lclipMinZ = msub(node->lower_z[i],rdir.z,org_rdir.z); const float8 lclipMaxX = msub(node->upper_x[i],rdir.x,org_rdir.x); const float8 lclipMaxY = msub(node->upper_y[i],rdir.y,org_rdir.y); const float8 lclipMaxZ = msub(node->upper_z[i],rdir.z,org_rdir.z); const float8 lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); const float8 lfarP = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); const bool8 lhit = maxi(lnearP,ray_tnear) <= mini(lfarP,ray_tfar); #else const float8 lclipMinX = (node->lower_x[i] - org.x) * rdir.x; const float8 lclipMinY = (node->lower_y[i] - org.y) * rdir.y; const float8 lclipMinZ = (node->lower_z[i] - org.z) * rdir.z; const float8 lclipMaxX = (node->upper_x[i] - org.x) * rdir.x; const float8 lclipMaxY = (node->upper_y[i] - org.y) * rdir.y; const float8 lclipMaxZ = (node->upper_z[i] - org.z) * rdir.z; const float8 lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ)); const float8 lfarP = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ)); const bool8 lhit = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar); #endif /* if we hit the child we choose to continue with that child if it is closer than the current next child, or we push it onto the stack */ if (likely(any(lhit))) { assert(sptr_node < stackEnd); assert(child != BVH8::emptyNode); const float8 childDist = select(lhit,lnearP,inf); sptr_node++; sptr_near++; /* push cur node onto stack and continue with hit child */ if (any(childDist < curDist)) { *(sptr_node-1) = cur; *(sptr_near-1) = curDist; curDist = childDist; cur = child; } /* push hit child onto stack */ else { *(sptr_node-1) = child; *(sptr_near-1) = childDist; } } } } /* return if stack is empty */ if (unlikely(cur == BVH8::invalidNode)) { assert(sptr_node == stack_node); break; } /* intersect leaf */ assert(cur != BVH8::emptyNode); const bool8 valid_leaf = ray_tfar > curDist; STAT3(shadow.trav_leaves,1,popcnt(valid_leaf),8); size_t items; const Triangle* prim = (Triangle*) cur.leaf(items); terminated |= PrimitiveIntersector8::occluded(!terminated,pre,ray,prim,items,bvh->scene); if (all(terminated)) break; ray_tfar = select(terminated,float8(neg_inf),ray_tfar); } store8i(valid & terminated,&ray.geomID,0); AVX_ZERO_UPPER(); }
int volk_gnsssdr_rank_archs( const char *kern_name, //name of the kernel to rank const char *impl_names[], //list of implementations by name const int *impl_deps, //requirement mask per implementation const bool *alignment, //alignment status of each implementation size_t n_impls, //number of implementations available const bool align //if false, filter aligned implementations ) { size_t i; static volk_gnsssdr_arch_pref_t *volk_gnsssdr_arch_prefs; static size_t n_arch_prefs = 0; static int prefs_loaded = 0; if (!prefs_loaded) { n_arch_prefs = volk_gnsssdr_load_preferences(&volk_gnsssdr_arch_prefs); prefs_loaded = 1; } // If we've defined VOLK_GENERIC to be anything, always return the // 'generic' kernel. Used in GR's QA code. char *gen_env = getenv("VOLK_GENERIC"); if (gen_env) { return volk_gnsssdr_get_index(impl_names, n_impls, "generic"); } //now look for the function name in the prefs list for (i = 0; i < n_arch_prefs; i++) { if (!strncmp(kern_name, volk_gnsssdr_arch_prefs[i].name, sizeof(volk_gnsssdr_arch_prefs[i].name))) //found it { const char *impl_name = align ? volk_gnsssdr_arch_prefs[i].impl_a : volk_gnsssdr_arch_prefs[i].impl_u; return volk_gnsssdr_get_index(impl_names, n_impls, impl_name); } } //return the best index with the largest deps size_t best_index_a = 0; size_t best_index_u = 0; int best_value_a = -1; int best_value_u = -1; for (i = 0; i < n_impls; i++) { const signed val = __popcnt(impl_deps[i]); if (alignment[i] && val > best_value_a) { best_index_a = i; best_value_a = val; } if (!alignment[i] && val > best_value_u) { best_index_u = i; best_value_u = val; } } //when align and we found a best aligned, use it if (align && best_value_a != -1) return best_index_a; //otherwise return the best unaligned return best_index_u; }
void BVH4Intersector4Hybrid<PrimitiveIntersector4>::intersect(sseb* valid_i, BVH4* bvh, Ray4& ray) { /* load ray */ const sseb valid0 = *valid_i; sse3f ray_org = ray.org, ray_dir = ray.dir; ssef ray_tnear = ray.tnear, ray_tfar = ray.tfar; #if defined(__FIX_RAYS__) const ssef float_range = 0.1f*FLT_MAX; ray_org = clamp(ray_org,sse3f(-float_range),sse3f(+float_range)); ray_dir = clamp(ray_dir,sse3f(-float_range),sse3f(+float_range)); ray_tnear = max(ray_tnear,FLT_MIN); ray_tfar = min(ray_tfar,float(inf)); #endif const sse3f rdir = rcp_safe(ray_dir); const sse3f org(ray_org), org_rdir = org * rdir; ray_tnear = select(valid0,ray_tnear,ssef(pos_inf)); ray_tfar = select(valid0,ray_tfar ,ssef(neg_inf)); const ssef inf = ssef(pos_inf); /* allocate stack and push root node */ ssef stack_near[stackSizeChunk]; NodeRef stack_node[stackSizeChunk]; stack_node[0] = BVH4::invalidNode; stack_near[0] = inf; stack_node[1] = bvh->root; stack_near[1] = ray_tnear; NodeRef* stackEnd = stack_node+stackSizeChunk; NodeRef* __restrict__ sptr_node = stack_node + 2; ssef* __restrict__ sptr_near = stack_near + 2; while (1) { /* pop next node from stack */ assert(sptr_node > stack_node); sptr_node--; sptr_near--; NodeRef curNode = *sptr_node; if (unlikely(curNode == BVH4::invalidNode)) { assert(sptr_node == stack_node); break; } /* cull node if behind closest hit point */ ssef curDist = *sptr_near; const sseb active = curDist < ray_tfar; if (unlikely(none(active))) continue; /* switch to single ray traversal */ #if !defined(__WIN32__) || defined(__X86_64__) size_t bits = movemask(active); if (unlikely(__popcnt(bits) <= SWITCH_THRESHOLD)) { for (size_t i=__bsf(bits); bits!=0; bits=__btc(bits,i), i=__bsf(bits)) { intersect1(bvh,curNode,i,ray,ray_org,ray_dir,rdir,ray_tnear,ray_tfar); } ray_tfar = ray.tfar; continue; } #endif while (1) { /* test if this is a leaf node */ if (unlikely(curNode.isLeaf())) break; const sseb valid_node = ray_tfar > curDist; STAT3(normal.trav_nodes,1,popcnt(valid_node),4); const Node* __restrict__ const node = curNode.node(); /* pop of next node */ assert(sptr_node > stack_node); sptr_node--; sptr_near--; curNode = *sptr_node; curDist = *sptr_near; #pragma unroll(4) for (unsigned i=0; i<4; i++) { const NodeRef child = node->children[i]; if (unlikely(child == BVH4::emptyNode)) break; #if defined(__AVX2__) const ssef lclipMinX = msub(node->lower_x[i],rdir.x,org_rdir.x); const ssef lclipMinY = msub(node->lower_y[i],rdir.y,org_rdir.y); const ssef lclipMinZ = msub(node->lower_z[i],rdir.z,org_rdir.z); const ssef lclipMaxX = msub(node->upper_x[i],rdir.x,org_rdir.x); const ssef lclipMaxY = msub(node->upper_y[i],rdir.y,org_rdir.y); const ssef lclipMaxZ = msub(node->upper_z[i],rdir.z,org_rdir.z); #else const ssef lclipMinX = (node->lower_x[i] - org.x) * rdir.x; const ssef lclipMinY = (node->lower_y[i] - org.y) * rdir.y; const ssef lclipMinZ = (node->lower_z[i] - org.z) * rdir.z; const ssef lclipMaxX = (node->upper_x[i] - org.x) * rdir.x; const ssef lclipMaxY = (node->upper_y[i] - org.y) * rdir.y; const ssef lclipMaxZ = (node->upper_z[i] - org.z) * rdir.z; #endif #if defined(__SSE4_1__) const ssef lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ)); const ssef lfarP = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ)); const sseb lhit = maxi(lnearP,ray_tnear) <= mini(lfarP,ray_tfar); #else const ssef lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ)); const ssef lfarP = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ)); const sseb lhit = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar); #endif /* if we hit the child we choose to continue with that child if it is closer than the current next child, or we push it onto the stack */ if (likely(any(lhit))) { assert(sptr_node < stackEnd); const ssef childDist = select(lhit,lnearP,inf); const NodeRef child = node->children[i]; assert(child != BVH4::emptyNode); sptr_node++; sptr_near++; /* push cur node onto stack and continue with hit child */ if (any(childDist < curDist)) { *(sptr_node-1) = curNode; *(sptr_near-1) = curDist; curDist = childDist; curNode = child; } /* push hit child onto stack */ else { *(sptr_node-1) = child; *(sptr_near-1) = childDist; } } } } /* return if stack is empty */ if (unlikely(curNode == BVH4::invalidNode)) { assert(sptr_node == stack_node); break; } /* intersect leaf */ const sseb valid_leaf = ray_tfar > curDist; STAT3(normal.trav_leaves,1,popcnt(valid_leaf),4); size_t items; const Primitive* prim = (Primitive*) curNode.leaf(items); PrimitiveIntersector4::intersect(valid_leaf,ray,prim,items,bvh->geometry); ray_tfar = select(valid_leaf,ray.tfar,ray_tfar); } AVX_ZERO_UPPER(); }
static inline int popcnt(std::uint32_t n) { return __popcnt(n); }
void BVH4Intersector4Hybrid<TriangleIntersector4>::intersect(const BVH4Intersector4Hybrid* This, Ray4& ray, const __m128 valid_i) { sseb valid = valid_i; const BVH4* bvh = This->bvh; STAT3(normal.travs,1,popcnt(valid),4); NodeRef invalid = (NodeRef)1; /* load ray into registers */ ssef ray_near = select(valid,ray.tnear,pos_inf); ssef ray_far = select(valid,ray.tfar ,neg_inf); sse3f rdir = rcp_safe(ray.dir); ray.tfar = ray_far; /* allocate stack and push root node */ NodeRef stack_node[3*BVH4::maxDepth+1]; ssef stack_near[3*BVH4::maxDepth+1]; stack_node[0] = invalid; stack_near[0] = ssef(inf); stack_node[1] = bvh->root; stack_near[1] = ray_near; NodeRef* sptr_node = stack_node+2; ssef * sptr_near = stack_near+2; while (1) { /* pop next node from stack */ sptr_node--; sptr_near--; ssef curDist = *sptr_near; NodeRef curNode = *sptr_node; if (unlikely(curNode == invalid)) break; /* cull node if behind closest hit point */ const sseb active = curDist < ray_far; if (unlikely(none(active))) continue; /* switch to single ray traversal */ size_t bits = movemask(active); if (unlikely(__popcnt(bits) <= SWITCH_THRESHOLD)) { for (size_t i=__bsf(bits); bits!=0; bits=__btc(bits,i), i=__bsf(bits)) { BVH4Intersector1<TriangleIntersector1>::intersect1(bvh,curNode,i,ray,rdir); } ray_far = ray.tfar; continue; } while (1) { /* test if this is a leaf node */ if (unlikely(curNode.isLeaf())) break; const Node* const node = curNode.node(bvh->nodePtr()); //NodeRef(curNode).node(nodes); /* pop of next node */ sptr_node--; sptr_near--; curNode = *sptr_node; curDist = *sptr_near; for (unsigned i=0; i<4; i++) { const ssef dminx = (ssef(node->lower_x[i]) - ray.org.x) * rdir.x; const ssef dmaxx = (ssef(node->upper_x[i]) - ray.org.x) * rdir.x; const ssef dminy = (ssef(node->lower_y[i]) - ray.org.y) * rdir.y; const ssef dmaxy = (ssef(node->upper_y[i]) - ray.org.y) * rdir.y; const ssef dminz = (ssef(node->lower_z[i]) - ray.org.z) * rdir.z; const ssef dmaxz = (ssef(node->upper_z[i]) - ray.org.z) * rdir.z; const NodeRef child = node->child(i); const ssef dlowerx = min(dminx,dmaxx); const ssef dupperx = max(dminx,dmaxx); const ssef dlowery = min(dminy,dmaxy); const ssef duppery = max(dminy,dmaxy); const ssef dlowerz = min(dminz,dmaxz); const ssef dupperz = max(dminz,dmaxz); const ssef near = max(dlowerx,dlowery,dlowerz,ray_near); const ssef far = min(dupperx,duppery,dupperz,ray_far ); const sseb mhit = near <= far; const ssef childDist = select(mhit,near,inf); const sseb closer = childDist < curDist; /* if we hit the child we choose to continue with that child if it is closer than the current next child, or we push it onto the stack */ if (likely(any(mhit))) { //if (child != invalid) { sptr_node++; sptr_near++; /* push cur node onto stack and continue with hit child */ if (any(closer)) { *(sptr_node-1) = curNode; *(sptr_near-1) = curDist; curDist = childDist; curNode = child; } /* push hit child onto stack*/ else { *(sptr_node-1) = child; *(sptr_near-1) = childDist; } } } } } /* return if stack is empty */ if (unlikely(curNode == invalid)) break; /* decode leaf node */ size_t num; Triangle* tri = (Triangle*) curNode.leaf(bvh->triPtr(),num); /* intersect triangles */ for (size_t i=0; i<num; i++) TriangleIntersector4::intersect(valid,ray,tri[i],bvh->vertices); ray_far = ray.tfar; } }