void BezierCurves::interpolate(unsigned primID, float u, float v, RTCBufferType buffer, float* P, float* dPdu, float* dPdv, size_t numFloats) { /* test if interpolation is enabled */ #if defined(DEBUG) if ((parent->aflags & RTC_INTERPOLATE) == 0) throw_RTCError(RTC_INVALID_OPERATION,"rtcInterpolate can only get called when RTC_INTERPOLATE is enabled for the scene"); #endif /* calculate base pointer and stride */ assert((buffer >= RTC_VERTEX_BUFFER0 && buffer <= RTC_VERTEX_BUFFER1) || (buffer >= RTC_USER_VERTEX_BUFFER0 && buffer <= RTC_USER_VERTEX_BUFFER1)); const char* src = nullptr; size_t stride = 0; if (buffer >= RTC_USER_VERTEX_BUFFER0) { src = userbuffers[buffer&0xFFFF]->getPtr(); stride = userbuffers[buffer&0xFFFF]->getStride(); } else { src = vertices[buffer&0xFFFF].getPtr(); stride = vertices[buffer&0xFFFF].getStride(); } #if !defined(__MIC__) for (size_t i=0; i<numFloats; i+=4) { size_t ofs = i*sizeof(float); const size_t curve = curves[primID]; const vfloat4 p0 = vfloat4::loadu((float*)&src[(curve+0)*stride+ofs]); const vfloat4 p1 = vfloat4::loadu((float*)&src[(curve+1)*stride+ofs]); const vfloat4 p2 = vfloat4::loadu((float*)&src[(curve+2)*stride+ofs]); const vfloat4 p3 = vfloat4::loadu((float*)&src[(curve+3)*stride+ofs]); const vbool4 valid = vint4(i)+vint4(step) < vint4(numFloats); const BezierCurveT<vfloat4> bezier(p0,p1,p2,p3,0.0f,1.0f,0); vfloat4 Q, dQdu; bezier.eval(u,Q,dQdu); if (P ) vfloat4::storeu(valid,P+i,Q); if (dPdu) vfloat4::storeu(valid,dPdu+i,dQdu); } #else for (size_t i=0; i<numFloats; i+=16) { size_t ofs = i*sizeof(float); vbool16 mask = (i+16 > numFloats) ? 
(vbool16)(((unsigned int)1 << (numFloats-i))-1) : vbool16( true ); const size_t curve = curves[primID]; const vfloat16 p0 = vfloat16::loadu(mask,(float*)&src[(curve+0)*stride+ofs]); const vfloat16 p1 = vfloat16::loadu(mask,(float*)&src[(curve+1)*stride+ofs]); const vfloat16 p2 = vfloat16::loadu(mask,(float*)&src[(curve+2)*stride+ofs]); const vfloat16 p3 = vfloat16::loadu(mask,(float*)&src[(curve+3)*stride+ofs]); const BezierCurveT<vfloat16> bezier(p0,p1,p2,p3,0.0f,1.0f,0); vfloat16 Q, dQdu; bezier.eval(u,Q,dQdu); if (P ) vfloat16::storeu_compact(mask,P+i,Q); if (dPdu) vfloat16::storeu_compact(mask,dPdu+i,dQdu); } #endif }
void QuadMesh::interpolate(const RTCInterpolateArguments* const args) { unsigned int primID = args->primID; float u = args->u; float v = args->v; RTCBufferType bufferType = args->bufferType; unsigned int bufferSlot = args->bufferSlot; float* P = args->P; float* dPdu = args->dPdu; float* dPdv = args->dPdv; float* ddPdudu = args->ddPdudu; float* ddPdvdv = args->ddPdvdv; float* ddPdudv = args->ddPdudv; unsigned int valueCount = args->valueCount; /* calculate base pointer and stride */ assert((bufferType == RTC_BUFFER_TYPE_VERTEX && bufferSlot < numTimeSteps) || (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE && bufferSlot <= vertexAttribs.size())); const char* src = nullptr; size_t stride = 0; if (bufferType == RTC_BUFFER_TYPE_VERTEX_ATTRIBUTE) { src = vertexAttribs[bufferSlot].getPtr(); stride = vertexAttribs[bufferSlot].getStride(); } else { src = vertices[bufferSlot].getPtr(); stride = vertices[bufferSlot].getStride(); } for (unsigned int i=0; i<valueCount; i+=4) { const vbool4 valid = vint4((int)i)+vint4(step) < vint4(int(valueCount)); const size_t ofs = i*sizeof(float); const Quad& tri = quad(primID); const vfloat4 p0 = vfloat4::loadu(valid,(float*)&src[tri.v[0]*stride+ofs]); const vfloat4 p1 = vfloat4::loadu(valid,(float*)&src[tri.v[1]*stride+ofs]); const vfloat4 p2 = vfloat4::loadu(valid,(float*)&src[tri.v[2]*stride+ofs]); const vfloat4 p3 = vfloat4::loadu(valid,(float*)&src[tri.v[3]*stride+ofs]); const vbool4 left = u+v <= 1.0f; const vfloat4 Q0 = select(left,p0,p2); const vfloat4 Q1 = select(left,p1,p3); const vfloat4 Q2 = select(left,p3,p1); const vfloat4 U = select(left,u,vfloat4(1.0f)-u); const vfloat4 V = select(left,v,vfloat4(1.0f)-v); const vfloat4 W = 1.0f-U-V; if (P) { vfloat4::storeu(valid,P+i,madd(W,Q0,madd(U,Q1,V*Q2))); } if (dPdu) { assert(dPdu); vfloat4::storeu(valid,dPdu+i,select(left,Q1-Q0,Q0-Q1)); assert(dPdv); vfloat4::storeu(valid,dPdv+i,select(left,Q2-Q0,Q0-Q2)); } if (ddPdudu) { assert(ddPdudu); 
vfloat4::storeu(valid,ddPdudu+i,vfloat4(zero)); assert(ddPdvdv); vfloat4::storeu(valid,ddPdvdv+i,vfloat4(zero)); assert(ddPdudv); vfloat4::storeu(valid,ddPdudv+i,vfloat4(zero)); } } }
void TriangleMesh::interpolate(unsigned primID, float u, float v, RTCBufferType buffer, float* P, float* dPdu, float* dPdv, size_t numFloats) { /* test if interpolation is enabled */ #if defined(DEBUG) if ((parent->aflags & RTC_INTERPOLATE) == 0) throw_RTCError(RTC_INVALID_OPERATION,"rtcInterpolate can only get called when RTC_INTERPOLATE is enabled for the scene"); #endif /* calculate base pointer and stride */ assert((buffer >= RTC_VERTEX_BUFFER0 && buffer <= RTC_VERTEX_BUFFER1) || (buffer >= RTC_USER_VERTEX_BUFFER0 && buffer <= RTC_USER_VERTEX_BUFFER1)); const char* src = nullptr; size_t stride = 0; if (buffer >= RTC_USER_VERTEX_BUFFER0) { src = userbuffers[buffer&0xFFFF]->getPtr(); stride = userbuffers[buffer&0xFFFF]->getStride(); } else { src = vertices[buffer&0xFFFF].getPtr(); stride = vertices[buffer&0xFFFF].getStride(); } #if !defined(__MIC__) for (size_t i=0; i<numFloats; i+=4) { size_t ofs = i*sizeof(float); const float w = 1.0f-u-v; const Triangle& tri = triangle(primID); const vfloat4 p0 = vfloat4::loadu((float*)&src[tri.v[0]*stride+ofs]); const vfloat4 p1 = vfloat4::loadu((float*)&src[tri.v[1]*stride+ofs]); const vfloat4 p2 = vfloat4::loadu((float*)&src[tri.v[2]*stride+ofs]); const vbool4 valid = vint4(i)+vint4(step) < vint4(numFloats); if (P ) vfloat4::storeu(valid,P+i,w*p0 + u*p1 + v*p2); if (dPdu) vfloat4::storeu(valid,dPdu+i,p1-p0); if (dPdv) vfloat4::storeu(valid,dPdv+i,p2-p0); } #else for (size_t i=0; i<numFloats; i+=16) { size_t ofs = i*sizeof(float); vbool16 mask = (i+16 > numFloats) ? 
(vbool16)(((unsigned int)1 << (numFloats-i))-1) : vbool16( true ); const float w = 1.0f-u-v; const Triangle& tri = triangle(primID); const vfloat16 p0 = vfloat16::loadu(mask,(float*)&src[tri.v[0]*stride+ofs]); const vfloat16 p1 = vfloat16::loadu(mask,(float*)&src[tri.v[1]*stride+ofs]); const vfloat16 p2 = vfloat16::loadu(mask,(float*)&src[tri.v[2]*stride+ofs]); if (P ) vfloat16::storeu_compact(mask,P+i,w*p0 + u*p1 + v*p2); if (dPdu) vfloat16::storeu_compact(mask,dPdu+i,p1-p0); if (dPdv) vfloat16::storeu_compact(mask,dPdv+i,p2-p0); } #endif }
void AccelN::occluded8 (const void* valid, Accel::Intersectors* This_in, RTCRay8& ray, IntersectContext* context) { AccelN* This = (AccelN*)This_in->ptr; for (size_t i=0; i<This->validAccels.size(); i++) { This->validAccels[i]->intersectors.occluded8(valid,ray,context); #if defined(__SSE2__) // FIXME: use higher ISA vbool4 valid0 = ((vbool4*)valid)[0]; vbool4 hit0 = ((vint4*)ray.geomID)[0] != vint4(0); vbool4 valid1 = ((vbool4*)valid)[1]; vbool4 hit1 = ((vint4*)ray.geomID)[1] != vint4(0); if (unlikely((none((valid0 & hit0) | (valid1 & hit1))))) break; #endif } }
void SubdivMeshAVX::interpolateN(const void* valid_i, const unsigned* primIDs, const float* u, const float* v, size_t numUVs, RTCBufferType buffer, float* P, float* dPdu, float* dPdv, float* ddPdudu, float* ddPdvdv, float* ddPdudv, size_t numFloats) { #if defined(DEBUG) if ((parent->aflags & RTC_INTERPOLATE) == 0) throw_RTCError(RTC_INVALID_OPERATION,"rtcInterpolate can only get called when RTC_INTERPOLATE is enabled for the scene"); #endif const int* valid = (const int*) valid_i; for (size_t i=0; i<numUVs;) { if (i+4 >= numUVs) { vbool4 valid1 = vint4(int(i))+vint4(step) < vint4(numUVs); if (valid) valid1 &= vint4::loadu(&valid[i]) == vint4(-1); if (none(valid1)) { i+=4; continue; } interpolateHelper(valid1,vint4::loadu(&primIDs[i]),vfloat4::loadu(&u[i]),vfloat4::loadu(&v[i]),numUVs,buffer, P ? P+i : nullptr, dPdu ? dPdu+i : nullptr, dPdv ? dPdv+i : nullptr, ddPdudu ? ddPdudu+i : nullptr, ddPdvdv ? ddPdvdv+i : nullptr, ddPdudv ? ddPdudv+i : nullptr, numFloats); i+=4; } else { vbool8 valid1 = vint8(int(i))+vint8(step) < vint8(int(numUVs)); if (valid) valid1 &= vint8::loadu(&valid[i]) == vint8(-1); if (none(valid1)) { i+=8; continue; } interpolateHelper(valid1,vint8::loadu(&primIDs[i]),vfloat8::loadu(&u[i]),vfloat8::loadu(&v[i]),numUVs,buffer, P ? P+i : nullptr, dPdu ? dPdu+i : nullptr, dPdv ? dPdv+i : nullptr, ddPdudu ? ddPdudu+i : nullptr, ddPdvdv ? ddPdvdv+i : nullptr, ddPdudv ? ddPdudv+i : nullptr, numFloats); i+=8; } } AVX_ZERO_UPPER(); }
/*! Recursively applies tree rotations below `parentRef`.
 *
 *  For every inner node the children are processed first. Then the best swap
 *  of one child (child1) with a grandchild (child2child) living under a
 *  different child (child2) is searched; it is performed only if it lowers
 *  the area term of child2 and respects BVH4::maxBuildDepth.
 *
 *  \param parentRef subtree root; barriers and leaves are left untouched
 *  \param depth     depth of parentRef, used for the depth constraint
 *  \return 0 for barriers and leaves, otherwise 1 plus the largest child depth */
size_t BVHNRotate<4>::rotate(NodeRef parentRef, size_t depth)
{
  /*! barriers and leaves have nothing to rotate */
  if (parentRef.isBarrier() || parentRef.isLeaf())
    return 0;

  Node* parent = parentRef.node();

  /*! handle the subtrees first and remember the depth each one reports */
  vint4 cdepth;
  for (size_t c=0; c<4; c++)
    cdepth[c] = (int)rotate(parent->child(c),depth+1);

  /* area term of the current child boxes */
  const vfloat4 extentX = parent->upper_x-parent->lower_x;
  const vfloat4 extentY = parent->upper_y-parent->lower_y;
  const vfloat4 extentZ = parent->upper_z-parent->lower_z;
  vfloat4 childArea = extentX*(extentY + extentZ) + extentY*extentZ;

  /*! bounds of the four children of parent (the child1 candidates) */
  BBox<vfloat4> outer[4];
  parent->bounds(outer[0],outer[1],outer[2],outer[3]);

  /*! search the best (child1, child2, child2child) combination */
  float bestArea = 0;
  size_t bestChild1 = size_t(-1);
  size_t bestChild2 = size_t(-1);
  size_t bestChild2Child = size_t(-1);

  for (size_t c2=0; c2<4; c2++)
  {
    /*! only inner nodes can act as child2, we cannot descend into the rest */
    if (parent->child(c2).isBarrier() || parent->child(c2).isLeaf())
      continue;
    Node* child2 = parent->child(c2).node();

    /*! bounds of the four children of child2 */
    BBox<vfloat4> inner[4];
    child2->bounds(inner[0],inner[1],inner[2],inner[3]);

    /*! for each child1 candidate: try it at every slot of child2 and keep
     *  the cheapest slot together with its cost */
    float minCost[4];
    int pos[4];
    for (size_t k=0; k<4; k++)
    {
      const BBox<vfloat4>& cand = outer[k];
      const float atSlot0 = halfArea3f(merge(cand,inner[1],inner[2],inner[3]));
      const float atSlot1 = halfArea3f(merge(inner[0],cand,inner[2],inner[3]));
      const float atSlot2 = halfArea3f(merge(inner[0],inner[1],cand,inner[3]));
      const float atSlot3 = halfArea3f(merge(inner[0],inner[1],inner[2],cand));
      vfloat4 cost = vfloat4(atSlot0,atSlot1,atSlot2,atSlot3);
      vfloat4 lowest = vreduce_min(cost);
      pos[k] = (int)__bsf(movemask(lowest == cost));
      minCost[k] = extract<0>(lowest);
    }

    /*! change of the area term of child2 for each child1 candidate */
    vfloat4 area0123 = vfloat4(minCost[0],minCost[1],minCost[2],minCost[3]) - vfloat4(childArea[c2]);

    const size_t mbd = BVH4::maxBuildDepth;
    vbool4 valid = vint4(int(depth+1))+cdepth <= vint4(mbd); // only select swaps that fulfill depth constraints
    valid &= vint4(c2) != vint4(step);                       // child2 cannot be swapped into itself
    if (none(valid)) continue;

    const size_t c1 = select_min(valid,area0123);
    if (c1 == c2) continue; // can happen if bounds are NANs

    /*! remember the swap if it beats the best one found so far */
    const float area = area0123[c1];
    if (area < bestArea) {
      bestArea = area;
      bestChild1 = c1;
      bestChild2 = c2;
      bestChild2Child = pos[c1];
    }
  }

  /*! no improving swap found: leave the node as it is */
  if (bestChild1 == size_t(-1))
    return 1+reduce_max(cdepth);

  /*! perform the best found tree rotation */
  Node* child2 = parent->child(bestChild2).node();
  BVH4::swap(parent,bestChild1,child2,bestChild2Child);
  parent->set(bestChild2,child2->bounds());
  BVH4::compact(parent);
  BVH4::compact(child2);

  /*! The returned depth is conservative, as the child that was pulled up
   *  in the tree could have been on the critical path. */
  cdepth[bestChild1]++; // bestChild1 was pushed down one level
  return 1+reduce_max(cdepth);
}