 /*! Re-creates `num` Triangle8 blocks stored at `prim` from current mesh data.
  *  `geom` is cast to a Scene; for every occupied slot the stored geomID and
  *  primID are used to look up the TriangleMesh and its triangle, the three
  *  vertices are gathered per lane, and the block is rebuilt in place.
  *  Returns `bounds`, which the lines visible here leave at `empty`.
  *  NOTE(review): this excerpt contains no brace-only lines, so other
  *  statements (e.g. a bounds update) may be missing -- confirm against the
  *  full file. */
 BBox3fa SceneTriangle8::update(char* prim, size_t num, void* geom) const 
   BBox3fa bounds = empty;
   Scene* scene = (Scene*) geom;
   for (size_t j=0; j<num; j++) 
     Triangle8& dst = ((Triangle8*) prim)[j];
     /* per-lane defaults: IDs and mask start at -1, vertices at zero */
     avxi vgeomID = -1, vprimID = -1, vmask = -1;
     avx3f v0 = zero, v1 = zero, v2 = zero;
     for (size_t i=0; i<8; i++)
       /* a primID of -1 ends the scan of this block's slots */
       if (dst.primID[i] == -1) break;
       const unsigned geomID = dst.geomID[i];
       const unsigned primID = dst.primID[i];
       const TriangleMesh* mesh = scene->getTriangleMesh(geomID);
       const TriangleMesh::Triangle& tri = mesh->triangle(primID);
       const Vec3fa p0 = mesh->vertex(tri.v[0]);
       const Vec3fa p1 = mesh->vertex(tri.v[1]);
       const Vec3fa p2 = mesh->vertex(tri.v[2]);
       /* scatter IDs, mask and vertex coordinates into lane i */
       vgeomID [i] = geomID;
       vprimID [i] = primID;
       vmask   [i] = mesh->mask;
       v0.x[i] = p0.x; v0.y[i] = p0.y; v0.z[i] = p0.z;
       v1.x[i] = p1.x; v1.y[i] = p1.y; v1.z[i] = p1.z;
       v2.x[i] = p2.x; v2.y[i] = p2.y; v2.z[i] = p2.z;
     /* overwrite the existing block in place with the gathered data */
     new (&dst) Triangle8(v0,v1,v2,vgeomID,vprimID,vmask);
   return bounds; 
   /*! Constructor. Stores the build source and allocator, counts primitives
    *  and vertices over all groups of `geom`, samples primitive bounds with a
    *  fixed stride, constructs `pinfo`, and records for each of `numTasks`
    *  work items the group/primitive at which it starts and how many
    *  primitives it covers.
    *  NOTE(review): `numTasks` and `work` are not declared in the lines
    *  visible here, and brace-only lines are absent from this excerpt, so
    *  some statements (e.g. extending geomBound/centBound, loop exits) may
    *  be missing -- confirm against the full file. */
   PrimRefGen<Heuristic>::PrimRefGen(size_t threadIndex, size_t threadCount, const BuildSource* geom, PrimRefAlloc* alloc)
     : geom(geom), numPrimitives(0), numVertices(0), alloc(alloc)
     /* compute number of primitives */
     size_t numGroups = geom->prims(g,&vertices) is called per group below; */
     size_t numGroups = geom->groups();
     for (size_t g=0; g<numGroups; g++) {
       size_t vertices = 0;
       numPrimitives += geom->prims(g,&vertices);
       numVertices += vertices;

     /* approximate bounds */
     BBox3fa geomBound = empty, centBound = empty;
     /* dt is the sampling stride (numPrimitives/2048, at least 1);
        s counts primitives of the groups already passed and t is the global
        index of the next sample, so t-s is the index inside the group */
     size_t s = 0, t = 0, dt = max(size_t(1),numPrimitives/2048);
     for (size_t g=0; g<numGroups; g++) 
       size_t numPrims = geom->prims(g);
       for (size_t i=t-s; i<numPrims; i+=dt, t+=dt) {
         BBox3fa bounds = geom->bounds(g,i);
       s += numPrims;
     new (&pinfo) PrimInfo(numPrimitives,geomBound,centBound);

     /* compute start group and primitives */
     size_t g=0, i=0;
     for (size_t k=0; k<numTasks; k++) 
       /* task k covers the global range [k*N/T, (k+1)*N/T) */
       size_t start = (k+0)*numPrimitives/numTasks;
       size_t end   = (k+1)*numPrimitives/numTasks;
       size_t size  = end-start;
       work[k].startGroup = g;
       work[k].startPrim = i;
       work[k].numPrims = size;

       /* move the (group, primitive) cursor forward by `size` primitives */
       for (; g<numGroups; g++) 
         size_t numPrims = geom->prims(g)-i;
         if (size < numPrims) {
           i += size;
         size -= numPrims;
         i = 0;

     /* start parallel task */
    /*! Selects the best split over all bin boundaries and the three
     *  dimensions. A right-to-left sweep stores, per boundary, the merged
     *  half area and primitive count of everything to the right; a
     *  left-to-right sweep then evaluates
     *    sah = leftArea*leftBlocks + rightArea*rightBlocks
     *  for all three dimensions at once and keeps the per-dimension minimum.
     *  Counts are converted to blocks of 2^blocks_shift primitives, rounded up.
     *  Returns Split(inf,-1,0,mapping) when no dimension yields a candidate.
     *  NOTE(review): brace-only lines are absent from this excerpt and the
     *  statement guarded by mapping.invalid(dim) is not visible -- confirm
     *  against the full file. */
    __forceinline SpatialSplit::Split SpatialSplit::BinInfo::best(const PrimInfo& pinfo, const Mapping& mapping, const size_t blocks_shift)
      /* sweep from right to left and compute parallel prefix of merged bounds */
      ssef rAreas[BINS];
      ssei rCounts[BINS];
      ssei count = 0; BBox3fa bx = empty; BBox3fa by = empty; BBox3fa bz = empty;
      for (size_t i=BINS-1; i>0; i--)
	count += numEnd[i];
	rCounts[i] = count;
	bx.extend(bounds[i][0]); rAreas[i][0] = halfArea(bx);
	by.extend(bounds[i][1]); rAreas[i][1] = halfArea(by);
	bz.extend(bounds[i][2]); rAreas[i][2] = halfArea(bz);
      /* sweep from left to right and compute SAH */
      /* adding (2^blocks_shift - 1) before the shift rounds counts up to whole blocks */
      ssei blocks_add = (1 << blocks_shift)-1;
      /* ii carries the current boundary index in every lane; vbestPos starts at 0 */
      ssei ii = 1; ssef vbestSAH = pos_inf; ssei vbestPos = 0;
      count = 0; bx = empty; by = empty; bz = empty;
      for (size_t i=1; i<BINS; i++, ii+=1)
	count += numBegin[i-1];
	bx.extend(bounds[i-1][0]); float Ax = halfArea(bx);
	by.extend(bounds[i-1][1]); float Ay = halfArea(by);
	bz.extend(bounds[i-1][2]); float Az = halfArea(bz);
	/* fourth lane repeats Az; only lanes 0..2 are read further down */
	const ssef lArea = ssef(Ax,Ay,Az,Az);
	const ssef rArea = rAreas[i];
	const ssei lCount = (count     +blocks_add) >> blocks_shift;
	const ssei rCount = (rCounts[i]+blocks_add) >> blocks_shift;
	const ssef sah = lArea*ssef(lCount) + rArea*ssef(rCount);
	/* update position before the SAH so both selects use the same comparison */
	vbestPos  = select(sah < vbestSAH,ii ,vbestPos);
	vbestSAH  = select(sah < vbestSAH,sah,vbestSAH);
      /* find best dimension */
      float bestSAH = inf;
      int   bestDim = -1;
      int   bestPos = 0;
      for (size_t dim=0; dim<3; dim++) 
	/* ignore zero sized dimensions */
	if (unlikely(mapping.invalid(dim)))

	/* test if this is a better dimension */
	/* a position of 0 means no boundary was ever selected for this dimension */
	if (vbestSAH[dim] < bestSAH && vbestPos[dim] != 0) {
	  bestDim = dim;
	  bestPos = vbestPos[dim];
	  bestSAH = vbestSAH[dim];
      /* return invalid split if no split found */
      if (bestDim == -1) 
	return Split(inf,-1,0,mapping);
      /* return best found split */
      return Split(bestSAH,bestDim,bestPos,mapping);
 /*! Re-creates `num` Triangle1v primitives stored at `prim` from the current
  *  vertices of the TriangleMesh passed via `geom`. Each primitive keeps its
  *  stored geomID/primID; its vertices and mask are re-read from the mesh.
  *  Returns `bounds`, which the lines visible here leave at `empty`.
  *  NOTE(review): brace-only lines are absent from this excerpt, so a bounds
  *  update may be missing -- confirm against the full file. */
 BBox3fa TriangleMeshTriangle1v::update(char* prim, size_t num, void* geom) const 
   BBox3fa bounds = empty;
   const TriangleMesh* mesh = (const TriangleMesh*) geom;
   for (size_t j=0; j<num; j++) 
     Triangle1v& dst = ((Triangle1v*) prim)[j];
     /* read the IDs before the primitive is overwritten below */
     const unsigned geomID = dst.geomID();
     const unsigned primID = dst.primID();
     const TriangleMesh::Triangle& tri = mesh->triangle(primID);
     const Vec3fa v0 = mesh->vertex(tri.v[0]);
     const Vec3fa v1 = mesh->vertex(tri.v[1]);
     const Vec3fa v2 = mesh->vertex(tri.v[2]);
     /* rebuild the primitive in place */
     new (&dst) Triangle1v(v0,v1,v2,geomID,primID,mesh->mask);
   return bounds; 
    /*! Sequential strand split search over the curves in `prims`.
     *  axis0 is the normalized direction (p3 - p0) of the first curve; axis1
     *  is the curve direction with the smallest |cos| relative to axis0.
     *  Every curve is then assigned to the axis it is better aligned with,
     *  and its bounds are accumulated in the space derived from that axis.
     *  Returns Split(inf,axis0,axis1) if one side stays empty, otherwise a
     *  Split carrying lnum*halfArea(lbounds) + rnum*halfArea(rbounds).
     *  NOTE(review): threadIndex, threadCount and scheduler are not used in
     *  the lines visible here; brace-only lines are absent from this excerpt. */
    const StrandSplit::Split StrandSplit::find<false>(size_t threadIndex, size_t threadCount, LockStepTaskScheduler* scheduler, BezierRefList& prims)
      /* first curve determines first axis */
      BezierRefList::block_iterator_unsafe i = prims;
      Vec3fa axis0 = normalize(i->p3 - i->p0);
      /* find 2nd axis that is most misaligned with first axis */
      float bestCos = 1.0f;
      Vec3fa axis1 = axis0;
      for (; i; i++) {
	Vec3fa axisi = i->p3 - i->p0;
	float leni = length(axisi);
	/* skip zero-length curves so the division below is safe */
	if (leni == 0.0f) continue;
	axisi /= leni;
	float cos = abs(dot(axisi,axis0));
	if (cos < bestCos) { bestCos = cos; axis1 = axisi; }
      /* partition the two strands */
      size_t lnum = 0, rnum = 0;
      BBox3fa lbounds = empty, rbounds = empty;
      const LinearSpace3fa space0 = frame(axis0).transposed();
      const LinearSpace3fa space1 = frame(axis1).transposed();
      for (BezierRefList::block_iterator_unsafe i = prims; i; i++) 
	BezierPrim& prim = *i;
	const Vec3fa axisi = normalize(prim.p3-prim.p0);
	const float cos0 = abs(dot(axisi,axis0));
	const float cos1 = abs(dot(axisi,axis1));
	/* larger |cos| means better alignment; ties go to the right side */
	if (cos0 > cos1) { lnum++; lbounds.extend(prim.bounds(space0)); }
	else             { rnum++; rbounds.extend(prim.bounds(space1)); }
      /*! return an invalid split if we do not partition */
      if (lnum == 0 || rnum == 0) 
	return Split(inf,axis0,axis1);
      /*! calculate sah for the split */
      const float sah = float(lnum)*halfArea(lbounds) + float(rnum)*halfArea(rbounds);
      return Split(sah,axis0,axis1);
  /*! Re-creates Triangle1v primitives starting at `prim_i` from the current
   *  vertices of the TriangleMesh passed via `geom`. Two forms are visible:
   *  when `num` compares equal to -1 the primitives are processed until one
   *  reports last(), reading IDs through geomID<1>()/primID<1>() and keeping
   *  the `last` flag; otherwise `num` primitives are processed, reading IDs
   *  through geomID<0>()/primID<0>() and passing false as the flag.
   *  Returns `bounds`, which the lines visible here leave at `empty`.
   *  NOTE(review): brace-only lines are absent from this excerpt; no `else`
   *  between the two loops, no pointer advance in the first loop and no
   *  bounds update are visible -- confirm against the full file. */
  BBox3fa TriangleMeshTriangle1v::update(char* prim_i, size_t num, void* geom) const 
    BBox3fa bounds = empty;
    const TriangleMesh* mesh = (const TriangleMesh*) geom;
    Triangle1v* prim = (Triangle1v*) prim_i;

    /* -1 is converted to size_t here, i.e. the largest size_t value acts as a marker */
    if (num == -1)
      while (true)
	const unsigned geomID = prim->geomID<1>();
	const unsigned primID = prim->primID<1>();
	const TriangleMesh::Triangle& tri = mesh->triangle(primID);
	const Vec3fa v0 = mesh->vertex(tri.v[0]);
	const Vec3fa v1 = mesh->vertex(tri.v[1]);
	const Vec3fa v2 = mesh->vertex(tri.v[2]);
	/* read the flag before the primitive is overwritten */
	const bool last = prim->last();
	new (prim) Triangle1v(v0,v1,v2,geomID,primID,mesh->mask,last);
	if (last) break;
      for (size_t i=0; i<num; i++, prim++)
	const unsigned geomID = prim->geomID<0>();
	const unsigned primID = prim->primID<0>();
	const TriangleMesh::Triangle& tri = mesh->triangle(primID);
	const Vec3fa v0 = mesh->vertex(tri.v[0]);
	const Vec3fa v1 = mesh->vertex(tri.v[1]);
	const Vec3fa v2 = mesh->vertex(tri.v[2]);
	new (prim) Triangle1v(v0,v1,v2,geomID,primID,mesh->mask,false);
    return bounds; 
  /*! Worker task that generates primitive references for work item
   *  `taskIndex`. It constructs the task's heuristic in place, walks the
   *  primitive range recorded in work[taskIndex] (start group, start
   *  primitive, count) across group boundaries, builds a PrimRef for every
   *  primitive with non-empty bounds and inserts it into the current block,
   *  requesting a fresh block from the allocator when the insert fails.
   *  Finally stores the task's bounds and primitive count.
   *  NOTE(review): in the lines visible here geomBound, centBound and
   *  numAddedPrims are never updated and `event` is unused; brace-only lines
   *  are absent from this excerpt -- confirm against the full file. */
  void PrimRefGen<Heuristic>::task_gen_parallel(size_t threadIndex, size_t threadCount, size_t taskIndex, size_t taskCount, TaskScheduler::Event* event) 
    Heuristic& heuristic = heuristics[taskIndex];
    new (&heuristic) Heuristic(pinfo,geom);
    /* static work allocation */
    size_t g = work[taskIndex].startGroup;
    size_t i = work[taskIndex].startPrim;
    size_t numPrims = work[taskIndex].numPrims;
    /* avoid querying the group when this task has no primitives */
    size_t numGroupPrims = numPrims ? geom->prims(g) : 0;
    size_t numAddedPrims = 0;
    BBox3fa geomBound = empty, centBound = empty;
    typename atomic_set<PrimRefBlock>::item* block = prims.insert(alloc->malloc(threadIndex)); 
    for (size_t p=0; p<numPrims; p++, i++)
      /* goto next group */
      /* a loop rather than an if, so groups with zero primitives are skipped too */
      while (i == numGroupPrims) {
        g++; i = 0;
        numGroupPrims = geom->prims(g);

      const BBox3fa b = geom->bounds(g,i);
      /* primitives with empty bounds produce no reference */
      if (b.empty()) continue;
      const PrimRef prim = PrimRef(b,g,i);
      /* common case: the current block accepts the reference */
      if (likely(block->insert(prim))) continue; 
      block = prims.insert(alloc->malloc(threadIndex));
    geomBounds[taskIndex] = geomBound;
    centBounds[taskIndex] = centBound;
    work[taskIndex].numPrims = numAddedPrims;
    /*! Worker task of the parallel strand split: pulls blocks from `iter1`
     *  until it is exhausted, assigns every curve to axis0 or axis1 depending
     *  on which it is better aligned with, accumulates counts and bounds per
     *  side, and stores the results in the per-task arrays at `taskIndex`.
     *  NOTE(review): threadIndex, threadCount and taskCount are not used in
     *  the lines visible here; brace-only lines are absent from this excerpt. */
    void StrandSplit::TaskFindParallel::task_bound_parallel(size_t threadIndex, size_t threadCount, size_t taskIndex, size_t taskCount) 
      const LinearSpace3fa space0 = frame(axis0).transposed();
      const LinearSpace3fa space1 = frame(axis1).transposed();
      size_t lnum = 0; BBox3fa lbounds = empty;
      size_t rnum = 0; BBox3fa rbounds = empty;
      while (BezierRefList::item* block = iter1.next()) 
	for (size_t i=0; i<block->size(); i++) 
	  BezierPrim& prim = block->at(i);
	  const Vec3fa axisi = normalize(prim.p3-prim.p0);
	  const float cos0 = abs(dot(axisi,axis0));
	  const float cos1 = abs(dot(axisi,axis1));
	  /* larger |cos| means better alignment; ties go to the right side */
	  if (cos0 > cos1) { lnum++; lbounds.extend(prim.bounds(space0)); }
	  else             { rnum++; rbounds.extend(prim.bounds(space1)); }
      /* publish this task's partial results for the later reduction */
      task_lnum[taskIndex] = lnum; task_lbounds[taskIndex] = lbounds;
      task_rnum[taskIndex] = rnum; task_rbounds[taskIndex] = rbounds;
    /*! Parallel strand split search. axis0 is the normalized direction of the
     *  first curve; axis1 is chosen as the per-task candidate with the
     *  smallest cosine in task_cos. The per-task counts and bounds are then
     *  reduced and `split` is set to Split(inf,axis0,axis1) when one side is
     *  empty, otherwise to a Split carrying
     *    lnum*halfArea(lbounds) + rnum*halfArea(rbounds).
     *  NOTE(review): brace-only lines are absent from this excerpt and the
     *  statements that launch the parallel tasks and leave the function
     *  after the invalid split are not visible -- confirm against the full
     *  file. */
    StrandSplit::TaskFindParallel::TaskFindParallel(size_t threadIndex, size_t threadCount, LockStepTaskScheduler* scheduler, BezierRefList& prims)
      /* first curve determines first axis */
      BezierRefList::block_iterator_unsafe i = prims;
      axis0 = axis1 = normalize(i->p3 - i->p0);
      /* parallel calculation of 2nd axis  */
      size_t numTasks = min(maxTasks,threadCount);
      /* select best 2nd axis */
      float bestCos = 1.0f;
      for (size_t i=0; i<numTasks; i++) {
	if (task_cos[i] < bestCos) { bestCos = task_cos[i]; axis1 = task_axis1[i]; }
      /* parallel calculation of unaligned bounds */
      /* reduce bounds calculates by tasks */
      size_t lnum = 0; BBox3fa lbounds = empty;
      size_t rnum = 0; BBox3fa rbounds = empty;
      for (size_t i=0; i<numTasks; i++) {
	lnum += task_lnum[i]; lbounds.extend(task_lbounds[i]);
	rnum += task_rnum[i]; rbounds.extend(task_rbounds[i]);
      /*! return an invalid split if we do not partition */
      if (lnum == 0 || rnum == 0) {
	split = Split(inf,axis0,axis1); 
      /*! calculate sah for the split */
      /* each side is weighted by its own primitive count, as in StrandSplit::find */
      const float sah = float(lnum)*halfArea(lbounds) + float(rnum)*halfArea(rbounds);
      split = Split(sah,axis0,axis1);
// File: bvh4i.cpp  Project: cpaalman/embree
  /*! Recursively accumulates a surface-area cost for the subtree at `node`.
   *  The node's own term is area(bounds), or 0 for empty bounds. For an inner
   *  node the terms of all 4 children are added; for a leaf the term is
   *  multiplied by the number of items reported by node.leaf().
   *  NOTE(review): brace-only lines and the `else` separating the inner-node
   *  and leaf paths are absent from this excerpt. */
  float BVH4i::sah (NodeRef& node, const BBox3fa& bounds)
    float f = bounds.empty() ? 0.0f : area(bounds);

    if (node.isNode()) 
      Node* n = node.node(nodePtr());
      for (size_t c=0; c<4; c++) 
        f += sah(n->child(c),n->bounds(c));
      return f;
      /* leaf path: node.leaf() writes the item count into num */
      size_t num; node.leaf(triPtr(),num);
      return f*num;
  /*! Recursively accumulates a surface-area cost for the subtree at `node`.
   *  The node's own term is area(bounds), or 0 for empty bounds. For an inner
   *  node the terms of all children that are not BVH4i::invalidNode are
   *  added; for a leaf the term is multiplied by the number of items reported
   *  by node.leaf().
   *  NOTE(review): brace-only lines and the `else` separating the inner-node
   *  and leaf paths are absent from this excerpt. */
  float BVH4i::sah (NodeRef& node, BBox3fa bounds)
    float f = bounds.empty() ? 0.0f : area(bounds);

    if (node.isNode()) 
      Node* n = node.node(nodePtr());
      for (size_t c=0; c<BVH4i::N; c++) 
	/* unused child slots contribute nothing */
	if (n->child(c) != BVH4i::invalidNode)
	  f += sah(n->child(c),n->bounds(c));
      return f;
      /* leaf path: node.leaf() writes the item count into num */
      unsigned int num; node.leaf(triPtr(),num);
      return f*num;
  /*! Gathers statistics for the subtree at `node`. A is area(bounds), or 0
   *  for empty bounds. For an inner node A*BVH4::travCost is added to bvhSAH
   *  and the children are checked: once an empty child is found, any later
   *  non-empty child raises std::runtime_error("invalid node"). For a leaf,
   *  the block and primitive counters are increased and
   *  A*primTy.intCost*num is added to both bvhSAH and leafSAH.
   *  NOTE(review): brace-only lines are absent from this excerpt, the child
   *  loop header appears twice and the code that recurses and computes
   *  `depth` from `cdepth` is not visible -- confirm against the full file. */
  void BVH4Statistics::statistics(NodeRef node, const BBox3fa& bounds, size_t& depth)
    float A = bounds.empty() ? 0.0f : area(bounds);

    if (node.isNode())
      depth = 0;
      size_t cdepth = 0;
      Node* n = node.node();
      bvhSAH += A*BVH4::travCost;
      for (size_t i=0; i<BVH4::N; i++) {
      for (size_t i=0; i<BVH4::N; i++) {
        /* empty children are only allowed after all non-empty ones */
        if (n->child(i) == BVH4::emptyNode) {
          for (; i<BVH4::N; i++) {
            if (n->child(i) != BVH4::emptyNode)
              throw std::runtime_error("invalid node");
      depth = 0;
      size_t num; const char* tri = node.leaf(num);
      /* leaves without any block contribute nothing */
      if (!num) return;
      numPrimBlocks += num;
      for (size_t i=0; i<num; i++) {
        /* blocks are primTy.bytes apart; primTy.size() gives the count per block */
        numPrims += bvh->primTy.size(tri+i*bvh->primTy.bytes);
      float sah = A * bvh->primTy.intCost * num;
      bvhSAH += sah;
      leafSAH += sah;
/*! Builds a BVH over `prims` with the morton builder, two times in a row,
 *  and prints build time, throughput and the SAH of the resulting root for
 *  each iteration. `pinfo.size()` gives the primitive count. The builder is
 *  driven by callbacks for allocator access, inner-node creation, bounds
 *  assignment, leaf creation, primitive bounds lookup and progress.
 *  NOTE(review): brace-only lines and the lines that close the lambdas and
 *  the builder call are absent from this excerpt; in the lines visible here
 *  `res` in the bounds callback is returned without being extended --
 *  confirm against the full file. */
void build_morton(vector_t<PrimRef>& prims, isa::PrimInfo& pinfo)
  size_t N = pinfo.size();
  /* array for morton builder */
  vector_t<isa::MortonID32Bit> morton_src(N);
  vector_t<isa::MortonID32Bit> morton_tmp(N);
  /* entry i initially refers to primitive i */
  for (size_t i=0; i<N; i++) 
    morton_src[i].index = i;

  /* fast allocator that supports thread local operation */
  FastAllocator allocator;

  for (size_t i=0; i<2; i++)
    std::cout << "iteration " << i << ": building BVH over " << N << " primitives, " << std::flush;
    double t0 = getSeconds();

    std::pair<Node*,BBox3fa> node_bounds = isa::bvh_builder_morton<Node*>(

      /* thread local allocator for fast allocations */
      [&] () -> FastAllocator::ThreadLocal* { 
        return allocator.threadLocal(); 


      /* lambda function that allocates BVH nodes */
      /* note: the parameter N of this lambda shadows the primitive count N above */
      [&] ( isa::MortonBuildRecord<Node*>& current, isa::MortonBuildRecord<Node*>* children, size_t N, FastAllocator::ThreadLocal* alloc ) -> InnerNode*
        assert(N <= 2);
        InnerNode* node = new (alloc->malloc(sizeof(InnerNode))) InnerNode;
        /* link the new node into its parent, then hand each child its slot */
        *current.parent = node;
        for (size_t i=0; i<N; i++) 
          children[i].parent = &node->children[i];
        return node;

      /* lambda function that sets bounds */
      [&] (InnerNode* node, const BBox3fa* bounds, size_t N) -> BBox3fa
        BBox3fa res = empty;
        for (size_t i=0; i<N; i++) {
          const BBox3fa b = bounds[i];
          node->bounds[i] = b;
        return res;

      /* lambda function that creates BVH leaves */
      [&]( isa::MortonBuildRecord<Node*>& current, FastAllocator::ThreadLocal* alloc, BBox3fa& box_o) -> Node*
        /* every leaf holds exactly one primitive */
        assert(current.size() == 1);
        const size_t id = morton_src[current.begin].index;
        const BBox3fa bounds = prims[id].bounds(); // FIXME: dont use morton_src, should be input
        Node* node = new (alloc->malloc(sizeof(LeafNode))) LeafNode(id,bounds);
        *current.parent = node;
        box_o = bounds;
        return node;

      /* lambda that calculates the bounds for some primitive */
      [&] (const isa::MortonID32Bit& morton) -> BBox3fa {
        return prims[morton.index].bounds();

      /* progress monitor function */
      [&] (size_t dn) { 
        // throw an exception here to cancel the build operation


    Node* root = node_bounds.first;
    double t1 = getSeconds();

    /* time in milliseconds, throughput in million primitives per second */
    std::cout << 1000.0f*(t1-t0) << "ms, " << 1E-6*double(N)/(t1-t0) << " Mprims/s, sah = " << root->sah() << " [DONE]" << std::endl;