示例#1
0
文件: allfuns.c 项目: mahdiz/mpclib
int
main (int argc, char *argv[])
{
  MINT *a, *b, *c, *d;
  short  h;

  mp_set_memory_functions (NULL, NULL, NULL);
  a = itom (123);
  b = xtom ("DEADBEEF");
  c = itom (0);
  d = itom (0);
  move (a, b);
  madd (a, b, c);
  msub (a, b, c);
  mult (a, b, c);
  mdiv (b, a, c, d);
  sdiv (b, 2, c, &h);
  msqrt (a, c, d);
  pow (b, a, a, c);
  rpow (a, 3, c);
  gcd (a, b, c);
  mcmp (a, b);
  if (argc > 1)
    {
      min (c);
      mout (a);
    }
  mtox (b);
  mfree(a);

  exit (0);
}
示例#2
0
void
qsub(void)
{
	unsigned int *a, *ab, *b, *ba, *c;

	save();

	p2 = pop();
	p1 = pop();

	ab = mmul(p1->u.q.a, p2->u.q.b);
	ba = mmul(p1->u.q.b, p2->u.q.a);

	a = msub(ab, ba);

	mfree(ab);
	mfree(ba);

	// zero?

	if (MZERO(a)) {
		mfree(a);
		push(zero);
		restore();
		return;
	}

	b = mmul(p1->u.q.b, p2->u.q.b);

	c = mgcd(a, b);

	MSIGN(c) = MSIGN(b);

	p1 = alloc();

	p1->k = NUM;

	p1->u.q.a = mdiv(a, c);
	p1->u.q.b = mdiv(b, c);

	mfree(a);
	mfree(b);
	mfree(c);

	push(p1);

	restore();
}
示例#3
0
文件: Mgcd.c 项目: 8l/csolve
FN minvert(MINT *a, MINT *b, MINT *c)
{	MINT x, y, z, w, Anew, Aold;
	int i = 0;
	static MINT one;
	static int oneinit = 1;

	if (oneinit) {
		oneinit = 0;
		MSET(1,&one);
	}
	MINIT(&x);
	MINIT(&y);
	MINIT(&z);
	MINIT(&w);
	MINIT(&Aold);
	MSET (1,&Anew);

	mcopy(b, &x);
	mcopy(a, &y);
	/*
	 * Loop invariant:
	 *
	 * y = -1^i * Anew * a  mod b
	 */
	while(mtest(&y) != 0)
	{	mdiv(&x, &y, &w, &z);
		mcopy(&Anew, &x);
		mmult(&w, &Anew, &Anew);
		madd(&Anew, &Aold, &Anew);
		mmove(&x, &Aold);
		mmove(&y, &x);
		mmove(&z, &y);
		i++;
	}
	if (mcmp(&one,&x)) {
		mcopy(&one,c);
	} else {
		mmove(&Aold, c);
		if( (i&01) == 0) msub(b, c, c);
	}

	MFREE(&x);
	MFREE(&y);
	MFREE(&z);
	MFREE(&w);
	MFREE(&Aold);
	MFREE(&Anew);
}
示例#4
0
void newton_fd(double (*funcpt)(double *,int),double *xi,int N,double *x) {
	int it,i;
	double *jac,*hess,*L,*xf;
	double stop,result;
	
	jac = (double*) malloc(sizeof(double) *N);
	xf = (double*) malloc(sizeof(double) *N);
	hess = (double*) malloc(sizeof(double) *N * N);
	L = (double*) malloc(sizeof(double) *N * N);
	
	it = 0;
	stop = 1.0;
	
	while (stop > 1e-05 && it < 100) {
		//result = funcpt(xi,N,jac,hess);
		jacobian_fd(funcpt,xi,N,jac);
		hessian_fd(funcpt,xi,N,hess);
		//mdisplay(jac,1,N);
		//mdisplay(hess,N,N);
		modelhess2(hess,N,L);
		//mdisplay(L,N,N);
		scale(jac,1,N,-1.0);
		linsolve_lower(L,N,jac,x);
		madd(x,xi,x,1,N);
		msub(x,xi,xf,1,N);
		stop = array_max_abs(xf,N);
		for(i=0;i < N;++i) {
			xi[i] = x[i];
		}
		it++;
		//printf("result %lf \n",result);
		printf("Values obtained %lf , %lf after %d Iterations \n",x[0],x[1],it);
	}
	
	free(jac);
	free(hess);
	free(xf);
	free(L);
}
示例#5
0
文件: Msqrt.c 项目: 8l/csolve
msqrt(MINT *a, MINT *b, MINT *r)
{	MINT x,y,z;
	register alen,j;

	MINIT(&x); MINIT(&y); MINIT(&z);
	alen = a->len;

	if (alen<0) mpfatal("msqrt: neg arg");
	if (alen==0) {
		mset(0,b);
		mset(0,r);
		return(0);
	}

	if(alen & 01) x.len = (1+alen)/2;
	else x.len = 1 + alen/2;
	valloc(x.val,x.len);
	for (j=x.len; (--j)>=0;) x.val[j]=0;
	if (alen & 01) x.val[x.len-1]=0400;
	else x.val[x.len-1]=1;

	for (;;) {
		mdiv(a,&x,&y,&z);
		madd(&x,&y,&y);
		mshiftr(&y,1);
		if (mcmp(&x,&y) <= 0) break;
		mmove(&y,&x);
	}
	mcopy(&x,&y);
	mmult(&x,&x,&x);
	msub(a,&x,r);
	MFREE(&x);
	MMOVEFREE(&y,b);
	MFREE(&z);
	return(r->len);
}
    void BVH4Intersector4Chunk<PrimitiveIntersector4>::intersect(sseb* valid_i, BVH4* bvh, Ray4& ray)
    {
      /* load ray */
      const sseb valid0 = *valid_i;
      const sse3f rdir = rcp_safe(ray.dir);
      const sse3f org(ray.org), org_rdir = org * rdir;
      ssef ray_tnear = select(valid0,ray.tnear,ssef(pos_inf));
      ssef ray_tfar  = select(valid0,ray.tfar ,ssef(neg_inf));
      const ssef inf = ssef(pos_inf);
      Precalculations pre(valid0,ray);
      
      /* allocate stack and push root node */
      ssef    stack_near[stackSize];
      NodeRef stack_node[stackSize];
      stack_node[0] = BVH4::invalidNode;
      stack_near[0] = inf;
      stack_node[1] = bvh->root;
      stack_near[1] = ray_tnear; 
      NodeRef* stackEnd = stack_node+stackSize;
      NodeRef* __restrict__ sptr_node = stack_node + 2;
      ssef*    __restrict__ sptr_near = stack_near + 2;
      
      while (1)
      {
        /* pop next node from stack */
        assert(sptr_node > stack_node);
        sptr_node--;
        sptr_near--;
        NodeRef curNode = *sptr_node;
        if (unlikely(curNode == BVH4::invalidNode)) {
          assert(sptr_node == stack_node);
          break;
        }
        
        /* cull node if behind closest hit point */
        ssef curDist = *sptr_near;
        if (unlikely(none(ray_tfar > curDist))) 
          continue;
        
        while (1)
        {
          /* test if this is a leaf node */
          if (unlikely(curNode.isLeaf()))
            break;
          
          const sseb valid_node = ray_tfar > curDist;
          STAT3(normal.trav_nodes,1,popcnt(valid_node),4);
          const Node* __restrict__ const node = curNode.node();
          
          /* pop of next node */
          assert(sptr_node > stack_node);
          sptr_node--;
          sptr_near--;
          curNode = *sptr_node;
          curDist = *sptr_near;
          
#pragma unroll(4)
          for (unsigned i=0; i<BVH4::N; i++)
          {
            const NodeRef child = node->children[i];
            if (unlikely(child == BVH4::emptyNode)) break;
            
#if defined(__AVX2__)
            const ssef lclipMinX = msub(node->lower_x[i],rdir.x,org_rdir.x);
            const ssef lclipMinY = msub(node->lower_y[i],rdir.y,org_rdir.y);
            const ssef lclipMinZ = msub(node->lower_z[i],rdir.z,org_rdir.z);
            const ssef lclipMaxX = msub(node->upper_x[i],rdir.x,org_rdir.x);
            const ssef lclipMaxY = msub(node->upper_y[i],rdir.y,org_rdir.y);
            const ssef lclipMaxZ = msub(node->upper_z[i],rdir.z,org_rdir.z);
#else
            const ssef lclipMinX = (node->lower_x[i] - org.x) * rdir.x;
            const ssef lclipMinY = (node->lower_y[i] - org.y) * rdir.y;
            const ssef lclipMinZ = (node->lower_z[i] - org.z) * rdir.z;
            const ssef lclipMaxX = (node->upper_x[i] - org.x) * rdir.x;
            const ssef lclipMaxY = (node->upper_y[i] - org.y) * rdir.y;
            const ssef lclipMaxZ = (node->upper_z[i] - org.z) * rdir.z;
#endif

#if defined(__SSE4_1__)
            const ssef lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
            const ssef lfarP  = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
            const sseb lhit   = maxi(lnearP,ray_tnear) <= mini(lfarP,ray_tfar);      
#else
            const ssef lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
            const ssef lfarP  = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
            const sseb lhit   = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar);      
#endif
            
            /* if we hit the child we choose to continue with that child if it 
               is closer than the current next child, or we push it onto the stack */
            if (likely(any(lhit)))
            {
              assert(sptr_node < stackEnd);
              const ssef childDist = select(lhit,lnearP,inf);
              const NodeRef child = node->children[i];
              assert(child != BVH4::emptyNode);
              sptr_node++;
              sptr_near++;
              
              /* push cur node onto stack and continue with hit child */
              if (any(childDist < curDist))
              {
                *(sptr_node-1) = curNode;
                *(sptr_near-1) = curDist; 
                curDist = childDist;
                curNode = child;
              }
              
              /* push hit child onto stack */
              else {
                *(sptr_node-1) = child;
                *(sptr_near-1) = childDist; 
              }
            }	      
          }
        }
        
        /* return if stack is empty */
        if (unlikely(curNode == BVH4::invalidNode)) {
          assert(sptr_node == stack_node);
          break;
        }
        
        /* intersect leaf */
        const sseb valid_leaf = ray_tfar > curDist;
        STAT3(normal.trav_leaves,1,popcnt(valid_leaf),4);
        size_t items; const Primitive* prim = (Primitive*) curNode.leaf(items);
        PrimitiveIntersector4::intersect(valid_leaf,pre,ray,prim,items,bvh->geometry);
        ray_tfar = select(valid_leaf,ray.tfar,ray_tfar);
      }
      AVX_ZERO_UPPER();
    }
示例#7
0
static void test_heart() {
    printf("run test heart\n");
    assert(madd(5, 6) == 11);
    assert(msub(1024, 512) == 512);
}
    void BVH8Intersector8Hybrid<PrimitiveIntersector8>::occluded(bool8* valid_i, BVH8* bvh, Ray8& ray)
    {
      /* load ray */
      const bool8 valid = *valid_i;
      bool8 terminated = !valid;
      Vec3f8 ray_org = ray.org, ray_dir = ray.dir;
      float8 ray_tnear = ray.tnear, ray_tfar  = ray.tfar;
      const Vec3f8 rdir = rcp_safe(ray_dir);
      const Vec3f8 org(ray_org), org_rdir = org * rdir;
      ray_tnear = select(valid,ray_tnear,float8(pos_inf));
      ray_tfar  = select(valid,ray_tfar ,float8(neg_inf));
      const float8 inf = float8(pos_inf);
      Precalculations pre(valid,ray);

      /* compute near/far per ray */
      Vec3i8 nearXYZ;
      nearXYZ.x = select(rdir.x >= 0.0f,int8(0*(int)sizeof(float8)),int8(1*(int)sizeof(float8)));
      nearXYZ.y = select(rdir.y >= 0.0f,int8(2*(int)sizeof(float8)),int8(3*(int)sizeof(float8)));
      nearXYZ.z = select(rdir.z >= 0.0f,int8(4*(int)sizeof(float8)),int8(5*(int)sizeof(float8)));

      /* allocate stack and push root node */
      float8    stack_near[stackSizeChunk];
      NodeRef stack_node[stackSizeChunk];
      stack_node[0] = BVH8::invalidNode;
      stack_near[0] = inf;
      stack_node[1] = bvh->root;
      stack_near[1] = ray_tnear; 
      NodeRef* stackEnd = stack_node+stackSizeChunk;
      NodeRef* __restrict__ sptr_node = stack_node + 2;
      float8*    __restrict__ sptr_near = stack_near + 2;

      while (1)
      {
        /* pop next node from stack */
        assert(sptr_node > stack_node);
        sptr_node--;
        sptr_near--;
        NodeRef cur = *sptr_node;
        if (unlikely(cur == BVH8::invalidNode)) {
          assert(sptr_node == stack_node);
          break;
        }

        /* cull node if behind closest hit point */
        float8 curDist = *sptr_near;
        const bool8 active = curDist < ray_tfar;
        if (unlikely(none(active))) 
          continue;
        
        /* switch to single ray traversal */
#if !defined(__WIN32__) || defined(__X86_64__)
        size_t bits = movemask(active);
        if (unlikely(__popcnt(bits) <= SWITCH_THRESHOLD)) {
          for (size_t i=__bsf(bits); bits!=0; bits=__btc(bits,i), i=__bsf(bits)) {
            if (occluded1(bvh,cur,i,pre,ray,ray_org,ray_dir,rdir,ray_tnear,ray_tfar,nearXYZ))
              terminated[i] = -1;
          }
          if (all(terminated)) break;
          ray_tfar = select(terminated,float8(neg_inf),ray_tfar);
          continue;
        }
#endif
                
        while (1)
        {
          /* test if this is a leaf node */
          if (unlikely(cur.isLeaf()))
            break;
          
          const bool8 valid_node = ray_tfar > curDist;
          STAT3(shadow.trav_nodes,1,popcnt(valid_node),8);
          const Node* __restrict__ const node = (Node*)cur.node();
          
          /* pop of next node */
          assert(sptr_node > stack_node);
          sptr_node--;
          sptr_near--;
          cur = *sptr_node;
          curDist = *sptr_near;
          
          for (unsigned i=0; i<BVH8::N; i++)
          {
            const NodeRef child = node->children[i];
            if (unlikely(child == BVH8::emptyNode)) break;
            
#if defined(__AVX2__)
            const float8 lclipMinX = msub(node->lower_x[i],rdir.x,org_rdir.x);
            const float8 lclipMinY = msub(node->lower_y[i],rdir.y,org_rdir.y);
            const float8 lclipMinZ = msub(node->lower_z[i],rdir.z,org_rdir.z);
            const float8 lclipMaxX = msub(node->upper_x[i],rdir.x,org_rdir.x);
            const float8 lclipMaxY = msub(node->upper_y[i],rdir.y,org_rdir.y);
            const float8 lclipMaxZ = msub(node->upper_z[i],rdir.z,org_rdir.z);
            const float8 lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
            const float8 lfarP  = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
            const bool8 lhit   = maxi(lnearP,ray_tnear) <= mini(lfarP,ray_tfar);      
#else
            const float8 lclipMinX = (node->lower_x[i] - org.x) * rdir.x;
            const float8 lclipMinY = (node->lower_y[i] - org.y) * rdir.y;
            const float8 lclipMinZ = (node->lower_z[i] - org.z) * rdir.z;
            const float8 lclipMaxX = (node->upper_x[i] - org.x) * rdir.x;
            const float8 lclipMaxY = (node->upper_y[i] - org.y) * rdir.y;
            const float8 lclipMaxZ = (node->upper_z[i] - org.z) * rdir.z;
            const float8 lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
            const float8 lfarP  = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
            const bool8 lhit   = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar);      
#endif
            
            /* if we hit the child we choose to continue with that child if it 
               is closer than the current next child, or we push it onto the stack */
            if (likely(any(lhit)))
            {
              assert(sptr_node < stackEnd);
              assert(child != BVH8::emptyNode);
              const float8 childDist = select(lhit,lnearP,inf);
              sptr_node++;
              sptr_near++;
              
              /* push cur node onto stack and continue with hit child */
              if (any(childDist < curDist))
              {
                *(sptr_node-1) = cur;
                *(sptr_near-1) = curDist; 
                curDist = childDist;
                cur = child;
              }
              
              /* push hit child onto stack */
              else {
                *(sptr_node-1) = child;
                *(sptr_near-1) = childDist; 
              }
            }	      
          }
        }
        
        /* return if stack is empty */
        if (unlikely(cur == BVH8::invalidNode)) {
          assert(sptr_node == stack_node);
          break;
        }
        
        /* intersect leaf */
	assert(cur != BVH8::emptyNode);
        const bool8 valid_leaf = ray_tfar > curDist;
        STAT3(shadow.trav_leaves,1,popcnt(valid_leaf),8);
        size_t items; const Triangle* prim = (Triangle*) cur.leaf(items);
        terminated |= PrimitiveIntersector8::occluded(!terminated,pre,ray,prim,items,bvh->scene);
        if (all(terminated)) break;
        ray_tfar = select(terminated,float8(neg_inf),ray_tfar);
      }
      store8i(valid & terminated,&ray.geomID,0);
      AVX_ZERO_UPPER();
    }
  void BVH4iIntersector1<TriangleIntersector>::intersect(const BVH4iIntersector1* This, Ray& ray)
  {
    AVX_ZERO_UPPER();
    STAT3(normal.travs,1,1,1);
    
    /*! stack state */
    const BVH4i* bvh = This->bvh;
    StackItem stack[1+3*BVH4i::maxDepth];  //!< stack of nodes 
    StackItem* stackPtr = stack+1;        //!< current stack pointer
    stack[0].ptr  = bvh->root;
    stack[0].dist = neg_inf;

    /*! offsets to select the side that becomes the lower or upper bound */
    const size_t nearX = ray.dir.x >= 0.0f ? 0*sizeof(ssef_m) : 1*sizeof(ssef_m);
    const size_t nearY = ray.dir.y >= 0.0f ? 2*sizeof(ssef_m) : 3*sizeof(ssef_m);
    const size_t nearZ = ray.dir.z >= 0.0f ? 4*sizeof(ssef_m) : 5*sizeof(ssef_m);
   
    /*! load the ray into SIMD registers */
    const sse3f norg(-ray.org.x,-ray.org.y,-ray.org.z);
    const Vector3f ray_rdir = rcp_safe(ray.dir);
    const sse3f rdir(ray_rdir.x,ray_rdir.y,ray_rdir.z);
    const Vector3f ray_org_rdir = ray.org*ray_rdir;
    const sse3f org_rdir(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
    const ssef  rayNear(ray.tnear);
    ssef rayFar(ray.tfar);

    const void* nodePtr = bvh->nodePtr();
    const void* triPtr  = bvh->triPtr();
     
    /* pop loop */
    while (true) pop:
    {
      /*! pop next node */
      if (unlikely(stackPtr == stack)) break;
      stackPtr--;
      NodeRef cur = NodeRef(stackPtr->ptr);
      
      /*! if popped node is too far, pop next one */
      if (unlikely(stackPtr->dist > ray.tfar))
        continue;

      /* downtraversal loop */
      while (true)
      {
        /*! stop if we found a leaf */
        if (unlikely(cur.isLeaf())) break;
        STAT3(normal.trav_nodes,1,1,1);
    
        /*! single ray intersection with 4 boxes */
        const Node* node = cur.node(nodePtr);
        const size_t farX  = nearX ^ 16, farY  = nearY ^ 16, farZ  = nearZ ^ 16;
#if defined (__AVX2__)
        const ssef tNearX = msub(ssef((const char*)nodePtr+(size_t)cur+nearX), rdir.x, org_rdir.x);
        const ssef tNearY = msub(ssef((const char*)nodePtr+(size_t)cur+nearY), rdir.y, org_rdir.y);
        const ssef tNearZ = msub(ssef((const char*)nodePtr+(size_t)cur+nearZ), rdir.z, org_rdir.z);
        const ssef tFarX  = msub(ssef((const char*)nodePtr+(size_t)cur+farX ), rdir.x, org_rdir.x);
        const ssef tFarY  = msub(ssef((const char*)nodePtr+(size_t)cur+farY ), rdir.y, org_rdir.y);
        const ssef tFarZ  = msub(ssef((const char*)nodePtr+(size_t)cur+farZ ), rdir.z, org_rdir.z);
#else
        const ssef tNearX = (norg.x + ssef((const char*)nodePtr+(size_t)cur+nearX)) * rdir.x;
        const ssef tNearY = (norg.y + ssef((const char*)nodePtr+(size_t)cur+nearY)) * rdir.y;
        const ssef tNearZ = (norg.z + ssef((const char*)nodePtr+(size_t)cur+nearZ)) * rdir.z;
        const ssef tFarX  = (norg.x + ssef((const char*)nodePtr+(size_t)cur+farX )) * rdir.x;
        const ssef tFarY  = (norg.y + ssef((const char*)nodePtr+(size_t)cur+farY )) * rdir.y;
        const ssef tFarZ  = (norg.z + ssef((const char*)nodePtr+(size_t)cur+farZ )) * rdir.z;
#endif
        const ssef tNear = max(tNearX,tNearY,tNearZ,rayNear);
        const ssef tFar  = min(tFarX ,tFarY ,tFarZ ,rayFar);
    size_t mask = movemask(tNear <= tFar);
        
        /*! if no child is hit, pop next node */
        if (unlikely(mask == 0))
          goto pop;

        /*! one child is hit, continue with that child */
        size_t r = __bsf(mask); mask = __btc(mask,r);
        if (likely(mask == 0)) {
          cur = node->child(r);
          continue;
        }

        /*! two children are hit, push far child, and continue with closer child */
        NodeRef c0 = node->child(r); const float d0 = tNear[r];
        r = __bsf(mask); mask = __btc(mask,r);
        NodeRef c1 = node->child(r); const float d1 = tNear[r];
        if (likely(mask == 0)) {
          if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; continue; }
          else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; continue; }
        }

        /*! Here starts the slow path for 3 or 4 hit children. We push
         *  all nodes onto the stack to sort them there. */
        stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
        stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;

        /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
        r = __bsf(mask); mask = __btc(mask,r);
        NodeRef c = node->child(r); float d = tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
        if (likely(mask == 0)) {
          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
          continue;
        }

        /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
        r = __bsf(mask); mask = __btc(mask,r);
        c = node->child(r); d = tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
        sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
        cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
      }

      /*! this is a leaf node */
      STAT3(normal.trav_leaves,1,1,1);
      size_t num; Triangle* tri = (Triangle*) cur.leaf(triPtr,num);
      for (size_t i=0; i<num; i++)
        TriangleIntersector::intersect(ray,tri[i],bvh->vertices);
      
      rayFar = ray.tfar;
    }
    AVX_ZERO_UPPER();
  }
示例#10
0
    void BVH8Intersector8Chunk<PrimitiveIntersector8>::intersect(avxb* valid_i, BVH8* bvh, Ray8& ray)
    {
#if defined(__AVX__)
      
      /* load ray */
      const avxb valid0 = *valid_i;
      const avx3f rdir = rcp_safe(ray.dir);
      const avx3f org_rdir = ray.org * rdir;
      avxf ray_tnear = select(valid0,ray.tnear,pos_inf);
      avxf ray_tfar  = select(valid0,ray.tfar ,neg_inf);
      const avxf inf = avxf(pos_inf);
      Precalculations pre(valid0,ray);
      
      /* allocate stack and push root node */
      avxf    stack_near[3*BVH8::maxDepth+1];
      NodeRef stack_node[3*BVH8::maxDepth+1];
      stack_node[0] = BVH8::invalidNode;
      stack_near[0] = inf;
      stack_node[1] = bvh->root;
      stack_near[1] = ray_tnear; 
      NodeRef* __restrict__ sptr_node = stack_node + 2;
      avxf*    __restrict__ sptr_near = stack_near + 2;
      
      while (1)
      {
        /* pop next node from stack */
        sptr_node--;
        sptr_near--;
        NodeRef cur = *sptr_node;
        if (unlikely(cur == BVH8::invalidNode)) 
          break;
        
        /* cull node if behind closest hit point */
        avxf curDist = *sptr_near;
        if (unlikely(none(ray_tfar > curDist))) 
          continue;
        
        while (1)
        {
          /* test if this is a leaf node */
          if (unlikely(cur.isLeaf()))
            break;
          
          const avxb valid_node = ray_tfar > curDist;
          STAT3(normal.trav_nodes,1,popcnt(valid_node),8);
          const Node* __restrict__ const node = (BVH8::Node*)cur.node();
          
          /* pop of next node */
          sptr_node--;
          sptr_near--;
          cur = *sptr_node; // FIXME: this trick creates issues with stack depth
          curDist = *sptr_near;
          
          for (unsigned i=0; i<BVH8::N; i++)
          {
            const NodeRef child = node->children[i];
            if (unlikely(child == BVH8::emptyNode)) break;
            
#if defined(__AVX2__)
            const avxf lclipMinX = msub(node->lower_x[i],rdir.x,org_rdir.x);
            const avxf lclipMinY = msub(node->lower_y[i],rdir.y,org_rdir.y);
            const avxf lclipMinZ = msub(node->lower_z[i],rdir.z,org_rdir.z);
            const avxf lclipMaxX = msub(node->upper_x[i],rdir.x,org_rdir.x);
            const avxf lclipMaxY = msub(node->upper_y[i],rdir.y,org_rdir.y);
            const avxf lclipMaxZ = msub(node->upper_z[i],rdir.z,org_rdir.z);
            const avxf lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
            const avxf lfarP  = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
            const avxb lhit   = maxi(lnearP,ray_tnear) <= mini(lfarP,ray_tfar);      
#else
            const avxf lclipMinX = node->lower_x[i] * rdir.x - org_rdir.x;
            const avxf lclipMinY = node->lower_y[i] * rdir.y - org_rdir.y;
            const avxf lclipMinZ = node->lower_z[i] * rdir.z - org_rdir.z;
            const avxf lclipMaxX = node->upper_x[i] * rdir.x - org_rdir.x;
            const avxf lclipMaxY = node->upper_y[i] * rdir.y - org_rdir.y;
            const avxf lclipMaxZ = node->upper_z[i] * rdir.z - org_rdir.z;
            const avxf lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
            const avxf lfarP  = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
            const avxb lhit   = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar);      
#endif
            
            /* if we hit the child we choose to continue with that child if it 
               is closer than the current next child, or we push it onto the stack */
            if (likely(any(lhit)))
            {
              const avxf childDist = select(lhit,lnearP,inf);
              const NodeRef child = node->children[i];
              
              /* push cur node onto stack and continue with hit child */
              if (any(childDist < curDist))
              {
                *sptr_node = cur;
                *sptr_near = curDist; 
		sptr_node++;
		sptr_near++;

                curDist = childDist;
                cur = child;
              }
              
              /* push hit child onto stack*/
              else {
                *sptr_node = child;
                *sptr_near = childDist; 
		sptr_node++;
		sptr_near++;

              }
              assert(sptr_node - stack_node < BVH8::maxDepth);
            }	      
          }
        }
        
        /* return if stack is empty */
        if (unlikely(cur == BVH8::invalidNode)) 
          break;
        
        /* intersect leaf */
	assert(cur != BVH8::emptyNode);
        const avxb valid_leaf = ray_tfar > curDist;
        STAT3(normal.trav_leaves,1,popcnt(valid_leaf),8);
        size_t items; const Triangle* tri  = (Triangle*) cur.leaf(items);
        PrimitiveIntersector8::intersect(valid_leaf,pre,ray,tri,items,bvh->geometry);
        ray_tfar = select(valid_leaf,ray.tfar,ray_tfar);
      }
      AVX_ZERO_UPPER();
#endif       
    }
    void BVH4iIntersector4Chunk<TriangleIntersector4>::occluded(sseb* valid_i, BVH4i* bvh, Ray4& ray)
    {
      /* load node and primitive array */
      const Node      * __restrict__ nodes  = (Node    *)bvh->nodePtr();
      const Triangle * __restrict__ accel = (Triangle*)bvh->triPtr();
      
      /* load ray */
      const sseb valid = *valid_i;
      sseb terminated = !valid;
      const sse3f rdir = rcp_safe(ray.dir);
      const sse3f org_rdir = ray.org * rdir;
      ssef ray_tnear = select(valid,ray.tnear,pos_inf);
      ssef ray_tfar  = select(valid,ray.tfar ,neg_inf);
      const ssef inf = ssef(pos_inf);
      
      /* allocate stack and push root node */
      ssef    stack_near[3*BVH4i::maxDepth+1];
      NodeRef stack_node[3*BVH4i::maxDepth+1];
      stack_node[0] = BVH4i::invalidNode;
      stack_near[0] = inf;
      stack_node[1] = bvh->root;
      stack_near[1] = ray_tnear; 
      NodeRef* __restrict__ sptr_node = stack_node + 2;
      ssef*    __restrict__ sptr_near = stack_near + 2;
      
      while (1)
      {
        /* pop next node from stack */
        sptr_node--;
        sptr_near--;
        NodeRef curNode = *sptr_node;
        if (unlikely(curNode == BVH4i::invalidNode)) 
          break;
        
        /* cull node if behind closest hit point */
        ssef curDist = *sptr_near;
        if (unlikely(none(ray_tfar > curDist))) 
          continue;
        
        while (1)
        {
          /* test if this is a leaf node */
          if (unlikely(curNode.isLeaf()))
            break;
          
          const sseb valid_node = ray_tfar > curDist;
          STAT3(shadow.trav_nodes,1,popcnt(valid_node),4);
          const Node* __restrict__ const node = curNode.node(nodes);
          
          /* pop of next node */
          sptr_node--;
          sptr_near--;
          curNode = *sptr_node; // FIXME: this trick creates issues with stack depth
          curDist = *sptr_near;
          
#pragma unroll(4)
          for (unsigned i=0; i<4; i++)
          {
            const NodeRef child = node->children[i];
            if (unlikely(child == BVH4i::emptyNode)) break;
            
#if defined(__AVX2__)
            const ssef lclipMinX = msub(node->lower_x[i],rdir.x,org_rdir.x);
            const ssef lclipMinY = msub(node->lower_y[i],rdir.y,org_rdir.y);
            const ssef lclipMinZ = msub(node->lower_z[i],rdir.z,org_rdir.z);
            const ssef lclipMaxX = msub(node->upper_x[i],rdir.x,org_rdir.x);
            const ssef lclipMaxY = msub(node->upper_y[i],rdir.y,org_rdir.y);
            const ssef lclipMaxZ = msub(node->upper_z[i],rdir.z,org_rdir.z);
            const ssef lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
            const ssef lfarP  = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
            const sseb lhit   = maxi(lnearP,ray_tnear) <= mini(lfarP,ray_tfar);      
#else
            const ssef lclipMinX = node->lower_x[i] * rdir.x - org_rdir.x;
            const ssef lclipMinY = node->lower_y[i] * rdir.y - org_rdir.y;
            const ssef lclipMinZ = node->lower_z[i] * rdir.z - org_rdir.z;
            const ssef lclipMaxX = node->upper_x[i] * rdir.x - org_rdir.x;
            const ssef lclipMaxY = node->upper_y[i] * rdir.y - org_rdir.y;
            const ssef lclipMaxZ = node->upper_z[i] * rdir.z - org_rdir.z;
            const ssef lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
            const ssef lfarP  = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
            const sseb lhit   = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar);      
#endif
            
            /* if we hit the child we choose to continue with that child if it 
               is closer than the current next child, or we push it onto the stack */
            if (likely(any(lhit)))
            {
              const ssef childDist = select(lhit,lnearP,inf);
              sptr_node++;
              sptr_near++;
              
              /* push cur node onto stack and continue with hit child */
              if (any(childDist < curDist))
              {
                *(sptr_node-1) = curNode;
                *(sptr_near-1) = curDist; 
                curDist = childDist;
                curNode = child;
              }
              
              /* push hit child onto stack*/
              else {
                *(sptr_node-1) = child;
                *(sptr_near-1) = childDist; 
              }
              assert(sptr_node - stack_node < BVH4i::maxDepth);
            }	      
          }
        }
        
        /* return if stack is empty */
        if (unlikely(curNode == BVH4i::invalidNode)) 
          break;
        
        /* intersect leaf */
        const sseb valid_leaf = ray_tfar > curDist;
        STAT3(shadow.trav_leaves,1,popcnt(valid_leaf),4);
        size_t items; const Triangle* tri  = (Triangle*) curNode.leaf(accel, items);
        terminated |= TriangleIntersector4::occluded(!terminated,ray,tri,items,bvh->geometry);
        if (all(terminated)) break;
        ray_tfar = select(terminated,neg_inf,ray_tfar);
      }
      store4i(valid & terminated,&ray.geomID,0);
      AVX_ZERO_UPPER();
    }
    void BVH8iIntersector8Hybrid<TriangleIntersector8>::occluded(avxb* valid_i, BVH8i* bvh, Ray8& ray)
    {
      /* load ray */
      const avxb valid = *valid_i;
      avxb terminated = !valid;
      avx3f ray_org = ray.org, ray_dir = ray.dir;
      avxf ray_tnear = ray.tnear, ray_tfar  = ray.tfar;
#if defined(__FIX_RAYS__)
      const avxf float_range = 0.1f*FLT_MAX;
      ray_org = clamp(ray_org,avx3f(-float_range),avx3f(+float_range));
      ray_dir = clamp(ray_dir,avx3f(-float_range),avx3f(+float_range));
      ray_tnear = max(ray_tnear,FLT_MIN); 
      ray_tfar  = min(ray_tfar,float(inf)); 
#endif
      const avx3f rdir = rcp_safe(ray_dir);
      const avx3f org(ray_org), org_rdir = org * rdir;
      ray_tnear = select(valid,ray_tnear,avxf(pos_inf));
      ray_tfar  = select(valid,ray_tfar ,avxf(neg_inf));
      const avxf inf = avxf(pos_inf);
      
      /* compute near/far per ray */
      avx3i nearXYZ;
      nearXYZ.x = select(rdir.x >= 0.0f,avxi(0*(int)sizeof(avxf)),avxi(1*(int)sizeof(avxf)));
      nearXYZ.y = select(rdir.y >= 0.0f,avxi(2*(int)sizeof(avxf)),avxi(3*(int)sizeof(avxf)));
      nearXYZ.z = select(rdir.z >= 0.0f,avxi(4*(int)sizeof(avxf)),avxi(5*(int)sizeof(avxf)));

      /* allocate stack and push root node */
      avxf    stack_near[stackSizeChunk];
      NodeRef stack_node[stackSizeChunk];
      stack_node[0] = BVH4i::invalidNode;
      stack_near[0] = inf;
      stack_node[1] = bvh->root;
      stack_near[1] = ray_tnear; 
      NodeRef* stackEnd = stack_node+stackSizeChunk;
      NodeRef* __restrict__ sptr_node = stack_node + 2;
      avxf*    __restrict__ sptr_near = stack_near + 2;

      const Node     * __restrict__ nodes = (Node    *)bvh->nodePtr();
      const Triangle * __restrict__ accel = (Triangle*)bvh->triPtr();
      
      while (1)
      {
        /* pop next node from stack */
        assert(sptr_node > stack_node);
        sptr_node--;
        sptr_near--;
        NodeRef curNode = *sptr_node;
        if (unlikely(curNode == BVH4i::invalidNode)) {
          assert(sptr_node == stack_node);
          break;
        }

        /* cull node if behind closest hit point */
        avxf curDist = *sptr_near;
        const avxb active = curDist < ray_tfar;
        if (unlikely(none(active))) 
          continue;
        
        /* switch to single ray traversal */
#if !defined(__WIN32__) || defined(__X86_64__)
        size_t bits = movemask(active);
        if (unlikely(__popcnt(bits) <= SWITCH_THRESHOLD)) {
          for (size_t i=__bsf(bits); bits!=0; bits=__btc(bits,i), i=__bsf(bits)) {
            if (occluded1(bvh,curNode,i,ray,ray_org,ray_dir,rdir,ray_tnear,ray_tfar,nearXYZ))
              terminated[i] = -1;
          }
          if (all(terminated)) break;
          ray_tfar = select(terminated,avxf(neg_inf),ray_tfar);
          continue;
        }
#endif
                
        while (1)
        {
          /* test if this is a leaf node */
          if (unlikely(curNode.isLeaf()))
            break;
          
          const avxb valid_node = ray_tfar > curDist;
          STAT3(shadow.trav_nodes,1,popcnt(valid_node),8);
          const Node* __restrict__ const node = (Node*)curNode.node(nodes);
          
          /* pop of next node */
          assert(sptr_node > stack_node);
          sptr_node--;
          sptr_near--;
          curNode = *sptr_node;
          curDist = *sptr_near;
          
          for (unsigned i=0; i<8; i++)
          {
            const NodeRef child = node->children[i];
            if (unlikely(child == BVH4i::emptyNode)) break;
            
#if defined(__AVX2__)
            const avxf lclipMinX = msub(node->lower_x[i],rdir.x,org_rdir.x);
            const avxf lclipMinY = msub(node->lower_y[i],rdir.y,org_rdir.y);
            const avxf lclipMinZ = msub(node->lower_z[i],rdir.z,org_rdir.z);
            const avxf lclipMaxX = msub(node->upper_x[i],rdir.x,org_rdir.x);
            const avxf lclipMaxY = msub(node->upper_y[i],rdir.y,org_rdir.y);
            const avxf lclipMaxZ = msub(node->upper_z[i],rdir.z,org_rdir.z);
            const avxf lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
            const avxf lfarP  = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
            const avxb lhit   = maxi(lnearP,ray_tnear) <= mini(lfarP,ray_tfar);      
#else
            const avxf lclipMinX = (node->lower_x[i] - org.x) * rdir.x;
            const avxf lclipMinY = (node->lower_y[i] - org.y) * rdir.y;
            const avxf lclipMinZ = (node->lower_z[i] - org.z) * rdir.z;
            const avxf lclipMaxX = (node->upper_x[i] - org.x) * rdir.x;
            const avxf lclipMaxY = (node->upper_y[i] - org.y) * rdir.y;
            const avxf lclipMaxZ = (node->upper_z[i] - org.z) * rdir.z;
            const avxf lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
            const avxf lfarP  = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
            const avxb lhit   = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar);      
#endif
            
            /* if we hit the child we choose to continue with that child if it 
               is closer than the current next child, or we push it onto the stack */
            if (likely(any(lhit)))
            {
              assert(sptr_node < stackEnd);
              assert(child != BVH4i::emptyNode);
              const avxf childDist = select(lhit,lnearP,inf);
              sptr_node++;
              sptr_near++;
              
              /* push cur node onto stack and continue with hit child */
              if (any(childDist < curDist))
              {
                *(sptr_node-1) = curNode;
                *(sptr_near-1) = curDist; 
                curDist = childDist;
                curNode = child;
              }
              
              /* push hit child onto stack */
              else {
                *(sptr_node-1) = child;
                *(sptr_near-1) = childDist; 
              }
            }	      
          }
        }
        
        /* return if stack is empty */
        if (unlikely(curNode == BVH4i::invalidNode)) {
          assert(sptr_node == stack_node);
          break;
        }
        
        /* intersect leaf */
        const avxb valid_leaf = ray_tfar > curDist;
        STAT3(shadow.trav_leaves,1,popcnt(valid_leaf),8);
        size_t items; const Triangle* prim = (Triangle*) curNode.leaf(accel,items);
        terminated |= TriangleIntersector8::occluded(!terminated,ray,prim,items,bvh->geometry);
        if (all(terminated)) break;
        ray_tfar = select(terminated,avxf(neg_inf),ray_tfar);
      }
      store8i(valid & terminated,&ray.geomID,0);
      AVX_ZERO_UPPER();
    }
示例#13
0
static int
mprimef(unsigned int *n, unsigned int *q, int k)
{
	int i, j;
	unsigned int *t, *x, *y;

	// generate x

	t = mcopy(n);

	while (1) {
		for (i = 0; i < MLENGTH(t); i++)
			t[i] = rand();
		x = mmod(t, n);
		if (!MZERO(x) && !MEQUAL(x, 1))
			break;
		mfree(x);
	}

	mfree(t);

	// exponentiate

	y = mmodpow(x, q, n);

	// done?

	if (MEQUAL(y, 1)) {
		mfree(x);
		mfree(y);
		return 1;
	}

	j = 0;

	while (1) {

		// y = n - 1?

		t = msub(n, y);

		if (MEQUAL(t, 1)) {
			mfree(t);
			mfree(x);
			mfree(y);
			return 1;
		}

		mfree(t);

		if (++j == k) {
			mfree(x);
			mfree(y);
			return 0;
		}

		// y = (y ^ 2) mod n

		t = mmul(y, y);
		mfree(y);
		y = mmod(t, n);
		mfree(t);

		// y = 1?

		if (MEQUAL(y, 1)) {
			mfree(x);
			mfree(y);
			return 0;
		}
	}
}
    __forceinline bool BVH4Intersector4Hybrid<PrimitiveIntersector4>::occluded1(const BVH4* bvh, NodeRef root, size_t k, Ray4& ray, 
                                                                                const sse3f& ray_org, const sse3f& ray_dir, const sse3f& ray_rdir, 
                                                                                const ssef& ray_tnear, const ssef& ray_tfar)
    {
      /*! stack state */
      NodeRef stack[stackSizeSingle];  //!< stack of nodes that still need to get traversed
      NodeRef* stackPtr = stack+1;        //!< current stack pointer
      NodeRef* stackEnd = stack+stackSizeSingle;
      stack[0]  = root;
      
      /*! offsets to select the side that becomes the lower or upper bound */
      const size_t nearX = ray_dir.x[k] >= 0.0f ? 0*sizeof(ssef) : 1*sizeof(ssef);
      const size_t nearY = ray_dir.y[k] >= 0.0f ? 2*sizeof(ssef) : 3*sizeof(ssef);
      const size_t nearZ = ray_dir.z[k] >= 0.0f ? 4*sizeof(ssef) : 5*sizeof(ssef);
      
      /*! load the ray into SIMD registers */
      const sse3f org (ray_org .x[k],ray_org .y[k],ray_org .z[k]);
      const sse3f rdir(ray_rdir.x[k],ray_rdir.y[k],ray_rdir.z[k]);
      const sse3f norg = -org, org_rdir(org*rdir);
      const ssef rayNear(ray_tnear[k]), rayFar(ray_tfar[k]); 
      
      /* pop loop */
      while (true) pop:
      {
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        stackPtr--;
        NodeRef cur = (NodeRef) *stackPtr;
        
        /* downtraversal loop */
        while (true)
        {
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          STAT3(shadow.trav_nodes,1,1,1);
          
          /*! single ray intersection with 4 boxes */
          const Node* node = cur.node();
          const size_t farX  = nearX ^ 16, farY  = nearY ^ 16, farZ  = nearZ ^ 16;
#if defined (__AVX2__)
          const ssef tNearX = msub(load4f((const char*)node+nearX), rdir.x, org_rdir.x);
          const ssef tNearY = msub(load4f((const char*)node+nearY), rdir.y, org_rdir.y);
          const ssef tNearZ = msub(load4f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const ssef tFarX  = msub(load4f((const char*)node+farX ), rdir.x, org_rdir.x);
          const ssef tFarY  = msub(load4f((const char*)node+farY ), rdir.y, org_rdir.y);
          const ssef tFarZ  = msub(load4f((const char*)node+farZ ), rdir.z, org_rdir.z);
#else
          const ssef tNearX = (norg.x + load4f((const char*)node+nearX)) * rdir.x;
          const ssef tNearY = (norg.y + load4f((const char*)node+nearY)) * rdir.y;
          const ssef tNearZ = (norg.z + load4f((const char*)node+nearZ)) * rdir.z;
          const ssef tFarX  = (norg.x + load4f((const char*)node+farX )) * rdir.x;
          const ssef tFarY  = (norg.y + load4f((const char*)node+farY )) * rdir.y;
          const ssef tFarZ  = (norg.z + load4f((const char*)node+farZ )) * rdir.z;
#endif
          
#if defined(__SSE4_1__)
          const ssef tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,rayNear));
          const ssef tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,rayFar ));
          const sseb vmask = cast(tNear) > cast(tFar);
          size_t mask = movemask(vmask)^0xf;
#else
          const ssef tNear = max(tNearX,tNearY,tNearZ,rayNear);
          const ssef tFar  = min(tFarX ,tFarY ,tFarZ ,rayFar);
          const sseb vmask = tNear <= tFar;
          size_t mask = movemask(vmask);
#endif
          
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          
          /*! one child is hit, continue with that child */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r);
            assert(cur != BVH4::emptyNode);
            continue;
          }
          
          /*! two children are hit, push far child, and continue with closer child */
          NodeRef c0 = node->child(r); const float d0 = tNear[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); const float d1 = tNear[r];
          assert(c0 != BVH4::emptyNode);
          assert(c1 != BVH4::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd);
            if (d0 < d1) { *stackPtr = c1; stackPtr++; cur = c0; continue; }
            else         { *stackPtr = c0; stackPtr++; cur = c1; continue; }
          }
          assert(stackPtr < stackEnd);
          *stackPtr = c0; stackPtr++;
          assert(stackPtr < stackEnd);
          *stackPtr = c1; stackPtr++;
          
          /*! three children are hit */
          r = __bscf(mask);
          cur = node->child(r); 
          assert(cur != BVH4::emptyNode);
          if (likely(mask == 0)) continue;
          assert(stackPtr < stackEnd);
          *stackPtr = cur; stackPtr++;
          
          /*! four children are hit */
          cur = node->child(3);
          assert(cur != BVH4::emptyNode);
        }
        
        /*! this is a leaf node */
        STAT3(shadow.trav_leaves,1,1,1);
        size_t num; Primitive* prim = (Primitive*) cur.leaf(num);
        if (PrimitiveIntersector4::occluded(ray,k,prim,num,bvh->geometry)) {
          ray.geomID[k] = 0;
          return true;
        }
      }
      return false;
    }
    __forceinline void BVH4Intersector4Hybrid<PrimitiveIntersector4>::intersect1(const BVH4* bvh, NodeRef root, size_t k, Ray4& ray, 
                                                                                 const sse3f& ray_org, const sse3f& ray_dir, const sse3f& ray_rdir, 
                                                                                 const ssef& ray_tnear, const ssef& ray_tfar)
    {
      /*! stack state */
      StackItem stack[stackSizeSingle];  //!< stack of nodes 
      StackItem* stackPtr = stack+1;        //!< current stack pointer
      StackItem* stackEnd = stack+stackSizeSingle;
      stack[0].ptr = root;
      stack[0].dist = neg_inf;
      
      /*! offsets to select the side that becomes the lower or upper bound */
      const size_t nearX = ray_dir.x[k] >= 0.0f ? 0*sizeof(ssef) : 1*sizeof(ssef);
      const size_t nearY = ray_dir.y[k] >= 0.0f ? 2*sizeof(ssef) : 3*sizeof(ssef);
      const size_t nearZ = ray_dir.z[k] >= 0.0f ? 4*sizeof(ssef) : 5*sizeof(ssef);
      
      /*! load the ray into SIMD registers */
      const sse3f org (ray_org .x[k],ray_org .y[k],ray_org .z[k]);
      const sse3f rdir(ray_rdir.x[k],ray_rdir.y[k],ray_rdir.z[k]);
      const sse3f norg = -org, org_rdir(org*rdir);
      ssef rayNear(ray_tnear[k]), rayFar(ray_tfar[k]); 
      
      /* pop loop */
      while (true) pop:
      {
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        stackPtr--;
        NodeRef cur = NodeRef(stackPtr->ptr);
        
        /*! if popped node is too far, pop next one */
        if (unlikely(stackPtr->dist > ray.tfar[k]))
          continue;
        
        /* downtraversal loop */
        while (true)
        {
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          STAT3(normal.trav_nodes,1,1,1);
          
          /*! single ray intersection with 4 boxes */
          const Node* node = cur.node();
          const size_t farX  = nearX ^ 16, farY  = nearY ^ 16, farZ  = nearZ ^ 16;
#if defined (__AVX2__)
          const ssef tNearX = msub(load4f((const char*)node+nearX), rdir.x, org_rdir.x);
          const ssef tNearY = msub(load4f((const char*)node+nearY), rdir.y, org_rdir.y);
          const ssef tNearZ = msub(load4f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const ssef tFarX  = msub(load4f((const char*)node+farX ), rdir.x, org_rdir.x);
          const ssef tFarY  = msub(load4f((const char*)node+farY ), rdir.y, org_rdir.y);
          const ssef tFarZ  = msub(load4f((const char*)node+farZ ), rdir.z, org_rdir.z);
#else
          const ssef tNearX = (norg.x + load4f((const char*)node+nearX)) * rdir.x;
          const ssef tNearY = (norg.y + load4f((const char*)node+nearY)) * rdir.y;
          const ssef tNearZ = (norg.z + load4f((const char*)node+nearZ)) * rdir.z;
          const ssef tFarX  = (norg.x + load4f((const char*)node+farX )) * rdir.x;
          const ssef tFarY  = (norg.y + load4f((const char*)node+farY )) * rdir.y;
          const ssef tFarZ  = (norg.z + load4f((const char*)node+farZ )) * rdir.z;
#endif

#if defined(__SSE4_1__)
          const ssef tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,rayNear));
          const ssef tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,rayFar ));
          const sseb vmask = cast(tNear) > cast(tFar);
          size_t mask = movemask(vmask)^0xf;
#else
          const ssef tNear = max(tNearX,tNearY,tNearZ,rayNear);
          const ssef tFar  = min(tFarX ,tFarY ,tFarZ ,rayFar);
          const sseb vmask = tNear <= tFar;
          size_t mask = movemask(vmask);
#endif
          
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          
          /*! one child is hit, continue with that child */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r);
            assert(cur != BVH4::emptyNode);
            continue;
          }
          
          /*! two children are hit, push far child, and continue with closer child */
          NodeRef c0 = node->child(r); const float d0 = tNear[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); const float d1 = tNear[r];
          assert(c0 != BVH4::emptyNode);
          assert(c1 != BVH4::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd); 
            if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; continue; }
            else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; continue; }
          }
          
          /*! Here starts the slow path for 3 or 4 hit children. We push
           *  all nodes onto the stack to sort them there. */
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
          
          /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          NodeRef c = node->child(r); float d = tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          assert(c != BVH4::emptyNode);
          if (likely(mask == 0)) {
            sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
            cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
            continue;
          }
          
          /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          c = node->child(r); d = tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          assert(c != BVH4::emptyNode);
          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
        }
        
        /*! this is a leaf node */
        STAT3(normal.trav_leaves,1,1,1);
        size_t num; Primitive* prim = (Primitive*) cur.leaf(num);
        PrimitiveIntersector4::intersect(ray,k,prim,num,bvh->geometry);
        rayFar = ray.tfar[k];
      }
    }
    void BVH4Intersector4Hybrid<PrimitiveIntersector4>::intersect(sseb* valid_i, BVH4* bvh, Ray4& ray)
    {
      /* load ray */
      const sseb valid0 = *valid_i;
      sse3f ray_org = ray.org, ray_dir = ray.dir;
      ssef ray_tnear = ray.tnear, ray_tfar  = ray.tfar;
#if defined(__FIX_RAYS__)
      const ssef float_range = 0.1f*FLT_MAX;
      ray_org = clamp(ray_org,sse3f(-float_range),sse3f(+float_range));
      ray_dir = clamp(ray_dir,sse3f(-float_range),sse3f(+float_range));
      ray_tnear = max(ray_tnear,FLT_MIN); 
      ray_tfar  = min(ray_tfar,float(inf)); 
#endif
      const sse3f rdir = rcp_safe(ray_dir);
      const sse3f org(ray_org), org_rdir = org * rdir;
      ray_tnear = select(valid0,ray_tnear,ssef(pos_inf));
      ray_tfar  = select(valid0,ray_tfar ,ssef(neg_inf));
      const ssef inf = ssef(pos_inf);
      
      /* allocate stack and push root node */
      ssef    stack_near[stackSizeChunk]; 
      NodeRef stack_node[stackSizeChunk];
      stack_node[0] = BVH4::invalidNode;
      stack_near[0] = inf;
      stack_node[1] = bvh->root;
      stack_near[1] = ray_tnear; 
      NodeRef* stackEnd = stack_node+stackSizeChunk;
      NodeRef* __restrict__ sptr_node = stack_node + 2;
      ssef*    __restrict__ sptr_near = stack_near + 2;
      
      while (1)
      {
        /* pop next node from stack */
        assert(sptr_node > stack_node);
        sptr_node--;
        sptr_near--;
        NodeRef curNode = *sptr_node;
        if (unlikely(curNode == BVH4::invalidNode)) {
          assert(sptr_node == stack_node);
          break;
        }
        
        /* cull node if behind closest hit point */
        ssef curDist = *sptr_near;
        const sseb active = curDist < ray_tfar;
        if (unlikely(none(active))) 
          continue;
        
        /* switch to single ray traversal */
#if !defined(__WIN32__) || defined(__X86_64__)
        size_t bits = movemask(active);
        if (unlikely(__popcnt(bits) <= SWITCH_THRESHOLD)) {
          for (size_t i=__bsf(bits); bits!=0; bits=__btc(bits,i), i=__bsf(bits)) {
            intersect1(bvh,curNode,i,ray,ray_org,ray_dir,rdir,ray_tnear,ray_tfar);
          }
          ray_tfar = ray.tfar;
          continue;
        }
#endif
        
        while (1)
        {
          /* test if this is a leaf node */
          if (unlikely(curNode.isLeaf()))
            break;
          
          const sseb valid_node = ray_tfar > curDist;
          STAT3(normal.trav_nodes,1,popcnt(valid_node),4);
          const Node* __restrict__ const node = curNode.node();
          
          /* pop of next node */
          assert(sptr_node > stack_node);
          sptr_node--;
          sptr_near--;
          curNode = *sptr_node; 
          curDist = *sptr_near;
          
#pragma unroll(4)
          for (unsigned i=0; i<4; i++)
          {
            const NodeRef child = node->children[i];
            if (unlikely(child == BVH4::emptyNode)) break;
            
#if defined(__AVX2__)
            const ssef lclipMinX = msub(node->lower_x[i],rdir.x,org_rdir.x);
            const ssef lclipMinY = msub(node->lower_y[i],rdir.y,org_rdir.y);
            const ssef lclipMinZ = msub(node->lower_z[i],rdir.z,org_rdir.z);
            const ssef lclipMaxX = msub(node->upper_x[i],rdir.x,org_rdir.x);
            const ssef lclipMaxY = msub(node->upper_y[i],rdir.y,org_rdir.y);
            const ssef lclipMaxZ = msub(node->upper_z[i],rdir.z,org_rdir.z);
#else
            const ssef lclipMinX = (node->lower_x[i] - org.x) * rdir.x;
            const ssef lclipMinY = (node->lower_y[i] - org.y) * rdir.y;
            const ssef lclipMinZ = (node->lower_z[i] - org.z) * rdir.z;
            const ssef lclipMaxX = (node->upper_x[i] - org.x) * rdir.x;
            const ssef lclipMaxY = (node->upper_y[i] - org.y) * rdir.y;
            const ssef lclipMaxZ = (node->upper_z[i] - org.z) * rdir.z;
#endif
    
#if defined(__SSE4_1__)
            const ssef lnearP = maxi(maxi(mini(lclipMinX, lclipMaxX), mini(lclipMinY, lclipMaxY)), mini(lclipMinZ, lclipMaxZ));
            const ssef lfarP  = mini(mini(maxi(lclipMinX, lclipMaxX), maxi(lclipMinY, lclipMaxY)), maxi(lclipMinZ, lclipMaxZ));
            const sseb lhit   = maxi(lnearP,ray_tnear) <= mini(lfarP,ray_tfar);      
#else
            const ssef lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
            const ssef lfarP  = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
            const sseb lhit   = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar);      
#endif
        
            /* if we hit the child we choose to continue with that child if it 
               is closer than the current next child, or we push it onto the stack */
            if (likely(any(lhit)))
            {
              assert(sptr_node < stackEnd);
              const ssef childDist = select(lhit,lnearP,inf);
              const NodeRef child = node->children[i];
              assert(child != BVH4::emptyNode);
              sptr_node++;
              sptr_near++;

              /* push cur node onto stack and continue with hit child */
              if (any(childDist < curDist))
              {
                *(sptr_node-1) = curNode;
                *(sptr_near-1) = curDist; 
                curDist = childDist;
                curNode = child;
              }
              
              /* push hit child onto stack */
              else {
                *(sptr_node-1) = child;
                *(sptr_near-1) = childDist; 
              }
            }	      
          }
        }
        
        /* return if stack is empty */
        if (unlikely(curNode == BVH4::invalidNode)) {
          assert(sptr_node == stack_node);
          break;
        }
        
        /* intersect leaf */
        const sseb valid_leaf = ray_tfar > curDist;
        STAT3(normal.trav_leaves,1,popcnt(valid_leaf),4);
        size_t items; const Primitive* prim = (Primitive*) curNode.leaf(items);
        PrimitiveIntersector4::intersect(valid_leaf,ray,prim,items,bvh->geometry);
        ray_tfar = select(valid_leaf,ray.tfar,ray_tfar);
      }
      AVX_ZERO_UPPER();
    }
示例#17
0
  __forceinline void intersectT(const BVH4* bvh, Ray& ray)
  {
    typedef typename TriangleIntersector::Triangle Triangle;
    typedef StackItemT<size_t> StackItem;
    typedef typename BVH4::NodeRef NodeRef;
    typedef typename BVH4::Node Node;

    /*! stack state */
    StackItem stack[1+3*BVH4::maxDepth];  //!< stack of nodes 
    StackItem* stackPtr = stack+1;        //!< current stack pointer
    stack[0].ptr  = bvh->root;
    stack[0].dist = neg_inf;

    /*! load the ray into SIMD registers */
    const avxf pos_neg = avxf(ssef(+0.0f),ssef(-0.0f));
    const avxf neg_pos = avxf(ssef(-0.0f),ssef(+0.0f));
    const avxf flipSignX = swapX ? neg_pos : pos_neg;
    const avxf flipSignY = swapY ? neg_pos : pos_neg;
    const avxf flipSignZ = swapZ ? neg_pos : pos_neg;
    const Vector3f ray_rdir = rcp_safe(ray.dir);
    const avx3f norg(-ray.org.x,-ray.org.y,-ray.org.z);
    const avx3f rdir(ray_rdir.x^flipSignX,ray_rdir.y^flipSignY,ray_rdir.z^flipSignZ);
    const avx3f org_rdir(avx3f(ray.org.x,ray.org.y,ray.org.z)*rdir);
    avxf rayNearFar(ssef(ray.tnear),-ssef(ray.tfar));

    const void* nodePtr = bvh->nodePtr();
    const void* triPtr  = bvh->triPtr();
     
    /* pop loop */
    while (true) pop:
    {
      /*! pop next node */
      if (unlikely(stackPtr == stack)) break;
      stackPtr--;
      NodeRef cur = NodeRef(stackPtr->ptr);
      
      /*! if popped node is too far, pop next one */
      if (unlikely(stackPtr->dist > ray.tfar))
        continue;

      /* downtraversal loop */
      while (true)
      {
        /*! stop if we found a leaf */
        if (unlikely(cur.isLeaf())) break;
        STAT3(normal.trav_nodes,1,1,1);

        /*! single ray intersection with 4 boxes */
        const Node* node = cur.node(nodePtr);

#if defined (__AVX2__) || defined(__MIC__)
        const avxf tLowerUpperX = msub(avxf::load(&node->lower_x), rdir.x, org_rdir.x);
        const avxf tLowerUpperY = msub(avxf::load(&node->lower_y), rdir.y, org_rdir.y);
        const avxf tLowerUpperZ = msub(avxf::load(&node->lower_z), rdir.z, org_rdir.z);
#else
        const avxf tLowerUpperX = (norg.x + avxf::load(&node->lower_x)) * rdir.x;
        const avxf tLowerUpperY = (norg.y + avxf::load(&node->lower_y)) * rdir.y;
        const avxf tLowerUpperZ = (norg.z + avxf::load(&node->lower_z)) * rdir.z;
#endif
        const avxf tNearFarX = swapX ? shuffle<1,0>(tLowerUpperX) : tLowerUpperX;
        const avxf tNearFarY = swapY ? shuffle<1,0>(tLowerUpperY) : tLowerUpperY;
        const avxf tNearFarZ = swapZ ? shuffle<1,0>(tLowerUpperZ) : tLowerUpperZ;
        const avxf tNearFar = max(tNearFarX,tNearFarY,tNearFarZ,rayNearFar);
        const ssef tNear = extract<0>(tNearFar);
        const ssef tFar  = extract<1>(tNearFar);
        size_t mask = movemask(-tNear >= tFar);
                
        /*! if no child is hit, pop next node */
        if (unlikely(mask == 0))
          goto pop;

        /*! one child is hit, continue with that child */
        size_t r = __bsf(mask); mask = __btc(mask,r);
        if (likely(mask == 0)) {
          cur = node->child(r);
          continue;
        }

        /*! two children are hit, push far child, and continue with closer child */
        NodeRef c0 = node->child(r); const float d0 = tNear[r];
        r = __bsf(mask); mask = __btc(mask,r);
        NodeRef c1 = node->child(r); const float d1 = tNear[r];
        if (likely(mask == 0)) {
          if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; continue; }
          else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; continue; }
        }

        /*! Here starts the slow path for 3 or 4 hit children. We push
         *  all nodes onto the stack to sort them there. */
        stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
        stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;

        /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
        r = __bsf(mask); mask = __btc(mask,r);
        NodeRef c = node->child(r); float d = tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
        if (likely(mask == 0)) {
          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
          continue;
        }

        /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
        r = __bsf(mask); mask = __btc(mask,r);
        c = node->child(r); d = tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
        sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
        cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
      }

      /*! this is a leaf node */
      STAT3(normal.trav_leaves,1,1,1);
      size_t num; Triangle* tri = (Triangle*) cur.leaf(triPtr,num);
      for (size_t i=0; i<num; i++)
        TriangleIntersector::intersect(ray,tri[i],bvh->vertices);
      
      rayNearFar = insert<1>(rayNearFar,-ssef(ray.tfar));
    }
  }
示例#18
0
  __forceinline bool occludedT(const BVH4* bvh, Ray& ray)
  {
    typedef typename TriangleIntersector::Triangle Triangle;
    typedef StackItemT<size_t> StackItem;
    typedef typename BVH4::NodeRef NodeRef;
    typedef typename BVH4::Node Node;

    /*! stack state */
    NodeRef stack[1+3*BVH4::maxDepth];  //!< stack of nodes that still need to get traversed
    NodeRef* stackPtr = stack+1;        //!< current stack pointer
    stack[0]  = bvh->root;
  
    /*! load the ray into SIMD registers */
    const avxf pos_neg = avxf(ssef(+0.0f),ssef(-0.0f));
    const avxf neg_pos = avxf(ssef(-0.0f),ssef(+0.0f));
    const avxf flipSignX = swapX ? neg_pos : pos_neg;
    const avxf flipSignY = swapY ? neg_pos : pos_neg;
    const avxf flipSignZ = swapZ ? neg_pos : pos_neg;
    const avx3f norg(-ray.org.x,-ray.org.y,-ray.org.z);
    const Vector3f ray_rdir = rcp_safe(ray.dir);
    const avx3f rdir(ray_rdir.x^flipSignX,ray_rdir.y^flipSignY,ray_rdir.z^flipSignZ);
    const avx3f org_rdir(avx3f(ray.org.x,ray.org.y,ray.org.z)*rdir);
    const avxf rayNearFar(ssef(ray.tnear),-ssef(ray.tfar));

    const void* nodePtr = bvh->nodePtr();
    const void* triPtr  = bvh->triPtr();
    
    /* pop loop */
    while (true) pop:
    {
      /*! pop next node */
      if (unlikely(stackPtr == stack)) break;
      stackPtr--;
      NodeRef cur = (NodeRef) *stackPtr;
      
      /* downtraversal loop */
      while (true)
      {
        /*! stop if we found a leaf */
        if (unlikely(cur.isLeaf())) break;
        STAT3(shadow.trav_nodes,1,1,1);
	
        /*! single ray intersection with 4 boxes */
        const Node* node = cur.node(nodePtr);
       
#if defined (__AVX2__) || defined(__MIC__)
        const avxf tLowerUpperX = msub(avxf::load(&node->lower_x), rdir.x, org_rdir.x);
        const avxf tLowerUpperY = msub(avxf::load(&node->lower_y), rdir.y, org_rdir.y);
        const avxf tLowerUpperZ = msub(avxf::load(&node->lower_z), rdir.z, org_rdir.z);
#else
        const avxf tLowerUpperX = (norg.x + avxf::load(&node->lower_x)) * rdir.x;
        const avxf tLowerUpperY = (norg.y + avxf::load(&node->lower_y)) * rdir.y;
        const avxf tLowerUpperZ = (norg.z + avxf::load(&node->lower_z)) * rdir.z;
#endif
        const avxf tNearFarX = swapX ? shuffle<1,0>(tLowerUpperX) : tLowerUpperX;
        const avxf tNearFarY = swapY ? shuffle<1,0>(tLowerUpperY) : tLowerUpperY;
        const avxf tNearFarZ = swapZ ? shuffle<1,0>(tLowerUpperZ) : tLowerUpperZ;
        const avxf tNearFar = max(tNearFarX,tNearFarY,tNearFarZ,rayNearFar);
        const ssef tNear = extract<0>(tNearFar);
        const ssef tFar  = extract<1>(tNearFar);
        size_t mask = movemask(-tNear >= tFar);
        
        /*! if no child is hit, pop next node */
        if (unlikely(mask == 0))
          goto pop;

        /*! one child is hit, continue with that child */
        size_t r = __bsf(mask); mask = __btc(mask,r);
        if (likely(mask == 0)) {
          cur = node->child(r);
          continue;
        }

        /*! two children are hit, push far child, and continue with closer child */
        NodeRef c0 = node->child(r); const float d0 = tNear[r];
        r = __bsf(mask); mask = __btc(mask,r);
        NodeRef c1 = node->child(r); const float d1 = tNear[r];
        if (likely(mask == 0)) {
          if (d0 < d1) { *stackPtr = c1; stackPtr++; cur = c0; continue; }
          else         { *stackPtr = c0; stackPtr++; cur = c1; continue; }
        }
        *stackPtr = c0; stackPtr++;
        *stackPtr = c1; stackPtr++;

        /*! three children are hit */
        r = __bsf(mask); mask = __btc(mask,r);
        cur = node->child(r); *stackPtr = cur; stackPtr++;
        if (likely(mask == 0)) {
          stackPtr--;
          continue;
        }

        /*! four children are hit */
        cur = node->child(3);
      }

      /*! this is a leaf node */
      STAT3(shadow.trav_leaves,1,1,1);
      size_t num; Triangle* tri = (Triangle*) cur.leaf(triPtr,num);
      for (size_t i=0; i<num; i++) {
        if (TriangleIntersector::occluded(ray,tri[i],bvh->vertices)) {
          AVX_ZERO_UPPER();
          return true;
        }
      }
    }
   
    AVX_ZERO_UPPER();
    return false;
  }
示例#19
0
unsigned int *
mfactor(unsigned int *n)
{
	unsigned int *r, *root, *t, *two, *x, *y;

	two = mint(2);

	root = msqrt(n);

	// y = 1;

	y = mint(1);

	// x = 2 isqrt(n) + 1

	t = madd(root, root);
	x = madd(t, y);
	mfree(t);

	// r = isqrt(n) ^ 2 - n

	t = mmul(root, root);
	r = msub(t, n);
	mfree(t);

	mfree(root);

	while (1) {

		if (MZERO(r)) {

			// n = (x - y) / 2

			t = msub(x, y);
			n = mdiv(t, two);
			mfree(t);

			mfree(r);
			mfree(x);
			mfree(y);
			mfree(two);

			return n;
		}

		// r = r + x

		t = madd(r, x);
		mfree(r);
		r = t;

		// x = x + 2

		t = madd(x, two);
		mfree(x);
		x = t;

		while (1) {

			// r = r - y

			t = msub(r, y);
			mfree(r);
			r = t;

			// y = y + 2

			t = madd(y, two);
			mfree(y);
			y = t;

			if (MSIGN(r) == -1 || MZERO(r))
				break;
		}
	}
}
    __forceinline void BVH8iIntersector8Hybrid<TriangleIntersector8>::intersect1(const BVH8i* bvh, NodeRef root, const size_t k, Ray8& ray,const avx3f &ray_org, const avx3f &ray_dir, const avx3f &ray_rdir, const avxf &ray_tnear, const avxf &ray_tfar, const avx3i& nearXYZ)
    {
      /*! stack state */
      StackItemInt64 stack[stackSizeSingle];  //!< stack of nodes 
      StackItemInt64* stackPtr = stack+1;        //!< current stack pointer
      StackItemInt64* stackEnd = stack+stackSizeSingle;
      stack[0].ptr = root;
      stack[0].dist = neg_inf;
      
      /*! offsets to select the side that becomes the lower or upper bound */
      const size_t nearX = nearXYZ.x[k];
      const size_t nearY = nearXYZ.y[k];
      const size_t nearZ = nearXYZ.z[k];

      /*! load the ray into SIMD registers */
      const avx3f org (ray_org .x[k],ray_org .y[k],ray_org .z[k]);
      const avx3f rdir(ray_rdir.x[k],ray_rdir.y[k],ray_rdir.z[k]);
      const avx3f org_rdir(org*rdir);
      avxf rayNear(ray_tnear[k]), rayFar(ray_tfar[k]);
     
      const Node     * __restrict__ nodes = (Node    *)bvh->nodePtr();
      const Triangle * __restrict__ accel = (Triangle*)bvh->triPtr();
 
      /* pop loop */
      while (true) pop:
      {
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        stackPtr--;
        NodeRef cur = NodeRef(stackPtr->ptr);
        
        /*! if popped node is too far, pop next one */
        if (unlikely(*(float*)&stackPtr->dist > ray.tfar[k]))
          continue;
        
        /* downtraversal loop */
        while (true)
        {
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          STAT3(normal.trav_nodes,1,1,1);
          
          /*! single ray intersection with 4 boxes */
          const Node* node = (Node*)cur.node(nodes);
          const size_t farX  = nearX ^ sizeof(avxf), farY  = nearY ^ sizeof(avxf), farZ  = nearZ ^ sizeof(avxf);
#if defined (__AVX2__)
          const avxf tNearX = msub(load8f((const char*)node+nearX), rdir.x, org_rdir.x);
          const avxf tNearY = msub(load8f((const char*)node+nearY), rdir.y, org_rdir.y);
          const avxf tNearZ = msub(load8f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const avxf tFarX  = msub(load8f((const char*)node+farX ), rdir.x, org_rdir.x);
          const avxf tFarY  = msub(load8f((const char*)node+farY ), rdir.y, org_rdir.y);
          const avxf tFarZ  = msub(load8f((const char*)node+farZ ), rdir.z, org_rdir.z);
#else
          const avxf tNearX = (load8f((const char*)node+nearX) - org.x) * rdir.x;
          const avxf tNearY = (load8f((const char*)node+nearY) - org.y) * rdir.y;
          const avxf tNearZ = (load8f((const char*)node+nearZ) - org.z) * rdir.z;
          const avxf tFarX  = (load8f((const char*)node+farX ) - org.x) * rdir.x;
          const avxf tFarY  = (load8f((const char*)node+farY ) - org.y) * rdir.y;
          const avxf tFarZ  = (load8f((const char*)node+farZ ) - org.z) * rdir.z;
#endif

#if defined(__AVX2__)
          const avxf tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,rayNear));
          const avxf tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,rayFar ));
          const avxb vmask = cast(tNear) > cast(tFar);
          unsigned int mask = movemask(vmask)^0xff;
#else
          const avxf tNear = max(tNearX,tNearY,tNearZ,rayNear);
          const avxf tFar  = min(tFarX ,tFarY ,tFarZ ,rayFar);
          const avxb vmask = tNear <= tFar;
          unsigned int mask = movemask(vmask);
#endif
          
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          
          /*! one child is hit, continue with that child */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r);
            assert(cur != BVH4i::emptyNode);
            continue;
          }
          
          /*! two children are hit, push far child, and continue with closer child */
          NodeRef c0 = node->child(r); const unsigned int d0 = ((unsigned int*)&tNear)[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); const unsigned int d1 = ((unsigned int*)&tNear)[r];
          assert(c0 != BVH4i::emptyNode);
          assert(c1 != BVH4i::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd); 
            if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; continue; }
            else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; continue; }
          }
          
          /*! Here starts the slow path for 3 or 4 hit children. We push
           *  all nodes onto the stack to sort them there. */
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
          
          /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          NodeRef c = node->child(r); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          assert(c0 != BVH4i::emptyNode);
          if (likely(mask == 0)) {
            sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
            cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
            continue;
          }
          
          /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          c = node->child(r); d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          assert(c != BVH4i::emptyNode);
	  if (likely(mask == 0)) {
	    sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
	    cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
	    continue;
	  }

	  while(1)
	    {
	      r = __bscf(mask);
	      c = node->child(r); d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
	      if (unlikely(mask == 0)) break;
	    }
	  cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
	  
        }
        
        /*! this is a leaf node */
        STAT3(normal.trav_leaves,1,1,1);
        size_t num; Triangle* prim = (Triangle*) cur.leaf(accel,num);
        TriangleIntersector8::intersect(ray,k,prim,num,bvh->geometry);
        rayFar = ray.tfar[k];
      }
    }
    __forceinline bool BVH8iIntersector8Hybrid<TriangleIntersector8>::occluded1(const BVH8i* bvh, NodeRef root, const size_t k, Ray8& ray,const avx3f &ray_org, const avx3f &ray_dir, const avx3f &ray_rdir, const avxf &ray_tnear, const avxf &ray_tfar, const avx3i& nearXYZ)
    {
      /*! stack state */
      NodeRef stack[stackSizeSingle];  //!< stack of nodes that still need to get traversed
      NodeRef* stackPtr = stack+1;        //!< current stack pointer
      NodeRef* stackEnd = stack+stackSizeSingle;
      stack[0]  = root;
      
      /*! offsets to select the side that becomes the lower or upper bound */
      const size_t nearX = nearXYZ.x[k];
      const size_t nearY = nearXYZ.y[k];
      const size_t nearZ = nearXYZ.z[k];
      
      /*! load the ray into SIMD registers */
      const avx3f org (ray_org .x[k],ray_org .y[k],ray_org .z[k]);
      const avx3f rdir(ray_rdir.x[k],ray_rdir.y[k],ray_rdir.z[k]);
      const avx3f norg = -org, org_rdir(org*rdir);
      const avxf rayNear(ray_tnear[k]), rayFar(ray_tfar[k]); 

      const Node     * __restrict__ nodes = (Node    *)bvh->nodePtr();
      const Triangle * __restrict__ accel = (Triangle*)bvh->triPtr();
      
      /* pop loop */
      while (true) pop:
      {
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        stackPtr--;
        NodeRef cur = (NodeRef) *stackPtr;
        
        /* downtraversal loop */
        while (true)
        {
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          STAT3(shadow.trav_nodes,1,1,1);
          
          /*! single ray intersection with 4 boxes */
          const Node* node = (Node*)cur.node(nodes);
          const size_t farX  = nearX ^ sizeof(avxf), farY  = nearY ^ sizeof(avxf), farZ  = nearZ ^ sizeof(avxf);
#if defined (__AVX2__)
          const avxf tNearX = msub(load8f((const char*)node+nearX), rdir.x, org_rdir.x);
          const avxf tNearY = msub(load8f((const char*)node+nearY), rdir.y, org_rdir.y);
          const avxf tNearZ = msub(load8f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const avxf tFarX  = msub(load8f((const char*)node+farX ), rdir.x, org_rdir.x);
          const avxf tFarY  = msub(load8f((const char*)node+farY ), rdir.y, org_rdir.y);
          const avxf tFarZ  = msub(load8f((const char*)node+farZ ), rdir.z, org_rdir.z);
#else
          const avxf tNearX = (norg.x + load8f((const char*)node+nearX)) * rdir.x;
          const avxf tNearY = (norg.y + load8f((const char*)node+nearY)) * rdir.y;
          const avxf tNearZ = (norg.z + load8f((const char*)node+nearZ)) * rdir.z;
          const avxf tFarX  = (norg.x + load8f((const char*)node+farX )) * rdir.x;
          const avxf tFarY  = (norg.y + load8f((const char*)node+farY )) * rdir.y;
          const avxf tFarZ  = (norg.z + load8f((const char*)node+farZ )) * rdir.z;
#endif
          
#if defined(__AVX2__)
          const avxf tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,rayNear));
          const avxf tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,rayFar ));
          const avxb vmask = cast(tNear) > cast(tFar);
          unsigned int mask = movemask(vmask)^0xff;
#else
          const avxf tNear = max(tNearX,tNearY,tNearZ,rayNear);
          const avxf tFar  = min(tFarX ,tFarY ,tFarZ ,rayFar);
          const avxb vmask = tNear <= tFar;
          unsigned int mask = movemask(vmask);
#endif
          
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          
          /*! one child is hit, continue with that child */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r);
            assert(cur != BVH4i::emptyNode);
            continue;
          }
          
          /*! two children are hit, push far child, and continue with closer child */
          NodeRef c0 = node->child(r); const unsigned int d0 = ((unsigned int*)&tNear)[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); const unsigned int d1 = ((unsigned int*)&tNear)[r];
          assert(c0 != BVH4i::emptyNode);
          assert(c1 != BVH4i::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd);
            if (d0 < d1) { *stackPtr = c1; stackPtr++; cur = c0; continue; }
            else         { *stackPtr = c0; stackPtr++; cur = c1; continue; }
          }
          assert(stackPtr < stackEnd);
          *stackPtr = c0; stackPtr++;
          assert(stackPtr < stackEnd);
          *stackPtr = c1; stackPtr++;
          
          /*! three children are hit */
          r = __bscf(mask);
          cur = node->child(r); 
          assert(cur != BVH4i::emptyNode);
          if (likely(mask == 0)) continue;

	  while(1)
	    {
	      r = __bscf(mask);
	      NodeRef c = node->child(r); *stackPtr = c; stackPtr++;
	      if (unlikely(mask == 0)) break;
	    }
	  cur = (NodeRef) stackPtr[-1]; stackPtr--;

          // assert(stackPtr < stackEnd);
          // *stackPtr = cur; stackPtr++;
          
          // /*! four children are hit */
          // cur = node->child(3);
          // assert(cur != BVH4i::emptyNode);
        }
        
        /*! this is a leaf node */
        STAT3(shadow.trav_leaves,1,1,1);
        size_t num; Triangle* prim = (Triangle*) cur.leaf(accel,num);
        if (TriangleIntersector8::occluded(ray,k,prim,num,bvh->geometry)) {
          ray.geomID[k] = 0;
          return true;
        }
      }
      return false;
    }
    void BVH4iIntersector16Hybrid<LeafIntersector,ENABLE_COMPRESSED_BVH4I_NODES>::intersect(mic_i* valid_i, BVH4i* bvh, Ray16& ray16)
    {
      /* near and node stack */
      __aligned(64) mic_f   stack_dist[3*BVH4i::maxDepth+1];
      __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1];
      __aligned(64) NodeRef stack_node_single[3*BVH4i::maxDepth+1]; 

      /* load ray */
      const mic_m valid0     = *(mic_i*)valid_i != mic_i(0);
      const mic3f rdir16     = rcp_safe(ray16.dir);
      const mic3f org_rdir16 = ray16.org * rdir16;
      mic_f ray_tnear        = select(valid0,ray16.tnear,pos_inf);
      mic_f ray_tfar         = select(valid0,ray16.tfar ,neg_inf);
      const mic_f inf        = mic_f(pos_inf);
      
      /* allocate stack and push root node */
      stack_node[0] = BVH4i::invalidNode;
      stack_dist[0] = inf;
      stack_node[1] = bvh->root;
      stack_dist[1] = ray_tnear; 
      NodeRef* __restrict__ sptr_node = stack_node + 2;
      mic_f*   __restrict__ sptr_dist = stack_dist + 2;
      
      const Node      * __restrict__ nodes = (Node     *)bvh->nodePtr();
      const Triangle1 * __restrict__ accel = (Triangle1*)bvh->triPtr();

      while (1) pop:
      {
        /* pop next node from stack */
        NodeRef curNode = *(sptr_node-1);
        mic_f curDist   = *(sptr_dist-1);
        sptr_node--;
        sptr_dist--;
	const mic_m m_stackDist = ray_tfar > curDist;

	/* stack emppty ? */
        if (unlikely(curNode == BVH4i::invalidNode))  break;
        
        /* cull node if behind closest hit point */
        if (unlikely(none(m_stackDist))) continue;
        
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////

	/* switch to single ray mode */
        if (unlikely(countbits(m_stackDist) <= BVH4i::hybridSIMDUtilSwitchThreshold)) 
	  {
	    float   *__restrict__ stack_dist_single = (float*)sptr_dist;
	    store16f(stack_dist_single,inf);

	    /* traverse single ray */	  	  
	    long rayIndex = -1;
	    while((rayIndex = bitscan64(rayIndex,m_stackDist)) != BITSCAN_NO_BIT_SET_64) 
	      {	    
		stack_node_single[0] = BVH4i::invalidNode;
		stack_node_single[1] = curNode;
		size_t sindex = 2;

		const mic_f org_xyz      = loadAOS4to16f(rayIndex,ray16.org.x,ray16.org.y,ray16.org.z);
		const mic_f dir_xyz      = loadAOS4to16f(rayIndex,ray16.dir.x,ray16.dir.y,ray16.dir.z);
		const mic_f rdir_xyz     = loadAOS4to16f(rayIndex,rdir16.x,rdir16.y,rdir16.z);
		const mic_f org_rdir_xyz = org_xyz * rdir_xyz;
		const mic_f min_dist_xyz = broadcast1to16f(&ray16.tnear[rayIndex]);
		mic_f       max_dist_xyz = broadcast1to16f(&ray16.tfar[rayIndex]);

		const unsigned int leaf_mask = BVH4I_LEAF_MASK;

		while (1) 
		  {
		    NodeRef curNode = stack_node_single[sindex-1];
		    sindex--;
            
		    traverse_single_intersect<ENABLE_COMPRESSED_BVH4I_NODES>(curNode,
									     sindex,
									     rdir_xyz,
									     org_rdir_xyz,
									     min_dist_xyz,
									     max_dist_xyz,
									     stack_node_single,
									     stack_dist_single,
									     nodes,
									     leaf_mask);
	    

		    /* return if stack is empty */
		    if (unlikely(curNode == BVH4i::invalidNode)) break;


		    /* intersect one ray against four triangles */

		    const bool hit = LeafIntersector::intersect(curNode,
								rayIndex,
								dir_xyz,
								org_xyz,
								min_dist_xyz,
								max_dist_xyz,
								ray16,
								accel,
								(Scene*)bvh->geometry);

		    if (hit)
		      compactStack(stack_node_single,stack_dist_single,sindex,max_dist_xyz);
		  }
	      }
	    ray_tfar = select(valid0,ray16.tfar ,neg_inf);
	    continue;
	  }

	///////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////

	const unsigned int leaf_mask = BVH4I_LEAF_MASK;

	const mic3f org = ray16.org;
	const mic3f dir = ray16.dir;

        while (1)
        {
          /* test if this is a leaf node */
          if (unlikely(curNode.isLeaf(leaf_mask))) break;
          
          STAT3(normal.trav_nodes,1,popcnt(ray_tfar > curDist),16);
          const Node* __restrict__ const node = curNode.node(nodes);

	  prefetch<PFHINT_L1>((mic_f*)node + 0); 
	  prefetch<PFHINT_L1>((mic_f*)node + 1); 
          
          /* pop of next node */
          sptr_node--;
          sptr_dist--;
          curNode = *sptr_node; 
          curDist = *sptr_dist;
          

#pragma unroll(4)
          for (unsigned int i=0; i<4; i++)
          {
	    BVH4i::NodeRef child;
	    mic_f lclipMinX,lclipMinY,lclipMinZ;
	    mic_f lclipMaxX,lclipMaxY,lclipMaxZ;

	    if (!ENABLE_COMPRESSED_BVH4I_NODES)
	      {
		child = node->lower[i].child;

		lclipMinX = msub(node->lower[i].x,rdir16.x,org_rdir16.x);
		lclipMinY = msub(node->lower[i].y,rdir16.y,org_rdir16.y);
		lclipMinZ = msub(node->lower[i].z,rdir16.z,org_rdir16.z);
		lclipMaxX = msub(node->upper[i].x,rdir16.x,org_rdir16.x);
		lclipMaxY = msub(node->upper[i].y,rdir16.y,org_rdir16.y);
		lclipMaxZ = msub(node->upper[i].z,rdir16.z,org_rdir16.z);
	      }
	    else
	      {
		BVH4i::QuantizedNode* __restrict__ const compressed_node = (BVH4i::QuantizedNode*)node;
		child = compressed_node->child(i);

		const mic_f startXYZ = compressed_node->decompress_startXYZ();
		const mic_f diffXYZ  = compressed_node->decompress_diffXYZ();
		const mic_f clower   = compressed_node->decompress_lowerXYZ(startXYZ,diffXYZ);
		const mic_f cupper   = compressed_node->decompress_upperXYZ(startXYZ,diffXYZ);

		lclipMinX = msub(mic_f(clower[4*i+0]),rdir16.x,org_rdir16.x);
		lclipMinY = msub(mic_f(clower[4*i+1]),rdir16.y,org_rdir16.y);
		lclipMinZ = msub(mic_f(clower[4*i+2]),rdir16.z,org_rdir16.z);
		lclipMaxX = msub(mic_f(cupper[4*i+0]),rdir16.x,org_rdir16.x);
		lclipMaxY = msub(mic_f(cupper[4*i+1]),rdir16.y,org_rdir16.y);
		lclipMaxZ = msub(mic_f(cupper[4*i+2]),rdir16.z,org_rdir16.z);		
	      }
	    
	    if (unlikely(i >=2 && child == BVH4i::invalidNode)) break;

            const mic_f lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
            const mic_f lfarP  = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
            const mic_m lhit   = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar);   
	    const mic_f childDist = select(lhit,lnearP,inf);
            const mic_m m_child_dist = childDist < curDist;


            /* if we hit the child we choose to continue with that child if it 
               is closer than the current next child, or we push it onto the stack */
            if (likely(any(lhit)))
            {
              sptr_node++;
              sptr_dist++;

              /* push cur node onto stack and continue with hit child */
              if (any(m_child_dist))
              {
                *(sptr_node-1) = curNode;
                *(sptr_dist-1) = curDist; 
                curDist = childDist;
                curNode = child;
              }              
              /* push hit child onto stack*/
              else 
		{
		  *(sptr_node-1) = child;
		  *(sptr_dist-1) = childDist; 

		  const char* __restrict__ const pnode = (char*)child.node(nodes);             
		  prefetch<PFHINT_L2>(pnode + 0);
		  prefetch<PFHINT_L2>(pnode + 64);
		}
              assert(sptr_node - stack_node < BVH4i::maxDepth);
            }	      
          }
#if SWITCH_ON_DOWN_TRAVERSAL == 1
	  const mic_m curUtil = ray_tfar > curDist;
	  if (unlikely(countbits(curUtil) <= BVH4i::hybridSIMDUtilSwitchThreshold))
	    {
	      *sptr_node++ = curNode;
	      *sptr_dist++ = curDist; 
	      goto pop;
	    }
#endif
        }
        
        /* return if stack is empty */
        if (unlikely(curNode == BVH4i::invalidNode)) break;
        

        /* intersect leaf */
        const mic_m m_valid_leaf = ray_tfar > curDist;
        STAT3(normal.trav_leaves,1,popcnt(m_valid_leaf),16);

	LeafIntersector::intersect16(curNode,m_valid_leaf,dir,org,ray16,accel,(Scene*)bvh->geometry);

        ray_tfar = select(m_valid_leaf,ray16.tfar,ray_tfar);
      }
示例#23
0
    void BVH8Intersector1<robust,PrimitiveIntersector>::occluded(const BVH8* bvh, Ray& ray)
    {
      /*! perform per ray precalculations required by the primitive intersector */
      Precalculations pre(ray,bvh);

      /*! stack state */
      NodeRef stack[stackSize];  //!< stack of nodes that still need to get traversed
      NodeRef* stackPtr = stack+1;        //!< current stack pointer
      NodeRef* stackEnd = stack+stackSize;
      stack[0] = bvh->root;
        
      /* filter out invalid rays */
#if defined(RTCORE_IGNORE_INVALID_RAYS)
      if (!ray.valid()) return;
#endif

      /* verify correct input */
      assert(ray.tnear > -FLT_MIN);
      //assert(!(types & BVH4::FLAG_NODE_MB) || (ray.time >= 0.0f && ray.time <= 1.0f));

      /*! load the ray into SIMD registers */
      const Vec3f8 norg(-ray.org.x,-ray.org.y,-ray.org.z);
      const Vec3fa ray_rdir = rcp_safe(ray.dir);
      const Vec3f8 rdir(ray_rdir.x,ray_rdir.y,ray_rdir.z);
      const Vec3fa ray_org_rdir = ray.org*ray_rdir;
      const Vec3f8 org_rdir(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
      const float8  ray_near(ray.tnear);
      float8 ray_far(ray.tfar);
      
      /*! offsets to select the side that becomes the lower or upper bound */
      const size_t nearX = ray_rdir.x >= 0 ? 0*sizeof(float8) : 1*sizeof(float8);
      const size_t nearY = ray_rdir.y >= 0 ? 2*sizeof(float8) : 3*sizeof(float8);
      const size_t nearZ = ray_rdir.z >= 0 ? 4*sizeof(float8) : 5*sizeof(float8);

      /* pop loop */
      while (true) pop:
      {
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        stackPtr--;
        NodeRef cur = (NodeRef) *stackPtr;
        
        /* downtraversal loop */
        while (true)
        {
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          STAT3(shadow.trav_nodes,1,1,1);
          
          /*! single ray intersection with 4 boxes */
          const Node* node = cur.node();
          const size_t farX  = nearX ^ sizeof(float8), farY  = nearY ^ sizeof(float8), farZ  = nearZ ^ sizeof(float8);
#if defined (__AVX2__)
          const float8 tNearX = msub(load8f((const char*)node+nearX), rdir.x, org_rdir.x);
          const float8 tNearY = msub(load8f((const char*)node+nearY), rdir.y, org_rdir.y);
          const float8 tNearZ = msub(load8f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const float8 tFarX  = msub(load8f((const char*)node+farX ), rdir.x, org_rdir.x);
          const float8 tFarY  = msub(load8f((const char*)node+farY ), rdir.y, org_rdir.y);
          const float8 tFarZ  = msub(load8f((const char*)node+farZ ), rdir.z, org_rdir.z);
#else
          const float8 tNearX = (norg.x + load8f((const char*)node+nearX)) * rdir.x;
          const float8 tNearY = (norg.y + load8f((const char*)node+nearY)) * rdir.y;
          const float8 tNearZ = (norg.z + load8f((const char*)node+nearZ)) * rdir.z;
          const float8 tFarX  = (norg.x + load8f((const char*)node+farX )) * rdir.x;
          const float8 tFarY  = (norg.y + load8f((const char*)node+farY )) * rdir.y;
          const float8 tFarZ  = (norg.z + load8f((const char*)node+farZ )) * rdir.z;
#endif
          
#if defined(__AVX2__)
          const float8 tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray_near));
          const float8 tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray_far ));
          const bool8 vmask = cast(tNear) > cast(tFar);
          size_t mask = movemask(vmask)^0xff;
#else
          const float8 tNear = max(tNearX,tNearY,tNearZ,ray_near);
          const float8 tFar  = min(tFarX ,tFarY ,tFarZ ,ray_far);
          const bool8 vmask = tNear <= tFar;
          size_t mask = movemask(vmask);
#endif
          
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          
          /*! one child is hit, continue with that child */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r); cur.prefetch(); 
            assert(cur != BVH8::emptyNode);
            continue;
          }
          
          /*! two children are hit, push far child, and continue with closer child */
          NodeRef c0 = node->child(r); c0.prefetch(); const unsigned int d0 = ((unsigned int*)&tNear)[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); c1.prefetch(); const unsigned int d1 = ((unsigned int*)&tNear)[r];
          assert(c0 != BVH8::emptyNode);
          assert(c1 != BVH8::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd);
            if (d0 < d1) { *stackPtr = c1; stackPtr++; cur = c0; continue; }
            else         { *stackPtr = c0; stackPtr++; cur = c1; continue; }
          }
          assert(stackPtr < stackEnd);
          *stackPtr = c0; stackPtr++;
          assert(stackPtr < stackEnd);
          *stackPtr = c1; stackPtr++;
          
	  /*! three children are hit */
          r = __bscf(mask);
          cur = node->child(r); cur.prefetch(); *stackPtr = cur; stackPtr++;
          if (likely(mask == 0)) {
            stackPtr--;
            continue;
          }

	  /*! process more than three children */
	  while(1)
	  {
	    r = __bscf(mask);
	    NodeRef c = node->child(r); c.prefetch(); *stackPtr = c; stackPtr++;
	    if (unlikely(mask == 0)) break;
	  }
	  cur = (NodeRef) stackPtr[-1]; stackPtr--;
        }
        
        /*! this is a leaf node */
	assert(cur != BVH8::emptyNode);
        STAT3(shadow.trav_leaves,1,1,1);
        size_t num; Primitive* prim = (Primitive*) cur.leaf(num);
        size_t lazy_node = 0;
        if (PrimitiveIntersector::occluded(pre,ray,prim,num,bvh->scene,lazy_node)) {
          ray.geomID = 0;
          break;
        }

        if (unlikely(lazy_node)) {
          *stackPtr = (NodeRef)lazy_node;
          stackPtr++;
        }
      }
      AVX_ZERO_UPPER();
    }
示例#24
0
unsigned int *
mgcd(unsigned int *u, unsigned int *v)
{
	int i, k, n;
	unsigned int *t;

	if (MZERO(u)) {
		t = mcopy(v);
		MSIGN(t) = 1;
		return t;
	}

	if (MZERO(v)) {
		t = mcopy(u);
		MSIGN(t) = 1;
		return t;
	}

	u = mcopy(u);
	v = mcopy(v);

	MSIGN(u) = 1;
	MSIGN(v) = 1;

	k = 0;

	while ((u[0] & 1) == 0 && (v[0] & 1) == 0) {
		mshiftright(u);
		mshiftright(v);
		k++;
	}

	if (u[0] & 1) {
		t = mcopy(v);
		MSIGN(t) *= -1;
	} else
		t = mcopy(u);

	while (1) {

		while ((t[0] & 1) == 0)
			mshiftright(t);

		if (MSIGN(t) == 1) {
			mfree(u);
			u = mcopy(t);
		} else {
			mfree(v);
			v = mcopy(t);
			MSIGN(v) *= -1;
		}

		mfree(t);

		t = msub(u, v);

		if (MZERO(t)) {
			mfree(t);
			mfree(v);
			n = (k / 32) + 1;
			v = mnew(n);
			MSIGN(v) = 1;
			MLENGTH(v) = n;
			for (i = 0; i < n; i++)
				v[i] = 0;
			mp_set_bit(v, k);
			t = mmul(u, v);
			mfree(u);
			mfree(v);
			return t;
		}
	}
}
示例#25
0
NOMIPS16 int main()
{
  v2sf a, b, c, d, e, f;
  float f1, f2;

  f1 = 1.2;
  f2 = 3.4;
  a = init (f1, f2);
  b = (v2sf) {1.2, 3.4};
  if (!__builtin_mips_upper_c_eq_ps (a, b) ||
      !__builtin_mips_lower_c_eq_ps (a, b))
    abort ();

  a = (v2sf) {1.2, 2.3};
  b = (v2sf) {5.3, 6.1};
  b = move (a);

  if (!__builtin_mips_upper_c_eq_ps (a, b) ||
      !__builtin_mips_lower_c_eq_ps (a, b))
    abort ();

  a = (v2sf) {1.2, 2.3};
  b = (v2sf) {5.3, 6.1};
  c = add (a, b);
  d = (v2sf) {6.5, 8.4};
  if (!__builtin_mips_upper_c_eq_ps (c, d) ||
      !__builtin_mips_lower_c_eq_ps (c, d))
    abort ();

  a = (v2sf) {1, 12};
  b = (v2sf) {5, 6};
  c = sub (a, b);
  d = (v2sf) {-4, 6};
  if (!__builtin_mips_upper_c_eq_ps (c, d) ||
      !__builtin_mips_lower_c_eq_ps (c, d))
    abort ();

  a = (v2sf) {1, 12};
  b = (v2sf) {5, 6};
  c = mul (a, b);
  d = (v2sf) {5, 72};
  if (!__builtin_mips_upper_c_eq_ps (c, d) ||
      !__builtin_mips_lower_c_eq_ps (c, d))
    abort ();

  a = (v2sf) {1, 12};
  b = (v2sf) {5, 6};
  c = (v2sf) {5, 6};
  d = madd (a, b, c);
  e = (v2sf) {10, 78};
  if (!__builtin_mips_upper_c_eq_ps (d, e) ||
      !__builtin_mips_lower_c_eq_ps (d, e))
    abort ();

  a = (v2sf) {1, 12};
  b = (v2sf) {5, 6};
  c = (v2sf) {5, 6};
  d = msub (a, b, c);
  e = (v2sf) {0, 66};
  if (!__builtin_mips_upper_c_eq_ps (d, e) ||
      !__builtin_mips_lower_c_eq_ps (d, e))
    abort ();

  a = (v2sf) {1, 12};
  b = (v2sf) {5, 6};
  c = (v2sf) {5, 6};
  d = nmadd (a, b, c);
  e = (v2sf) {-10, -78};
  if (!__builtin_mips_upper_c_eq_ps (d, e) ||
      !__builtin_mips_lower_c_eq_ps (d, e))
    abort ();

  a = (v2sf) {1, 12};
  b = (v2sf) {5, 6};
  c = (v2sf) {5, 6};
  d = nmsub (a, b, c);
  e = (v2sf) {0, -66};
  if (!__builtin_mips_upper_c_eq_ps (d, e) ||
      !__builtin_mips_lower_c_eq_ps (d, e))
    abort ();

  a = (v2sf) {98, 12};
  b = neg (a);
  c = (v2sf) {-98, -12};
  if (!__builtin_mips_upper_c_eq_ps (b, c) ||
      !__builtin_mips_lower_c_eq_ps (b, c))
    abort ();

  a = (v2sf) {1, 12};
  b = (v2sf) {5, 6};
  c = cond_move1 (a, b, 1000);
  if (!__builtin_mips_upper_c_eq_ps (c, a) ||
      !__builtin_mips_lower_c_eq_ps (c, a))
    abort ();

  a = (v2sf) {1, 12};
  b = (v2sf) {5, 6};
  c = cond_move2 (a, b, -1000);
  if (!__builtin_mips_upper_c_eq_ps (c, b) ||
      !__builtin_mips_lower_c_eq_ps (c, b))
    abort ();

  a = (v2sf) {1, 12};
  b = (v2sf) {5, 6};
  c = cond_move3 (a, b, 9.0);
  if (!__builtin_mips_upper_c_eq_ps (c, a) ||
      !__builtin_mips_lower_c_eq_ps (c, a))
    abort ();

  a = (v2sf) {1, 12};
  b = (v2sf) {5, 6};
  c = cond_move4 (a, b, -10.0);
  if (!__builtin_mips_upper_c_eq_ps (c, b) ||
      !__builtin_mips_lower_c_eq_ps (c, b))
    abort ();

  a = (v2sf) {5, 12};
  b = (v2sf) {5, 6};
  c = (v2sf) {33, 123};
  d = (v2sf) {8, 78};
  e = __builtin_mips_movt_c_eq_ps (a, b, c, d);
  f = (v2sf) {8, 123};
  if (!__builtin_mips_upper_c_eq_ps (e, f) ||
      !__builtin_mips_lower_c_eq_ps (e, f))
    abort ();

  a = (v2sf) {5, 12};
  b = (v2sf) {5, 6};
  c = (v2sf) {33, 123};
  d = (v2sf) {8, 78};
  e = __builtin_mips_movf_c_eq_ps (a, b, c, d);
  f = (v2sf) {33, 78};
  if (!__builtin_mips_upper_c_eq_ps (e, f) ||
      !__builtin_mips_lower_c_eq_ps (e, f))
    abort ();

  a = load();
  b = (v2sf) {100, 200};
  if (!__builtin_mips_upper_c_eq_ps (a, b) ||
      !__builtin_mips_lower_c_eq_ps (a, b))
    abort ();

  a = (v2sf) {123, 321};
  store (a);
  b = load();
  if (!__builtin_mips_upper_c_eq_ps (a, b) ||
      !__builtin_mips_lower_c_eq_ps (a, b))
    abort ();

  printf ("Test Passes\n");
  exit (0);
}
示例#26
0
    void BVH8Intersector1<robust,PrimitiveIntersector>::intersect(const BVH8* bvh, Ray& ray)
    {
      /*! perform per ray precalculations required by the primitive intersector */
      Precalculations pre(ray,bvh);

      /*! stack state */
      StackItemT<NodeRef> stack[stackSize];  //!< stack of nodes 
      StackItemT<NodeRef>* stackPtr = stack+1;        //!< current stack pointer
      StackItemT<NodeRef>* stackEnd = stack+stackSize;
      stack[0].ptr = bvh->root;
      stack[0].dist = neg_inf;

      /* filter out invalid rays */
#if defined(RTCORE_IGNORE_INVALID_RAYS)
      if (!ray.valid()) return;
#endif

      /* verify correct input */
      assert(ray.tnear > -FLT_MIN);
      //assert(!(types & BVH4::FLAG_NODE_MB) || (ray.time >= 0.0f && ray.time <= 1.0f));

      /*! load the ray into SIMD registers */
      const Vec3f8 norg(-ray.org.x,-ray.org.y,-ray.org.z);
      const Vec3fa ray_rdir = rcp_safe(ray.dir);
      const Vec3f8 rdir(ray_rdir.x,ray_rdir.y,ray_rdir.z);
      const Vec3fa ray_org_rdir = ray.org*ray_rdir;
      const Vec3f8 org_rdir(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
      const float8  ray_near(ray.tnear);
      float8 ray_far(ray.tfar);

      /*! offsets to select the side that becomes the lower or upper bound */
      const size_t nearX = ray_rdir.x >= 0.0f ? 0*sizeof(float8) : 1*sizeof(float8);
      const size_t nearY = ray_rdir.y >= 0.0f ? 2*sizeof(float8) : 3*sizeof(float8);
      const size_t nearZ = ray_rdir.z >= 0.0f ? 4*sizeof(float8) : 5*sizeof(float8);
      
      /* pop loop */
      while (true) pop:
      {
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        stackPtr--;
        NodeRef cur = NodeRef(stackPtr->ptr);
        
        /*! if popped node is too far, pop next one */
        if (unlikely(*(float*)&stackPtr->dist > ray.tfar))
          continue;
        
        /* downtraversal loop */
        while (true)
        {
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          STAT3(normal.trav_nodes,1,1,1);
          
          /*! single ray intersection with 4 boxes */
          const Node* node = cur.node();
          const size_t farX  = nearX ^ sizeof(float8), farY  = nearY ^ sizeof(float8), farZ  = nearZ ^ sizeof(float8);
#if defined (__AVX2__)
          const float8 tNearX = msub(load8f((const char*)node+nearX), rdir.x, org_rdir.x);
          const float8 tNearY = msub(load8f((const char*)node+nearY), rdir.y, org_rdir.y);
          const float8 tNearZ = msub(load8f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const float8 tFarX  = msub(load8f((const char*)node+farX ), rdir.x, org_rdir.x);
          const float8 tFarY  = msub(load8f((const char*)node+farY ), rdir.y, org_rdir.y);
          const float8 tFarZ  = msub(load8f((const char*)node+farZ ), rdir.z, org_rdir.z);
#else
          const float8 tNearX = (norg.x + load8f((const char*)node+nearX)) * rdir.x;
          const float8 tNearY = (norg.y + load8f((const char*)node+nearY)) * rdir.y;
          const float8 tNearZ = (norg.z + load8f((const char*)node+nearZ)) * rdir.z;
          const float8 tFarX  = (norg.x + load8f((const char*)node+farX )) * rdir.x;
          const float8 tFarY  = (norg.y + load8f((const char*)node+farY )) * rdir.y;
          const float8 tFarZ  = (norg.z + load8f((const char*)node+farZ )) * rdir.z;
#endif

        const float round_down = 1.0f-2.0f*float(ulp);
        const float round_up   = 1.0f+2.0f*float(ulp);

#if defined(__AVX2__)
          const float8 tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray_near));
          const float8 tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray_far ));
          const bool8 vmask = robust ?  (round_down*tNear > round_up*tFar) : cast(tNear) > cast(tFar);
          size_t mask = movemask(vmask)^0xff;
#else
          const float8 tNear = max(tNearX,tNearY,tNearZ,ray_near);
          const float8 tFar  = min(tFarX ,tFarY ,tFarZ ,ray_far);
          const bool8 vmask = robust ?  (round_down*tNear > round_up*tFar) : tNear <= tFar;
          size_t mask = movemask(vmask);
#endif
          
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          
          /*! one child is hit, continue with that child */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r); cur.prefetch();
            assert(cur != BVH8::emptyNode);
            continue;
          }
          
          /*! two children are hit, push far child, and continue with closer child */
          NodeRef c0 = node->child(r); c0.prefetch(); const unsigned int d0 = ((unsigned int*)&tNear)[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); c1.prefetch(); const unsigned int d1 = ((unsigned int*)&tNear)[r];
          assert(c0 != BVH8::emptyNode);
          assert(c1 != BVH8::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd); 
            if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; continue; }
            else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; continue; }
          }
          
          /*! Here starts the slow path for 3 or 4 hit children. We push
           *  all nodes onto the stack to sort them there. */
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
          
          /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          NodeRef c = node->child(r); c.prefetch(); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          assert(c != BVH8::emptyNode);
          if (likely(mask == 0)) {
            sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
            cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
            continue;
          }
          
	  /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
          r = __bscf(mask);
          c = node->child(r); c.prefetch(); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
	  if (likely(mask == 0)) {
	    sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
	    cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
	    continue;
	  }

	  /*! fallback case if more than 4 children are hit */
	  while (1)
	  {
	    r = __bscf(mask);
	    assert(stackPtr < stackEnd);
	    c = node->child(r); c.prefetch(); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
	    if (unlikely(mask == 0)) break;
	  }
	  
	  cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
	}
        
        /*! this is a leaf node */
	assert(cur != BVH8::emptyNode);
        STAT3(normal.trav_leaves,1,1,1);
        size_t num; Primitive* prim = (Primitive*) cur.leaf(num);
        size_t lazy_node = 0;
        PrimitiveIntersector::intersect(pre,ray,prim,num,bvh->scene,lazy_node);
        ray_far = ray.tfar;

        if (unlikely(lazy_node)) {
          stackPtr->ptr = lazy_node;
          stackPtr->dist = inf;
          stackPtr++;
        }
      }

      AVX_ZERO_UPPER();
    }
示例#27
0
    void BVH4Intersector1<PrimitiveIntersector>::intersect(const BVH4* bvh, Ray& ray)
    {
      /*! stack state */
      StackItemInt32<NodeRef> stack[stackSize];  //!< stack of nodes 
      StackItemInt32<NodeRef>* stackPtr = stack+1;        //!< current stack pointer
      StackItemInt32<NodeRef>* stackEnd = stack+stackSize;
      stack[0].ptr = bvh->root;
      stack[0].dist = neg_inf;
      
      /*! offsets to select the side that becomes the lower or upper bound */
      const size_t nearX = ray.dir.x >= 0.0f ? 0*sizeof(ssef) : 1*sizeof(ssef);
      const size_t nearY = ray.dir.y >= 0.0f ? 2*sizeof(ssef) : 3*sizeof(ssef);
      const size_t nearZ = ray.dir.z >= 0.0f ? 4*sizeof(ssef) : 5*sizeof(ssef);
      
#if 0 // FIXME: why is this slower
      /*! load the ray */
      Vec3fa ray_org = ray.org;
      Vec3fa ray_dir = ray.dir;
      ssef ray_near  = max(ray.tnear,FLT_MIN); // we do not support negative tnear values in this kernel due to integer optimizations
      ssef ray_far   = ray.tfar; 
#if defined(__FIX_RAYS__)
      const float float_range = 0.1f*FLT_MAX;
      ray_org = clamp(ray_org,Vec3fa(-float_range),Vec3fa(+float_range));
      ray_dir = clamp(ray_dir,Vec3fa(-float_range),Vec3fa(+float_range));
      ray_far = min(ray_far,float(inf)); 
#endif
      const Vec3fa ray_rdir = rcp_safe(ray_dir);
      const sse3f org(ray_org), dir(ray_dir);
      const sse3f norg(-ray_org), rdir(ray_rdir), org_rdir(ray_org*ray_rdir);
#else
      /*! load the ray into SIMD registers */
      const sse3f norg(-ray.org.x,-ray.org.y,-ray.org.z);
      const Vec3fa ray_rdir = rcp_safe(ray.dir);
      const sse3f rdir(ray_rdir.x,ray_rdir.y,ray_rdir.z);
      const Vec3fa ray_org_rdir = ray.org*ray_rdir;
      const sse3f org_rdir(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
      const ssef  ray_near(ray.tnear);
      ssef ray_far(ray.tfar);
#endif

      /* pop loop */
      while (true) pop:
      {
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        stackPtr--;
        NodeRef cur = NodeRef(stackPtr->ptr);
        
        /*! if popped node is too far, pop next one */
        if (unlikely(*(float*)&stackPtr->dist > ray.tfar))
          continue;
        
        /* downtraversal loop */
        while (true)
        {
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          STAT3(normal.trav_nodes,1,1,1);
          
          /*! single ray intersection with 4 boxes */
          const Node* node = cur.node();
          const size_t farX  = nearX ^ 16, farY  = nearY ^ 16, farZ  = nearZ ^ 16;
#if defined (__AVX2__)
          const ssef tNearX = msub(load4f((const char*)node+nearX), rdir.x, org_rdir.x);
          const ssef tNearY = msub(load4f((const char*)node+nearY), rdir.y, org_rdir.y);
          const ssef tNearZ = msub(load4f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const ssef tFarX  = msub(load4f((const char*)node+farX ), rdir.x, org_rdir.x);
          const ssef tFarY  = msub(load4f((const char*)node+farY ), rdir.y, org_rdir.y);
          const ssef tFarZ  = msub(load4f((const char*)node+farZ ), rdir.z, org_rdir.z);
#else
          const ssef tNearX = (norg.x + load4f((const char*)node+nearX)) * rdir.x;
          const ssef tNearY = (norg.y + load4f((const char*)node+nearY)) * rdir.y;
          const ssef tNearZ = (norg.z + load4f((const char*)node+nearZ)) * rdir.z;
          const ssef tFarX  = (norg.x + load4f((const char*)node+farX )) * rdir.x;
          const ssef tFarY  = (norg.y + load4f((const char*)node+farY )) * rdir.y;
          const ssef tFarZ  = (norg.z + load4f((const char*)node+farZ )) * rdir.z;
#endif

#if defined(__SSE4_1__)
          const ssef tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray_near));
          const ssef tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray_far ));
          const sseb vmask = cast(tNear) > cast(tFar);
          size_t mask = movemask(vmask)^0xf;
#else
          const ssef tNear = max(tNearX,tNearY,tNearZ,ray_near);
          const ssef tFar  = min(tFarX ,tFarY ,tFarZ ,ray_far);
          const sseb vmask = tNear <= tFar;
          size_t mask = movemask(vmask);
#endif
          
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          
          /*! one child is hit, continue with that child */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r);
            assert(cur != BVH4::emptyNode);
            continue;
          }
          
          /*! two children are hit, push far child, and continue with closer child */
          NodeRef c0 = node->child(r); const unsigned int d0 = ((unsigned int*)&tNear)[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); const unsigned int d1 = ((unsigned int*)&tNear)[r];
          assert(c0 != BVH4::emptyNode);
          assert(c1 != BVH4::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd); 
            if (d0 < d1) { stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++; cur = c0; continue; }
            else         { stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++; cur = c1; continue; }
          }
          
          /*! Here starts the slow path for 3 or 4 hit children. We push
           *  all nodes onto the stack to sort them there. */
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c0; stackPtr->dist = d0; stackPtr++;
          assert(stackPtr < stackEnd); 
          stackPtr->ptr = c1; stackPtr->dist = d1; stackPtr++;
          
          /*! three children are hit, push all onto stack and sort 3 stack items, continue with closest child */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          NodeRef c = node->child(r); unsigned int d = ((unsigned int*)&tNear)[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          assert(c != BVH4::emptyNode);
          if (likely(mask == 0)) {
            sort(stackPtr[-1],stackPtr[-2],stackPtr[-3]);
            cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
            continue;
          }
          
          /*! four children are hit, push all onto stack and sort 4 stack items, continue with closest child */
          assert(stackPtr < stackEnd); 
          r = __bscf(mask);
          c = node->child(r); d = *(unsigned int*)&tNear[r]; stackPtr->ptr = c; stackPtr->dist = d; stackPtr++;
          assert(c != BVH4::emptyNode);
          sort(stackPtr[-1],stackPtr[-2],stackPtr[-3],stackPtr[-4]);
          cur = (NodeRef) stackPtr[-1].ptr; stackPtr--;
        }
        
        /*! this is a leaf node */
        STAT3(normal.trav_leaves,1,1,1);
        size_t num; Primitive* prim = (Primitive*) cur.leaf(num);
        PrimitiveIntersector::intersect(ray,prim,num,bvh->geometry);
        ray_far = ray.tfar;
      }
    }
示例#28
0
  bool BVH4iIntersector1<TriangleIntersector>::occluded(const BVH4iIntersector1* This, Ray& ray)
  {
    AVX_ZERO_UPPER();
    STAT3(shadow.travs,1,1,1);
    
    /*! stack state */
    const BVH4i* bvh = This->bvh;
    NodeRef stack[1+3*BVH4i::maxDepth];  //!< stack of nodes that still need to get traversed
    NodeRef* stackPtr = stack+1;        //!< current stack pointer
    stack[0]  = bvh->root;

    /*! offsets to select the side that becomes the lower or upper bound */
    const size_t nearX = ray.dir.x >= 0 ? 0*sizeof(ssef_m) : 1*sizeof(ssef_m);
    const size_t nearY = ray.dir.y >= 0 ? 2*sizeof(ssef_m) : 3*sizeof(ssef_m);
    const size_t nearZ = ray.dir.z >= 0 ? 4*sizeof(ssef_m) : 5*sizeof(ssef_m);
    
    /*! load the ray into SIMD registers */
    const sse3f norg(-ray.org.x,-ray.org.y,-ray.org.z);
    const Vector3f ray_rdir = rcp_safe(ray.dir);
    const sse3f rdir(ray_rdir.x,ray_rdir.y,ray_rdir.z);
    const Vector3f ray_org_rdir = ray.org*ray_rdir;
    const sse3f org_rdir(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
    const ssef  rayNear(ray.tnear);
    const ssef  rayFar(ray.tfar);

    const void* nodePtr = bvh->nodePtr();
    const void* triPtr  = bvh->triPtr();
    
    /* pop loop */
    while (true) pop:
    {
      /*! pop next node */
      if (unlikely(stackPtr == stack)) break;
      stackPtr--;
      NodeRef cur = (NodeRef) *stackPtr;
      
      /* downtraversal loop */
      while (true)
      {
        /*! stop if we found a leaf */
        if (unlikely(cur.isLeaf())) break;
        STAT3(shadow.trav_nodes,1,1,1);
    
        /*! single ray intersection with 4 boxes */
        const Node* node = cur.node(nodePtr);
        const size_t farX  = nearX ^ 16, farY  = nearY ^ 16, farZ  = nearZ ^ 16;
#if defined (__AVX2__)
        const ssef tNearX = msub(ssef((const char*)nodePtr+(size_t)cur+nearX), rdir.x, org_rdir.x);
        const ssef tNearY = msub(ssef((const char*)nodePtr+(size_t)cur+nearY), rdir.y, org_rdir.y);
        const ssef tNearZ = msub(ssef((const char*)nodePtr+(size_t)cur+nearZ), rdir.z, org_rdir.z);
        const ssef tFarX  = msub(ssef((const char*)nodePtr+(size_t)cur+farX ), rdir.x, org_rdir.x);
        const ssef tFarY  = msub(ssef((const char*)nodePtr+(size_t)cur+farY ), rdir.y, org_rdir.y);
        const ssef tFarZ  = msub(ssef((const char*)nodePtr+(size_t)cur+farZ ), rdir.z, org_rdir.z);
#else
        const ssef tNearX = (norg.x + ssef((const char*)nodePtr+(size_t)cur+nearX)) * rdir.x;
        const ssef tNearY = (norg.y + ssef((const char*)nodePtr+(size_t)cur+nearY)) * rdir.y;
        const ssef tNearZ = (norg.z + ssef((const char*)nodePtr+(size_t)cur+nearZ)) * rdir.z;
        const ssef tFarX  = (norg.x + ssef((const char*)nodePtr+(size_t)cur+farX )) * rdir.x;
        const ssef tFarY  = (norg.y + ssef((const char*)nodePtr+(size_t)cur+farY )) * rdir.y;
        const ssef tFarZ  = (norg.z + ssef((const char*)nodePtr+(size_t)cur+farZ )) * rdir.z;
#endif
        const ssef tNear = max(tNearX,tNearY,tNearZ,rayNear);
        const ssef tFar  = min(tFarX ,tFarY ,tFarZ ,rayFar);
    size_t mask = movemask(tNear <= tFar);
        
        /*! if no child is hit, pop next node */
        if (unlikely(mask == 0))
          goto pop;

        /*! one child is hit, continue with that child */
        size_t r = __bsf(mask); mask = __btc(mask,r);
        if (likely(mask == 0)) {
          cur = node->child(r);
          continue;
        }

        /*! two children are hit, push far child, and continue with closer child */
        NodeRef c0 = node->child(r); const float d0 = tNear[r];
        r = __bsf(mask); mask = __btc(mask,r);
        NodeRef c1 = node->child(r); const float d1 = tNear[r];
        if (likely(mask == 0)) {
          if (d0 < d1) { *stackPtr = c1; stackPtr++; cur = c0; continue; }
          else         { *stackPtr = c0; stackPtr++; cur = c1; continue; }
        }
        *stackPtr = c0; stackPtr++;
        *stackPtr = c1; stackPtr++;

        /*! three children are hit */
        r = __bsf(mask); mask = __btc(mask,r);
        cur = node->child(r); *stackPtr = cur; stackPtr++;
        if (likely(mask == 0)) {
          stackPtr--;
          continue;
        }

        /*! four children are hit */
        cur = node->child(3);
      }

      /*! this is a leaf node */
      STAT3(shadow.trav_leaves,1,1,1);
      size_t num; Triangle* tri = (Triangle*) cur.leaf(triPtr,num);
      for (size_t i=0; i<num; i++) {
        if (TriangleIntersector::occluded(ray,tri[i],bvh->vertices)) {
          AVX_ZERO_UPPER();
          return true;
        }
      }
    }
    AVX_ZERO_UPPER();
    return false;
  }
    void BVH4mbIntersector16Hybrid<LeafIntersector>::intersect(int16* valid_i, BVH4mb* bvh, Ray16& ray16)
    {
      /* near and node stack */
      __aligned(64) float16   stack_dist[3*BVH4i::maxDepth+1];
      __aligned(64) NodeRef stack_node[3*BVH4i::maxDepth+1];
      __aligned(64) NodeRef stack_node_single[3*BVH4i::maxDepth+1]; 

      /* load ray */
      const bool16 valid0     = *(int16*)valid_i != int16(0);
      const Vec3f16 rdir16     = rcp_safe(ray16.dir);
      const Vec3f16 org_rdir16 = ray16.org * rdir16;
      float16 ray_tnear        = select(valid0,ray16.tnear,pos_inf);
      float16 ray_tfar         = select(valid0,ray16.tfar ,neg_inf);
      const float16 inf        = float16(pos_inf);
      
      /* allocate stack and push root node */
      stack_node[0] = BVH4i::invalidNode;
      stack_dist[0] = inf;
      stack_node[1] = bvh->root;
      stack_dist[1] = ray_tnear; 
      NodeRef* __restrict__ sptr_node = stack_node + 2;
      float16*   __restrict__ sptr_dist = stack_dist + 2;
      
      const Node      * __restrict__ nodes = (Node     *)bvh->nodePtr();
      const BVH4mb::Triangle01 * __restrict__ accel = (BVH4mb::Triangle01 *)bvh->triPtr();

      while (1) pop:
      {
        /* pop next node from stack */
        NodeRef curNode = *(sptr_node-1);
        float16 curDist   = *(sptr_dist-1);
        sptr_node--;
        sptr_dist--;
	const bool16 m_stackDist = ray_tfar > curDist;

	/* stack emppty ? */
        if (unlikely(curNode == BVH4i::invalidNode))  break;
        
        /* cull node if behind closest hit point */
        if (unlikely(none(m_stackDist))) continue;
        
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////

	/* switch to single ray mode */
        if (unlikely(countbits(m_stackDist) <= BVH4i::hybridSIMDUtilSwitchThreshold)) 
	  {
	    float   *__restrict__ stack_dist_single = (float*)sptr_dist;
	    store16f(stack_dist_single,inf);

	    /* traverse single ray */	  	  
	    long rayIndex = -1;
	    while((rayIndex = bitscan64(rayIndex,m_stackDist)) != BITSCAN_NO_BIT_SET_64) 
	      {	    
		stack_node_single[0] = BVH4i::invalidNode;
		stack_node_single[1] = curNode;
		size_t sindex = 2;

		const float16 org_xyz      = loadAOS4to16f(rayIndex,ray16.org.x,ray16.org.y,ray16.org.z);
		const float16 dir_xyz      = loadAOS4to16f(rayIndex,ray16.dir.x,ray16.dir.y,ray16.dir.z);
		const float16 rdir_xyz     = loadAOS4to16f(rayIndex,rdir16.x,rdir16.y,rdir16.z);
		const float16 org_rdir_xyz = org_xyz * rdir_xyz;
		const float16 min_dist_xyz = broadcast1to16f(&ray16.tnear[rayIndex]);
		float16       max_dist_xyz = broadcast1to16f(&ray16.tfar[rayIndex]);
		const float16 time         = broadcast1to16f(&ray16.time[rayIndex]);

		const unsigned int leaf_mask = BVH4I_LEAF_MASK;

		while (1) 
		  {
		    NodeRef curNode = stack_node_single[sindex-1];
		    sindex--;
            
		    traverse_single_intersect(curNode,
					      sindex,
					      rdir_xyz,
					      org_rdir_xyz,
					      min_dist_xyz,
					      max_dist_xyz,
					      time,
					      stack_node_single,
					      stack_dist_single,
					      nodes,
					      leaf_mask);	    

		    /* return if stack is empty */
		    if (unlikely(curNode == BVH4i::invalidNode)) break;


		    /* intersect one ray against four triangles */
		    const bool hit = LeafIntersector::intersect(curNode,
								rayIndex,
								dir_xyz,
								org_xyz,
								min_dist_xyz,
								max_dist_xyz,
								ray16,
								accel,
								(Scene*)bvh->geometry);
		    
		    if (hit)
		      compactStack(stack_node_single,stack_dist_single,sindex,max_dist_xyz);

		  }	  
	      }
	    ray_tfar = select(valid0,ray16.tfar ,neg_inf);
	    continue;
	  }

	///////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////
	///////////////////////////////////////////////////////////////////////////////////////////////////////////////

	const unsigned int leaf_mask = BVH4I_LEAF_MASK;

	const float16 time     = ray16.time;
	const float16 one_time = (float16::one() - time);

        while (1)
        {
          /* test if this is a leaf node */
          if (unlikely(curNode.isLeaf(leaf_mask))) break;
          
          STAT3(normal.trav_nodes,1,popcnt(ray_tfar > curDist),16);
          const Node* __restrict__ const node = curNode.node(nodes);
          
          const BVH4mb::Node* __restrict__ const nodeMB = (BVH4mb::Node*)node;

          /* pop of next node */
          sptr_node--;
          sptr_dist--;
          curNode = *sptr_node; 
          curDist = *sptr_dist;
          
	  prefetch<PFHINT_L1>((char*)node + 0*64); 
	  prefetch<PFHINT_L1>((char*)node + 1*64); 
	  prefetch<PFHINT_L1>((char*)node + 2*64); 
	  prefetch<PFHINT_L1>((char*)node + 3*64); 

#pragma unroll(4)
          for (unsigned int i=0; i<4; i++)
          {
	    const NodeRef child = node->lower[i].child;

	    const float16 lower_x =  one_time * nodeMB->lower[i].x + time * nodeMB->lower_t1[i].x;
	    const float16 lower_y =  one_time * nodeMB->lower[i].y + time * nodeMB->lower_t1[i].y;
	    const float16 lower_z =  one_time * nodeMB->lower[i].z + time * nodeMB->lower_t1[i].z;
	    const float16 upper_x =  one_time * nodeMB->upper[i].x + time * nodeMB->upper_t1[i].x;
	    const float16 upper_y =  one_time * nodeMB->upper[i].y + time * nodeMB->upper_t1[i].y;
	    const float16 upper_z =  one_time * nodeMB->upper[i].z + time * nodeMB->upper_t1[i].z;

	    if (unlikely(i >=2 && child == BVH4i::invalidNode)) break;

            const float16 lclipMinX = msub(lower_x,rdir16.x,org_rdir16.x);
            const float16 lclipMinY = msub(lower_y,rdir16.y,org_rdir16.y);
            const float16 lclipMinZ = msub(lower_z,rdir16.z,org_rdir16.z);
            const float16 lclipMaxX = msub(upper_x,rdir16.x,org_rdir16.x);
            const float16 lclipMaxY = msub(upper_y,rdir16.y,org_rdir16.y);
            const float16 lclipMaxZ = msub(upper_z,rdir16.z,org_rdir16.z);
	    
            const float16 lnearP = max(max(min(lclipMinX, lclipMaxX), min(lclipMinY, lclipMaxY)), min(lclipMinZ, lclipMaxZ));
            const float16 lfarP  = min(min(max(lclipMinX, lclipMaxX), max(lclipMinY, lclipMaxY)), max(lclipMinZ, lclipMaxZ));
            const bool16 lhit   = max(lnearP,ray_tnear) <= min(lfarP,ray_tfar);   
	    const float16 childDist = select(lhit,lnearP,inf);
            const bool16 m_child_dist = childDist < curDist;
            /* if we hit the child we choose to continue with that child if it 
               is closer than the current next child, or we push it onto the stack */
            if (likely(any(lhit)))
            {
              sptr_node++;
              sptr_dist++;
              
              /* push cur node onto stack and continue with hit child */
              if (any(m_child_dist))
              {
                *(sptr_node-1) = curNode;
                *(sptr_dist-1) = curDist; 
                curDist = childDist;
                curNode = child;
              }              
              /* push hit child onto stack*/
              else 
		{
		  *(sptr_node-1) = child;
		  *(sptr_dist-1) = childDist; 
		}
              assert(sptr_node - stack_node < BVH4i::maxDepth);
            }	      
          }
#if SWITCH_ON_DOWN_TRAVERSAL == 1
	  const bool16 curUtil = ray_tfar > curDist;
	  if (unlikely(countbits(curUtil) <= BVH4i::hybridSIMDUtilSwitchThreshold))
	    {
	      *sptr_node++ = curNode;
	      *sptr_dist++ = curDist; 
	      goto pop;
	    }
#endif

        }
        
        /* return if stack is empty */
        if (unlikely(curNode == BVH4i::invalidNode)) break;
        

        /* intersect leaf */
        const bool16 m_valid_leaf = ray_tfar > curDist;
        STAT3(normal.trav_leaves,1,popcnt(m_valid_leaf),16);

	LeafIntersector::intersect16(curNode,
				     m_valid_leaf,
				     ray16.dir,
				     ray16.org,
				     ray16,
				     accel,
				     (Scene*)bvh->geometry);


        ray_tfar = select(m_valid_leaf,ray16.tfar,ray_tfar);
      }
示例#30
0
    void BVH4Intersector1<PrimitiveIntersector>::occluded(const BVH4* bvh, Ray& ray)
    {
      /*! stack state */
      NodeRef stack[stackSize];  //!< stack of nodes that still need to get traversed
      NodeRef* stackPtr = stack+1;        //!< current stack pointer
      NodeRef* stackEnd = stack+stackSize;
      stack[0] = bvh->root;
      
      /*! offsets to select the side that becomes the lower or upper bound */
      const size_t nearX = ray.dir.x >= 0 ? 0*sizeof(ssef) : 1*sizeof(ssef);
      const size_t nearY = ray.dir.y >= 0 ? 2*sizeof(ssef) : 3*sizeof(ssef);
      const size_t nearZ = ray.dir.z >= 0 ? 4*sizeof(ssef) : 5*sizeof(ssef);
      
#if 0 // FIXME: why is this slower
      /*! load the ray */
      Vec3fa ray_org = ray.org;
      Vec3fa ray_dir = ray.dir;
      ssef ray_near  = max(ray.tnear,FLT_MIN); // we do not support negative tnear values in this kernel due to integer optimizations
      ssef ray_far   = ray.tfar; 
#if defined(__FIX_RAYS__)
      const float float_range = 0.1f*FLT_MAX;
      ray_org = clamp(ray_org,Vec3fa(-float_range),Vec3fa(+float_range));
      ray_dir = clamp(ray_dir,Vec3fa(-float_range),Vec3fa(+float_range));
      ray_far = min(ray_far,float(inf)); 
#endif
      const Vec3fa ray_rdir = rcp_safe(ray_dir);
      const sse3f org(ray_org), dir(ray_dir);
      const sse3f norg(-ray_org), rdir(ray_rdir), org_rdir(ray_org*ray_rdir);
#else
      /*! load the ray into SIMD registers */
      const sse3f norg(-ray.org.x,-ray.org.y,-ray.org.z);
      const Vec3fa ray_rdir = rcp_safe(ray.dir);
      const sse3f rdir(ray_rdir.x,ray_rdir.y,ray_rdir.z);
      const Vec3fa ray_org_rdir = ray.org*ray_rdir;
      const sse3f org_rdir(ray_org_rdir.x,ray_org_rdir.y,ray_org_rdir.z);
      const ssef  ray_near(ray.tnear);
      ssef ray_far(ray.tfar);
#endif
      
      /* pop loop */
      while (true) pop:
      {
        /*! pop next node */
        if (unlikely(stackPtr == stack)) break;
        stackPtr--;
        NodeRef cur = (NodeRef) *stackPtr;
        
        /* downtraversal loop */
        while (true)
        {
          /*! stop if we found a leaf */
          if (unlikely(cur.isLeaf())) break;
          STAT3(shadow.trav_nodes,1,1,1);
          
          /*! single ray intersection with 4 boxes */
          const Node* node = cur.node();
          const size_t farX  = nearX ^ 16, farY  = nearY ^ 16, farZ  = nearZ ^ 16;
#if defined (__AVX2__)
          const ssef tNearX = msub(load4f((const char*)node+nearX), rdir.x, org_rdir.x);
          const ssef tNearY = msub(load4f((const char*)node+nearY), rdir.y, org_rdir.y);
          const ssef tNearZ = msub(load4f((const char*)node+nearZ), rdir.z, org_rdir.z);
          const ssef tFarX  = msub(load4f((const char*)node+farX ), rdir.x, org_rdir.x);
          const ssef tFarY  = msub(load4f((const char*)node+farY ), rdir.y, org_rdir.y);
          const ssef tFarZ  = msub(load4f((const char*)node+farZ ), rdir.z, org_rdir.z);
#else
          const ssef tNearX = (norg.x + load4f((const char*)node+nearX)) * rdir.x;
          const ssef tNearY = (norg.y + load4f((const char*)node+nearY)) * rdir.y;
          const ssef tNearZ = (norg.z + load4f((const char*)node+nearZ)) * rdir.z;
          const ssef tFarX  = (norg.x + load4f((const char*)node+farX )) * rdir.x;
          const ssef tFarY  = (norg.y + load4f((const char*)node+farY )) * rdir.y;
          const ssef tFarZ  = (norg.z + load4f((const char*)node+farZ )) * rdir.z;
#endif
          
#if defined(__SSE4_1__)
          const ssef tNear = maxi(maxi(tNearX,tNearY),maxi(tNearZ,ray_near));
          const ssef tFar  = mini(mini(tFarX ,tFarY ),mini(tFarZ ,ray_far ));
          const sseb vmask = cast(tNear) > cast(tFar);
          size_t mask = movemask(vmask)^0xf;
#else
          const ssef tNear = max(tNearX,tNearY,tNearZ,ray_near);
          const ssef tFar  = min(tFarX ,tFarY ,tFarZ ,ray_far);
          const sseb vmask = tNear <= tFar;
          size_t mask = movemask(vmask);
#endif
          
          /*! if no child is hit, pop next node */
          if (unlikely(mask == 0))
            goto pop;
          
          /*! one child is hit, continue with that child */
          size_t r = __bscf(mask);
          if (likely(mask == 0)) {
            cur = node->child(r);
            assert(cur != BVH4::emptyNode);
            continue;
          }
          
          /*! two children are hit, push far child, and continue with closer child */
          NodeRef c0 = node->child(r); const unsigned int d0 = ((unsigned int*)&tNear)[r];
          r = __bscf(mask);
          NodeRef c1 = node->child(r); const unsigned int d1 = ((unsigned int*)&tNear)[r];
          assert(c0 != BVH4::emptyNode);
          assert(c1 != BVH4::emptyNode);
          if (likely(mask == 0)) {
            assert(stackPtr < stackEnd);
            if (d0 < d1) { *stackPtr = c1; stackPtr++; cur = c0; continue; }
            else         { *stackPtr = c0; stackPtr++; cur = c1; continue; }
          }
          assert(stackPtr < stackEnd);
          *stackPtr = c0; stackPtr++;
          assert(stackPtr < stackEnd);
          *stackPtr = c1; stackPtr++;
          
          /*! three children are hit */
          r = __bscf(mask);
          cur = node->child(r); 
          assert(cur != BVH4::emptyNode);
          if (likely(mask == 0)) continue;
          assert(stackPtr < stackEnd);
          *stackPtr = cur; stackPtr++;
          
          /*! four children are hit */
          cur = node->child(3);
          assert(cur != BVH4::emptyNode);
        }
        
        /*! this is a leaf node */
        STAT3(shadow.trav_leaves,1,1,1);
        size_t num; Primitive* prim = (Primitive*) cur.leaf(num);
        if (PrimitiveIntersector::occluded(ray,prim,num,bvh->geometry)) {
          ray.geomID = 0;
          break;
        }
      }
      AVX_ZERO_UPPER();
    }