void ALIGNED fast_light_process(
  struct Ray **rayplanes, struct Result **resplanes,
  struct VMState *states, int numstates,
  vec3f* lightpos,
  void* self
) {
  for (int i = 0; i < numstates; ++i) {
    struct VMState* sp = states++;
    
    if (sp->stream_ptr[0] != self) continue;
    sp->stream_ptr ++; sp->stream_len --;
    
    struct Result *res = resplanes[sp->resid - 1] + i;
    if (res->success) {
      v4sf nspos;
      {
        struct Ray *ray = rayplanes[sp->rayid - 1] + i;
        nspos = V4SF(ray->pos) + V4SF(ray->dir) * (v4sf) FOUR(res->distance * 0.999);
      }
      sp->rayid ++;
      v4sf lightdir = *(v4sf*) lightpos - nspos;
      v4sf lsq = lightdir * lightdir;
      float ldfac = 1 / sqrtf(XSUM(lsq));
      lightdir *= (v4sf) FOUR(ldfac);
      {
        struct Ray *ray = rayplanes[sp->rayid - 1] + i;
        V4SF(ray->pos) = nspos;
        V4SF(ray->dir) = lightdir;
      }
    } else {
      sp->stream_ptr += sp->stream_len - 1;
      sp->stream_len = 1;
    }
  }
}
void ALIGNED fast_checker_process(
  struct Ray **rayplanes, struct Result **resplanes,
  struct VMState *states, int numstates,
  vec3f a, vec3f b,
  void* self
) {
  for (int i = 0; i < numstates; ++i) {
    struct VMState* sp = states++;
    
    if (sp->stream_ptr[0] != self) continue;
    sp->stream_ptr ++; sp->stream_len --;
    
    struct Result *res = resplanes[sp->resid - 1] + i;
    struct Ray *ray = rayplanes[sp->rayid - 1] + i;
    if (res -> success) {
      v4sf hitpos = V4SF(ray->pos) + (v4sf) FOUR(res->distance) * V4SF(ray->dir);
      vec3f hitposv = *(vec3f*) &hitpos;
      res->emissive_col = (vec3f){0,0,0,0};
      // would overflow
      if (fabsf(hitposv.x) > INT_MAX || fabsf(hitposv.y) > INT_MAX || fabsf(hitposv.z) > INT_MAX) {
        v4sf temp = (V4SF(b) + V4SF(a)) / (v4sf) FOUR(2);
        res->col = *(vec3f*) &temp;
        continue;
      }
      int ix = (int) hitposv.x, iy = (int) hitposv.y, iz = (int) hitposv.z;
      if (hitposv.x < 0) ix --;
      if (hitposv.y < 0) iy --;
      if (hitposv.z < 0) iz --;
      if ((ix & 1) ^ (iy & 1) ^ (iz & 1))
        res->col = b;
      else
        res->col = a;
    }
  }
}
void ALIGNED ray_to_coordsf(int dw, int dh, struct Ray *rayp, float *xp, float *yp) {
  float ratio = dw * 1.0f / dh;
  v4sf dir = *(v4sf*) &rayp->dir;
  dir /= (v4sf) FOUR(Z(dir)); // denormalize
  *xp = (1.0f + (X(dir) / (ratio * fov))) * (dw / 2.0);
  *yp = (1.0f - (Y(dir) / fov)) * (dh / 2.0);
}
static int internal_rayHitsAABB(vec3f *abp, vec3f *p_ray, float *dist) {
#define ap &abp[0]
#define bp &abp[1]
#define p_pos &p_ray[0]
#define p_dir &p_ray[1]
  #define SF(VAR) (*(v4sf*) VAR)
  float dirprod = X(SF(p_dir)) * Y(SF(p_dir)) * Z(SF(p_dir));
  #undef SF
  v4si mask = (v4si) FOUR(1<<31);
  v4si signs = mask & *(v4si*)p_dir;
  v4sf a = (v4sf) (V4SI(*ap) ^ signs);
  v4sf b = (v4sf) (V4SI(*bp) ^ signs);
  v4sf pos = (v4sf) (V4SI(*p_pos) ^ signs);
  v4sf dir = (v4sf) (V4SI(*p_dir) ^ signs);
  v4sf b_ = b;
  // pretend ray starts at origin: -pos
  b = __builtin_ia32_maxps(a, b) - pos;
  
  // if (X(b) < 0 || Y(b) < 0 || Z(b) < 0) return 0; // ray is pointed away from aabb.
  v4si bsign = mask & *(v4si*)&b;
  if (IX(bsign) | IY(bsign) | IZ(bsign)) return 0;
  
  a = __builtin_ia32_minps(a, b_) - pos;
  // multiply every component with dir.(x*y*z)
  // vec3f dista = a / dir, distb = b / dir;
  vec3f *_vdir = (vec3f*) &dir;
  vec3f *_dista = (vec3f*) &a, *_distb = (vec3f*) &b;
#define vdir (*_vdir)
#define dista (*_dista)
#define distb (*_distb)
  
  if (LIKELY(vdir.x != 0 && vdir.y != 0 && vdir.z != 0)) {
    // vdir += (v4sf) {0, 0, 0, 1};
    *(v4si*) &dir &= (v4si) {-1, -1, -1, 0};
    dir += (v4sf) {0, 0, 0, 1};
    a /= dir;
    b /= dir;
  } else {
    if (LIKELY(vdir.x != 0)) { dista.x /= vdir.x; distb.x /= vdir.x; }
    else { dista.x = copysignf(INFINITY, dista.x); distb.x = copysignf(INFINITY, distb.x); }
    
    if (LIKELY(vdir.y != 0)) { dista.y /= vdir.y; distb.y /= vdir.y; }
    else { dista.y = copysignf(INFINITY, dista.y); distb.y = copysignf(INFINITY, distb.y); }
    
    if (LIKELY(vdir.z != 0)) { dista.z /= vdir.z; distb.z /= vdir.z; }
    else { dista.z = copysignf(INFINITY, dista.z); distb.z = copysignf(INFINITY, distb.z); }
  }
  float entry = fmaxf(dista.x, fmaxf(dista.y, dista.z));
  float exit = fminf(distb.x, fminf(distb.y, distb.z));
  if (dist) { *dist = entry; }
  return entry <= exit;
#undef dista
#undef vdir
#undef ap
#undef bp
#undef p_pos
#undef p_dir
}
void ALIGNED coordsf_to_ray(int dw, int dh, float x, float y, struct Ray *rayp) {
  float ratio = dw * 1.0f / dh;
  v4sf v = (v4sf) {ratio * fov * (x / (dw / 2.0) - 1.0), fov * (1.0 - y / (dh / 2.0)), 1.0, 0.0};
  v4sf res = v;
  v *= v;
  float f = 1.0f / sqrtf(*(float*) &v + *((float*) &v + 1) + *((float*) &v + 2));
  
  res *= (v4sf) FOUR(f);
  *(v4sf*) &rayp->pos = (v4sf) {0,2,0,0};
  *(v4sf*) &rayp->dir = (v4sf) res;
}
void fast_scale_process(
  struct Ray **rayplanes, struct Result **resplanes,
  struct VMState *states, int numstates,
  float factor,
  void* self
) {
  for (int i = 0; i < numstates; ++i) {
    struct VMState* sp = states++;
    
    if (sp->stream_ptr[0] != self) continue;
    sp->stream_ptr ++; sp->stream_len --;
    
    struct Ray *RAY  = rayplanes[sp->rayid - 1] + i;
    sp->rayid ++;
    struct Ray *RAY2 = rayplanes[sp->rayid - 1] + i;
    
    V4SF(RAY2->pos) = V4SF(RAY->pos) * (v4sf) FOUR(1/factor);
    V4SF(RAY2->dir) = V4SF(RAY->dir);
  }
}
Beispiel #7
0
#define CGU709  13959
#define CGV709  34996


/* calculation float resolution in bits */
/* ie RES = 6 is 10.6 fixed point */
/*    RES = 8 is 8.8 fixed point */
/*    RES = 4 is 12.4 fixed point */
/* NB: going above 6 will lead to overflow... :( */
#define RES    6

#define RZ(i)  (i >> (BITRES - RES))
#define FOUR(i) {i, i, i, i}

#ifdef BUILD_MMX
__attribute__ ((aligned (8))) const volatile unsigned short _const_crvcrv[4] = FOUR(RZ(CRV));
__attribute__ ((aligned (8))) const volatile unsigned short _const_cbucbu[4] = FOUR(RZ(CBU));
__attribute__ ((aligned (8))) const volatile unsigned short _const_cgucgu[4] = FOUR(RZ(CGU));
__attribute__ ((aligned (8))) const volatile unsigned short _const_cgvcgv[4] = FOUR(RZ(CGV));
__attribute__ ((aligned (8))) const volatile unsigned short _const_ymul  [4] = FOUR(RZ(YMUL));
__attribute__ ((aligned (8))) const volatile unsigned short _const_128   [4] = FOUR(128);
__attribute__ ((aligned (8))) const volatile unsigned short _const_32    [4] = FOUR(RZ(OFF));
__attribute__ ((aligned (8))) const volatile unsigned short _const_16    [4] = FOUR(16);
__attribute__ ((aligned (8))) const volatile unsigned short _const_ff    [4] = FOUR(-1);

__attribute__ ((aligned (8))) const volatile unsigned short _const_crvcrv709[4] = FOUR(RZ(CRV709));
__attribute__ ((aligned (8))) const volatile unsigned short _const_cbucbu709[4] = FOUR(RZ(CBU709));
__attribute__ ((aligned (8))) const volatile unsigned short _const_cgucgu709[4] = FOUR(RZ(CGU709));
__attribute__ ((aligned (8))) const volatile unsigned short _const_cgvcgv709[4] = FOUR(RZ(CGV709));

#define CONST_CRVCRV *_const_crvcrv
static void __attribute__ ((hot)) internal_triangle_recurse(TriangleNode *node, TriangleInfo *tlist, RecursionInfo *info, int *cache, int hash) {
  // __builtin_prefetch(&((TriangleInfo*) ROUND16(node + 1))->a);
  // printf("early prefetch %p\n", &((TriangleInfo*) ROUND16(node+1))->a);
  if (node->children_length) {
    if (info->closest_res) {
      float fs;
      for (int i = 0; i < node->children_length; ++i) {
        if (rayHits(&node->children_ptr[i]->aabb, &info->pos, &fs) && fs < info->closest)
          internal_triangle_recurse(node->children_ptr[i], tlist, info, cache, hash);
      }
    } else {
      for (int i = 0; i < node->children_length; ++i) {
        if (rayHits(&node->children_ptr[i]->aabb, &info->pos, 0)) internal_triangle_recurse(node->children_ptr[i], tlist, info, cache, hash);
      }
    }
  } else {
    RecursionInfo rin = *info;
    for (int i = 0; i < node->length; ++i) {
      int id = node->info[i];
      // printf("%i: hash %i, compare %i: outcome %i\n", id, hash, cache[id].hash, cache[id].outcome);
      if (cache[id] == hash) continue; // already considered
      cache[id] = hash;
      TriangleInfo *ti = &tlist[id];
      v4sf v_1 = V4SF(ti->n) * (V4SF(rin.pos) - V4SF(ti->a));
      v4sf v_2 = V4SF(rin.dir) * V4SF(ti->n);
      // float dist = - XSUM(v_1) / XSUM(v_2);
      // if (dist < 0) continue;
      float f_1 = XSUM(v_1), f_2 = XSUM(v_2);
      if (f_2 == 0) continue;
      
      // float dist = - f_1 / f_2;
      // if (dist < 0 || dist > rin.closest) continue;
      if (- f_1 * f_2 < 0) continue;
      float dist = - f_1 / f_2;
      
      if (UNLIKELY(dist > rin.closest)) continue;
      
      v4sf p = V4SF(rin.pos) + (v4sf) FOUR(dist) * V4SF(rin.dir);
      v4sf v0 = V4SF(ti->c) - V4SF(ti->a);
      v4sf v1 = V4SF(ti->b) - V4SF(ti->a);
      v4sf v2 = p           - V4SF(ti->a);
      v4sf v00 = v0 * v0, v01 = v0 * v1, v11 = v1 * v1, v02 = v0 * v2, v12 = v1 * v2;
      float dot00 = XSUM(v00), dot01 = XSUM(v01), dot11 = XSUM(v11);
      float dot02 = XSUM(v02), dot12 = XSUM(v12);
      float invDenom = ti->invDenom;
      
      v4sf bogus;
      v4sf temp = __builtin_ia32_hsubps((v4sf) { dot11, dot01, dot00, dot01 } * (v4sf) { dot02, dot12, dot12, dot02 }, bogus);
      float u = X(temp) * invDenom;
      float v = Y(temp) * invDenom;
      // float u = X(__builtin_ia32_hsubps((v4sf) { dot11, dot01 } * (v4sf) { dot02, dot12 }, bogus)) * invDenom;
      // float v = X(__builtin_ia32_hsubps((v4sf) { dot00, dot01 } * (v4sf) { dot12, dot02 }, bogus)) * invDenom;
      // float u = (dot11 * dot02 - dot01 * dot12) * invDenom;
      // float v = (dot00 * dot12 - dot01 * dot02) * invDenom;
      if (UNLIKELY((u > 0) && (v > 0) && (u+v < 1))) {
        rin.closest_res = ti;
        rin.closest = dist;
        rin.uv = (vec2f) {u, v};
      }
    }
    *info = rin;
  }
// IMPORTANT: use -mstackrealign!
void ALIGNED fast_sphere_process(
  struct Ray **rayplanes, struct Result **resplanes,
  struct VMState *states, int numstates,
  vec3f center, float rsq,
  void* self
) {
// #define PREFETCH_HARD(X, READ, LOCALITY) \
//   __builtin_prefetch(X, READ, LOCALITY); \
//   __asm__ volatile ("" : : : "memory"); // force break
#define PREFETCH_HARD(X, READ, LOCALITY) \
  __builtin_prefetch(X, READ, LOCALITY); // volatile makes no difference
  for (int i = 0; i < numstates; ++i) {
    struct VMState* sp = states++;
    PREFETCH_HARD(sp, 1, 3);
    
    if (sp->stream_ptr[0] != self) continue;
    struct Ray* RAY = rayplanes[sp->rayid - 1] + i;
    
    sp->stream_ptr ++; sp->stream_len --;
    struct Result *res = resplanes[sp->resid ++] + i;
    PREFETCH_HARD(RAY, 0, 0);
    
    // pos = ray.pos - center; pretranslate so we can pretend we're a sphere around (0, 0, 0)
    v4sf pos = V4SF(RAY->pos) - V4SF(center);
    v4sf dir = V4SF(RAY->dir);
    
    // algo 1
    float k;
    {
      v4sf dp = dir * pos, dd = dir * dir;
      k = -XSUM(dp) / XSUM(dd);
    }
    v4sf p = pos + dir * (v4sf) FOUR(k);
    p *= p;
    float ps = XSUM(p);
    if (ps > rsq) { res->success = 0; continue; }
    
    float sq = sqrtf (rsq - ps);
    float k1 = k + sq, k2 = k - sq;
    // algo 2
    /*
    // prod = 2 * pos * dir
    v4sf prod = (v4sf) FOUR(2) * pos * dir;
    // p = sum(2 * pos * dir)
    float p = XSUM(prod);

    pos *= pos;
    float pos_sum = XSUM(pos);
    // pos_sum = sum(pos * pos)
    
    float inside = (p*p / 4 + rsq) - pos_sum;
    if (inside < 0) { res->success = 0; continue; }
    
    float sq = sqrtf(inside),
      k = - p / 2,
      k1 = k + sq,
      k2 = k - sq;
    */
    if (k1 < 0) { res->success = 0; continue; }
    
    res->success = 1;
    
    if (k2 > 0) res->distance = k2;
    else res->distance = k1;
    
    // col = (1, 1, 1)
    res->emissive_col = (vec3f){0,0,0,0};
    res->col = (vec3f){1,1,1,1};
    
    v4sf normal = V4SF(RAY->pos) + V4SF(RAY->dir) * (v4sf) FOUR(res->distance);
    // normal = (ray.pos + distance * ray.dir) - center
    normal = normal - V4SF(center);
    // normalize normal
    v4sf nprod = normal * normal;
    float nprodf = 1.0f / sqrtf(*(float*) &nprod + *((float*) &nprod + 1) + *((float*) &nprod + 2));
    // nprod = __builtin_ia32_rsqrtss(SUM(nprod));
    // normal *= __builtin_ia32_shufps(nprod, nprod, 0x0);
    normal *= (v4sf) FOUR(nprodf);
    V4SF(res->normal) = normal;
  }
}
Beispiel #10
0
#define CGV    53280
#define YMUL   76283
#define OFF    32768
#define BITRES 16

/* calculation float resolution in bits */
/* ie RES = 6 is 10.6 fixed point */
/*    RES = 8 is 8.8 fixed point */
/*    RES = 4 is 12.4 fixed point */
/* NB: going above 6 will lead to overflow... :( */
#define RES    6

#define RZ(i)  (i >> (BITRES - RES))
#define FOUR(i) {i, i, i, i}

__aligned(8) const volatile unsigned short _const_crvcrv[4] = FOUR(RZ(CRV));
__aligned(8) const volatile unsigned short _const_cbucbu[4] = FOUR(RZ(CBU));
__aligned(8) const volatile unsigned short _const_cgucgu[4] = FOUR(RZ(CGU));
__aligned(8) const volatile unsigned short _const_cgvcgv[4] = FOUR(RZ(CGV));
__aligned(8) const volatile unsigned short _const_ymul  [4] = FOUR(RZ(YMUL));
__aligned(8) const volatile unsigned short _const_128   [4] = FOUR(128);
__aligned(8) const volatile unsigned short _const_32    [4] = FOUR(RZ(OFF));
__aligned(8) const volatile unsigned short _const_16    [4] = FOUR(16);

#define CONST_CRVCRV *_const_crvcrv
#define CONST_CBUCBU *_const_cbucbu
#define CONST_CGUCGU *_const_cgucgu
#define CONST_CGVCGV *_const_cgvcgv
#define CONST_YMUL   *_const_ymul
#define CONST_128    *_const_128
#define CONST_32     *_const_32