/*
 * Builds, for every hit, the ray that leads from the hit point to a light.
 *
 * Only VM states whose next stream entry is `self` are touched; that entry
 * is consumed. If the state's current result (resplanes[resid - 1][i]) is a
 * hit, a new ray is pushed (rayid is incremented): it starts at 99.9% of the
 * hit distance along the current ray and points, normalized, at *lightpos.
 * If the result is a miss, the stream is cut down to its final entry.
 *
 * lightpos is read as a whole v4sf.
 */
void ALIGNED fast_light_process(
  struct Ray **rayplanes, struct Result **resplanes, struct VMState *states, int numstates,
  vec3f* lightpos, void* self
) {
  for (int idx = 0; idx < numstates; ++idx) {
    struct VMState *state = &states[idx];
    if (state->stream_ptr[0] != self) continue;

    // consume our own stream entry
    state->stream_ptr ++;
    state->stream_len --;

    struct Result *hit = resplanes[state->resid - 1] + idx;
    if (!hit->success) {
      // miss: leave only the last stream entry to be executed
      state->stream_ptr += state->stream_len - 1;
      state->stream_len = 1;
      continue;
    }

    // start point: slightly before the hit, measured along the incoming ray
    struct Ray *src = rayplanes[state->rayid - 1] + idx;
    v4sf origin = V4SF(src->pos) + V4SF(src->dir) * (v4sf) FOUR(hit->distance * 0.999);

    // push the new ray
    state->rayid ++;
    struct Ray *dst = rayplanes[state->rayid - 1] + idx;

    // direction: from the start point to the light, scaled by 1 / length
    v4sf to_light = *(v4sf*) lightpos - origin;
    v4sf to_light_sq = to_light * to_light;
    float inv_len = 1 / sqrtf(XSUM(to_light_sq));
    to_light *= (v4sf) FOUR(inv_len);

    V4SF(dst->pos) = origin;
    V4SF(dst->dir) = to_light;
  }
}
/*
 * Checkerboard colouring step.
 *
 * Only VM states whose next stream entry is `self` are touched; that entry
 * is consumed. For a successful result the hit position is computed from the
 * current ray (pos + distance * dir), emissive_col is cleared, and col is set
 * to `a` or `b` depending on the parity of the unit cell that contains the
 * hit position (floor of each coordinate). Misses are left untouched.
 *
 * If any coordinate cannot be converted to int safely (magnitude >= 2^31 or
 * NaN) the average of `a` and `b` is used instead.
 */
void ALIGNED fast_checker_process(
  struct Ray **rayplanes, struct Result **resplanes, struct VMState *states, int numstates,
  vec3f a, vec3f b, void* self
) {
  // 2^31 exactly. (float) INT_MAX rounds up to this same value, which is why
  // a plain "> INT_MAX" test lets 2^31 itself through to the int cast.
  const float int_limit = -(float) INT_MIN;

  for (int i = 0; i < numstates; ++i) {
    struct VMState* sp = states++;
    if (sp->stream_ptr[0] != self) continue;
    sp->stream_ptr ++; sp->stream_len --;

    struct Result *res = resplanes[sp->resid - 1] + i;
    struct Ray *ray = rayplanes[sp->rayid - 1] + i;
    if (!res->success) continue;

    v4sf hitpos = V4SF(ray->pos) + (v4sf) FOUR(res->distance) * V4SF(ray->dir);
    vec3f hitposv = *(vec3f*) &hitpos;
    res->emissive_col = (vec3f){0,0,0,0};

    // The float -> int casts below would overflow (undefined behaviour) for
    // these values. The negated "<" also catches NaN.
    if (!(fabsf(hitposv.x) < int_limit)
     || !(fabsf(hitposv.y) < int_limit)
     || !(fabsf(hitposv.z) < int_limit))
    {
      v4sf temp = (V4SF(b) + V4SF(a)) / (v4sf) FOUR(2);
      res->col = *(vec3f*) &temp;
      continue;
    }

    // floor(): the cast truncates towards zero, so step down by one only
    // when truncation actually rounded a negative value up.
    int ix = (int) hitposv.x, iy = (int) hitposv.y, iz = (int) hitposv.z;
    if (hitposv.x < (float) ix) ix --;
    if (hitposv.y < (float) iy) iy --;
    if (hitposv.z < (float) iz) iz --;

    if ((ix & 1) ^ (iy & 1) ^ (iz & 1)) res->col = b;
    else res->col = a;
  }
}
/*
 * Projects the direction of *rayp onto a dw x dh pixel grid.
 *
 * The direction is divided by its own z component, then x and y are scaled
 * by the file-level `fov` (x additionally by the dw/dh aspect ratio) and
 * mapped so that 0 lands on the centre of the grid. Results are written to
 * *xp and *yp; y grows downwards.
 */
void ALIGNED ray_to_coordsf(int dw, int dh, struct Ray *rayp, float *xp, float *yp) {
  const float aspect = dw * 1.0f / dh;
  const double half_w = dw / 2.0;
  const double half_h = dh / 2.0;

  // rescale so that z == 1 ("denormalize")
  v4sf d = *(v4sf*) &rayp->dir;
  const float depth = Z(d);
  d /= (v4sf) FOUR(depth);

  *xp = (1.0f + (X(d) / (aspect * fov))) * half_w;
  *yp = (1.0f - (Y(d) / fov)) * half_h;
}
/*
 * Slab test of a ray against an axis-aligned box.
 *
 * abp   - two consecutive vec3f: the two corners of the box
 * p_ray - two consecutive vec3f: ray origin, then ray direction
 * dist  - optional; receives the entry distance whenever the function gets
 *         past the early-out below (even if it then returns 0)
 *
 * Returns nonzero if the ray hits the box, 0 otherwise.
 *
 * The sign bits of the direction are XOR-ed into the box corners, the origin
 * and the direction itself. That mirrors the problem so the direction is
 * non-negative on every axis, and one code path covers all orientations.
 */
static int internal_rayHitsAABB(vec3f *abp, vec3f *p_ray, float *dist) {
  vec3f *ap = &abp[0], *bp = &abp[1];
  vec3f *p_pos = &p_ray[0], *p_dir = &p_ray[1];

  // Sign-bit mask. Built from an unsigned shift because shifting a signed 1
  // into the sign bit is not defined by the C standard.
  v4si mask = (v4si) FOUR((int) (1U << 31));
  v4si signs = mask & *(v4si*)p_dir;

  v4sf a = (v4sf) (V4SI(*ap) ^ signs);
  v4sf b = (v4sf) (V4SI(*bp) ^ signs);
  v4sf pos = (v4sf) (V4SI(*p_pos) ^ signs);
  v4sf dir = (v4sf) (V4SI(*p_dir) ^ signs);
  v4sf b_ = b;

  // pretend ray starts at origin: -pos
  b = __builtin_ia32_maxps(a, b) - pos;
  // Far corner behind the origin on any axis: ray is pointed away from aabb.
  // (Tested on the raw sign bits, so a component of -0.0 also counts.)
  v4si bsign = mask & *(v4si*)&b;
  if (IX(bsign) | IY(bsign) | IZ(bsign)) return 0;
  a = __builtin_ia32_minps(a, b_) - pos;

  // From here on a and b are turned into per-axis distances (a / dir, b / dir).
  vec3f *vdir = (vec3f*) &dir;
  vec3f *dista = (vec3f*) &a, *distb = (vec3f*) &b;

  if (LIKELY(vdir->x != 0 && vdir->y != 0 && vdir->z != 0)) {
    // Force the unused 4th lane of dir to 1 so the vector divide is well
    // defined there too.
    *(v4si*) &dir &= (v4si) {-1, -1, -1, 0};
    dir += (v4sf) {0, 0, 0, 1};
    a /= dir; b /= dir;
  } else {
    // At least one direction component is zero: divide axis by axis, and on
    // a zero axis use +-infinity with the sign of the numerator instead.
    if (LIKELY(vdir->x != 0)) { dista->x /= vdir->x; distb->x /= vdir->x; }
    else { dista->x = copysignf(INFINITY, dista->x); distb->x = copysignf(INFINITY, distb->x); }
    if (LIKELY(vdir->y != 0)) { dista->y /= vdir->y; distb->y /= vdir->y; }
    else { dista->y = copysignf(INFINITY, dista->y); distb->y = copysignf(INFINITY, distb->y); }
    if (LIKELY(vdir->z != 0)) { dista->z /= vdir->z; distb->z /= vdir->z; }
    else { dista->z = copysignf(INFINITY, dista->z); distb->z = copysignf(INFINITY, distb->z); }
  }

  // The ray is inside the box between the latest entry and the earliest exit.
  float entry = fmaxf(dista->x, fmaxf(dista->y, dista->z));
  float exit_dist = fminf(distb->x, fminf(distb->y, distb->z));
  if (dist) { *dist = entry; }
  return entry <= exit_dist;
}
/*
 * Builds the ray for position (x, y) on a dw x dh pixel grid.
 *
 * The grid centre maps to direction (0, 0, 1); x and y offsets are scaled by
 * the file-level `fov` (x additionally by the dw/dh aspect ratio), and y
 * grows downwards. The direction is normalized over its x, y, z lanes.
 * The ray origin is always (0, 2, 0).
 */
void ALIGNED coordsf_to_ray(int dw, int dh, float x, float y, struct Ray *rayp) {
  const float aspect = dw * 1.0f / dh;
  const double half_w = dw / 2.0;
  const double half_h = dh / 2.0;

  v4sf unnorm = (v4sf) {aspect * fov * (x / half_w - 1.0), fov * (1.0 - y / half_h), 1.0, 0.0};

  // squared length from the first three lanes only
  v4sf squared = unnorm * unnorm;
  const float *lane = (const float*) &squared;
  const float inv_len = 1.0f / sqrtf(lane[0] + lane[1] + lane[2]);

  *(v4sf*) &rayp->pos = (v4sf) {0,2,0,0};
  *(v4sf*) &rayp->dir = unnorm * (v4sf) FOUR(inv_len);
}
void fast_scale_process( struct Ray **rayplanes, struct Result **resplanes, struct VMState *states, int numstates, float factor, void* self ) { for (int i = 0; i < numstates; ++i) { struct VMState* sp = states++; if (sp->stream_ptr[0] != self) continue; sp->stream_ptr ++; sp->stream_len --; struct Ray *RAY = rayplanes[sp->rayid - 1] + i; sp->rayid ++; struct Ray *RAY2 = rayplanes[sp->rayid - 1] + i; V4SF(RAY2->pos) = V4SF(RAY->pos) * (v4sf) FOUR(1/factor); V4SF(RAY2->dir) = V4SF(RAY->dir); } }
#define CGU709 13959 #define CGV709 34996 /* calculation float resolution in bits */ /* ie RES = 6 is 10.6 fixed point */ /* RES = 8 is 8.8 fixed point */ /* RES = 4 is 12.4 fixed point */ /* NB: going above 6 will lead to overflow... :( */ #define RES 6 #define RZ(i) (i >> (BITRES - RES)) #define FOUR(i) {i, i, i, i} #ifdef BUILD_MMX __attribute__ ((aligned (8))) const volatile unsigned short _const_crvcrv[4] = FOUR(RZ(CRV)); __attribute__ ((aligned (8))) const volatile unsigned short _const_cbucbu[4] = FOUR(RZ(CBU)); __attribute__ ((aligned (8))) const volatile unsigned short _const_cgucgu[4] = FOUR(RZ(CGU)); __attribute__ ((aligned (8))) const volatile unsigned short _const_cgvcgv[4] = FOUR(RZ(CGV)); __attribute__ ((aligned (8))) const volatile unsigned short _const_ymul [4] = FOUR(RZ(YMUL)); __attribute__ ((aligned (8))) const volatile unsigned short _const_128 [4] = FOUR(128); __attribute__ ((aligned (8))) const volatile unsigned short _const_32 [4] = FOUR(RZ(OFF)); __attribute__ ((aligned (8))) const volatile unsigned short _const_16 [4] = FOUR(16); __attribute__ ((aligned (8))) const volatile unsigned short _const_ff [4] = FOUR(-1); __attribute__ ((aligned (8))) const volatile unsigned short _const_crvcrv709[4] = FOUR(RZ(CRV709)); __attribute__ ((aligned (8))) const volatile unsigned short _const_cbucbu709[4] = FOUR(RZ(CBU709)); __attribute__ ((aligned (8))) const volatile unsigned short _const_cgucgu709[4] = FOUR(RZ(CGU709)); __attribute__ ((aligned (8))) const volatile unsigned short _const_cgvcgv709[4] = FOUR(RZ(CGV709)); #define CONST_CRVCRV *_const_crvcrv
/*
 * Recursive closest-hit search through a tree of triangles.
 *
 * node  - current tree node; inner nodes have children_length != 0, leaves
 *         list triangle indices in info[0 .. length)
 * tlist - triangle array indexed by those indices
 * info  - in/out: ray (pos, dir) plus the best hit so far
 *         (closest_res, closest, uv)
 * cache - one int per triangle; holds the `hash` of the last traversal that
 *         already tested it, so a triangle listed in several leaves is
 *         tested only once per traversal
 * hash  - value identifying the current traversal in `cache`
 *
 * Inner nodes recurse into every child whose bounding box the ray hits.
 * If a hit was already known when the node was entered, children whose box
 * entry distance is not below the current closest distance are skipped.
 */
static void __attribute__ ((hot)) internal_triangle_recurse(TriangleNode *node, TriangleInfo *tlist, RecursionInfo *info, int *cache, int hash) {
  if (node->children_length) {
    if (info->closest_res) {
      float fs;
      for (int i = 0; i < node->children_length; ++i) {
        if (rayHits(&node->children_ptr[i]->aabb, &info->pos, &fs) && fs < info->closest)
          internal_triangle_recurse(node->children_ptr[i], tlist, info, cache, hash);
      }
    } else {
      for (int i = 0; i < node->children_length; ++i) {
        if (rayHits(&node->children_ptr[i]->aabb, &info->pos, 0))
          internal_triangle_recurse(node->children_ptr[i], tlist, info, cache, hash);
      }
    }
  } else {
    // work on a local copy; written back once after the loop
    RecursionInfo rin = *info;
    for (int i = 0; i < node->length; ++i) {
      int id = node->info[i];
      if (cache[id] == hash) continue; // already considered
      cache[id] = hash;

      TriangleInfo *ti = &tlist[id];

      // Ray/plane intersection: dist = -(n . (pos - a)) / (n . dir)
      v4sf v_1 = V4SF(ti->n) * (V4SF(rin.pos) - V4SF(ti->a));
      v4sf v_2 = V4SF(rin.dir) * V4SF(ti->n);
      float f_1 = XSUM(v_1), f_2 = XSUM(v_2);
      if (f_2 == 0) continue;
      // same as "dist < 0", but decided before paying for the division
      if (- f_1 * f_2 < 0) continue;
      float dist = - f_1 / f_2;
      if (UNLIKELY(dist > rin.closest)) continue;

      // Barycentric coordinates of the plane hit point p relative to a.
      v4sf p = V4SF(rin.pos) + (v4sf) FOUR(dist) * V4SF(rin.dir);
      v4sf v0 = V4SF(ti->c) - V4SF(ti->a);
      v4sf v1 = V4SF(ti->b) - V4SF(ti->a);
      v4sf v2 = p - V4SF(ti->a);
      v4sf v00 = v0 * v0, v01 = v0 * v1, v11 = v1 * v1, v02 = v0 * v2, v12 = v1 * v2;
      float dot00 = XSUM(v00), dot01 = XSUM(v01), dot11 = XSUM(v11);
      float dot02 = XSUM(v02), dot12 = XSUM(v12);
      float invDenom = ti->invDenom;

      // u = (dot11 * dot02 - dot01 * dot12) * invDenom
      // v = (dot00 * dot12 - dot01 * dot02) * invDenom
      // One multiply gives all four products; the horizontal subtract then
      // puts the two differences into lanes 0 and 1. Only those lanes are
      // used, so the product is passed as both operands rather than reading
      // an uninitialized vector for the second one.
      v4sf products = (v4sf) { dot11, dot01, dot00, dot01 } * (v4sf) { dot02, dot12, dot12, dot02 };
      v4sf temp = __builtin_ia32_hsubps(products, products);
      float u = X(temp) * invDenom;
      float v = Y(temp) * invDenom;

      // strictly inside the triangle
      if (UNLIKELY((u > 0) && (v > 0) && (u+v < 1))) {
        rin.closest_res = ti;
        rin.closest = dist;
        rin.uv = (vec2f) {u, v};
      }
    }
    *info = rin;
  }
}
// IMPORTANT: use -mstackrealign! void ALIGNED fast_sphere_process( struct Ray **rayplanes, struct Result **resplanes, struct VMState *states, int numstates, vec3f center, float rsq, void* self ) { // #define PREFETCH_HARD(X, READ, LOCALITY) \ // __builtin_prefetch(X, READ, LOCALITY); \ // __asm__ volatile ("" : : : "memory"); // force break #define PREFETCH_HARD(X, READ, LOCALITY) \ __builtin_prefetch(X, READ, LOCALITY); // volatile makes no difference for (int i = 0; i < numstates; ++i) { struct VMState* sp = states++; PREFETCH_HARD(sp, 1, 3); if (sp->stream_ptr[0] != self) continue; struct Ray* RAY = rayplanes[sp->rayid - 1] + i; sp->stream_ptr ++; sp->stream_len --; struct Result *res = resplanes[sp->resid ++] + i; PREFETCH_HARD(RAY, 0, 0); // pos = ray.pos - center; pretranslate so we can pretend we're a sphere around (0, 0, 0) v4sf pos = V4SF(RAY->pos) - V4SF(center); v4sf dir = V4SF(RAY->dir); // algo 1 float k; { v4sf dp = dir * pos, dd = dir * dir; k = -XSUM(dp) / XSUM(dd); } v4sf p = pos + dir * (v4sf) FOUR(k); p *= p; float ps = XSUM(p); if (ps > rsq) { res->success = 0; continue; } float sq = sqrtf (rsq - ps); float k1 = k + sq, k2 = k - sq; // algo 2 /* // prod = 2 * pos * dir v4sf prod = (v4sf) FOUR(2) * pos * dir; // p = sum(2 * pos * dir) float p = XSUM(prod); pos *= pos; float pos_sum = XSUM(pos); // pos_sum = sum(pos * pos) float inside = (p*p / 4 + rsq) - pos_sum; if (inside < 0) { res->success = 0; continue; } float sq = sqrtf(inside), k = - p / 2, k1 = k + sq, k2 = k - sq; */ if (k1 < 0) { res->success = 0; continue; } res->success = 1; if (k2 > 0) res->distance = k2; else res->distance = k1; // col = (1, 1, 1) res->emissive_col = (vec3f){0,0,0,0}; res->col = (vec3f){1,1,1,1}; v4sf normal = V4SF(RAY->pos) + V4SF(RAY->dir) * (v4sf) FOUR(res->distance); // normal = (ray.pos + distance * ray.dir) - center normal = normal - V4SF(center); // normalize normal v4sf nprod = normal * normal; float nprodf = 1.0f / sqrtf(*(float*) &nprod + *((float*) 
&nprod + 1) + *((float*) &nprod + 2)); // nprod = __builtin_ia32_rsqrtss(SUM(nprod)); // normal *= __builtin_ia32_shufps(nprod, nprod, 0x0); normal *= (v4sf) FOUR(nprodf); V4SF(res->normal) = normal; } }
#define CGV 53280 #define YMUL 76283 #define OFF 32768 #define BITRES 16 /* calculation float resolution in bits */ /* ie RES = 6 is 10.6 fixed point */ /* RES = 8 is 8.8 fixed point */ /* RES = 4 is 12.4 fixed point */ /* NB: going above 6 will lead to overflow... :( */ #define RES 6 #define RZ(i) (i >> (BITRES - RES)) #define FOUR(i) {i, i, i, i} __aligned(8) const volatile unsigned short _const_crvcrv[4] = FOUR(RZ(CRV)); __aligned(8) const volatile unsigned short _const_cbucbu[4] = FOUR(RZ(CBU)); __aligned(8) const volatile unsigned short _const_cgucgu[4] = FOUR(RZ(CGU)); __aligned(8) const volatile unsigned short _const_cgvcgv[4] = FOUR(RZ(CGV)); __aligned(8) const volatile unsigned short _const_ymul [4] = FOUR(RZ(YMUL)); __aligned(8) const volatile unsigned short _const_128 [4] = FOUR(128); __aligned(8) const volatile unsigned short _const_32 [4] = FOUR(RZ(OFF)); __aligned(8) const volatile unsigned short _const_16 [4] = FOUR(16); #define CONST_CRVCRV *_const_crvcrv #define CONST_CBUCBU *_const_cbucbu #define CONST_CGUCGU *_const_cgucgu #define CONST_CGVCGV *_const_cgvcgv #define CONST_YMUL *_const_ymul #define CONST_128 *_const_128 #define CONST_32 *_const_32