/*! Occlusion query for a packet of 16 rays, traced one ray at a time.
 *
 *  For every index that bsf64() reports for the \a valid_i mask, the ray
 *  is converted from the SOA packet layout into per-ray vectors and handed
 *  to BVH4AOSTriangle1Intersector1::occluded1().
 *
 *  \param This     intersector instance; only This->bvh is read
 *  \param ray      ray packet (org, dir, tnear, tfar are read)
 *  \param valid_i  mask of rays to process
 *  \return         \a valid_i restricted to the rays for which occluded1()
 *                  returned true
 */
__mmask BVH4AOSTriangle1Intersector16Single::occluded(const BVH4AOSTriangle1Intersector16Single* This, Ray16& ray, const __mmask valid_i)
{
  /* pointers to node array and triangle array */
  const mic_m valid = valid_i;
  const BVH4AOS* bvh = This->bvh;
  const Node*      const nodes     = (const Node*     ) bvh->nodePtr();
  const Triangle1* const triangles = (const Triangle1*) bvh->triPtr();

  /* lanes start at -1 and are set to 0 once the ray is found occluded */
  mic_i not_occluded = mic_i::minus_one();

  /* The reciprocal direction depends only on ray.dir, which nothing in the
     loop below modifies (occluded1 is not given the packet), so compute it
     once for the whole packet instead of once per active ray. */
  const mic3f ray_rdir = rcp_safe(ray.dir);

  long rayIndex = -1;
  while ((rayIndex = bsf64(rayIndex,valid)) != MIC_NO_BIT_SET_64)
  {
    // === TODO: precompute SOAtoAOS transformation, load with 4x broadcast
    const mic_f org_xyz      = SOAtoAOS_4f(rayIndex,ray.org.x,ray.org.y,ray.org.z);
    const mic_f dir_xyz      = SOAtoAOS_4f(rayIndex,ray.dir.x,ray.dir.y,ray.dir.z);
    const mic_f rdir_xyz     = SOAtoAOS_4f(rayIndex,ray_rdir.x,ray_rdir.y,ray_rdir.z);
    const mic_f org_rdir_xyz = org_xyz * rdir_xyz;
    const mic_f min_dist_xyz = upconv1f(&ray.tnear[rayIndex]);
    const mic_f max_dist_xyz = upconv1f(&ray.tfar[rayIndex]);

    if (BVH4AOSTriangle1Intersector1::occluded1(bvh,nodes,triangles,bvh->root,rayIndex,org_xyz,dir_xyz,rdir_xyz,org_rdir_xyz,min_dist_xyz,max_dist_xyz))
      not_occluded[rayIndex] = 0;
  }

  /* a lane equal to zero marks an occluded ray; report only requested rays */
  return valid & eq(not_occluded,mic_i::zero());
}
void BVH4AOSTriangle1Intersector16Single::intersect(const BVH4AOSTriangle1Intersector16Single* This, Ray16& ray, const __mmask valid_i) { /* pointers to node array and triangle array */ const mic_m valid = valid_i; const BVH4AOS* bvh = This->bvh; const Node* const nodes = (const Node* ) bvh->nodePtr(); const Triangle1 *const triangles = (const Triangle1*) bvh->triPtr(); long rayIndex = -1; while((rayIndex = bsf64(rayIndex,valid)) != MIC_NO_BIT_SET_64) { const mic3f ray_rdir = rcp_safe(ray.dir); const mic_f org_xyz = SOAtoAOS_4f(rayIndex,ray.org.x,ray.org.y,ray.org.z); const mic_f dir_xyz = SOAtoAOS_4f(rayIndex,ray.dir.x,ray.dir.y,ray.dir.z); const mic_f rdir_xyz = SOAtoAOS_4f(rayIndex,ray_rdir.x,ray_rdir.y,ray_rdir.z); const mic_f org_rdir_xyz = org_xyz * rdir_xyz; const mic_f min_dist_xyz = upconv1f(&ray.tnear[rayIndex]); const mic_f max_dist_xyz = upconv1f(&ray.tfar[rayIndex]); BVH4AOSTriangle1Intersector1::intersect1(bvh,nodes,triangles,bvh->root,rayIndex,org_xyz,dir_xyz,rdir_xyz,org_rdir_xyz,min_dist_xyz,max_dist_xyz,ray); } }
int mtrropt(u64t wc_addr, u64t wc_len, u32t *memlimit) { u32t reg; int ii, sv4idx = 0; mtrrentry save4[16]; memset(&save4,0,sizeof(save4)); *memlimit = 0; if (is_included(wc_addr,wc_len)<0 && is_intersection(wc_addr, wc_len)<0 && is_regavail(®)) { mtrr[reg].start = wc_addr; mtrr[reg].len = wc_len; mtrr[reg].cache = MTRRF_WC; mtrr[reg].on = 1; return 0; } // video memory in not in 4th GB if (wc_addr<_3GbLL || wc_addr+wc_len>_4GbLL) return OPTERR_VIDMEM3GB; /* turn off previous write combine on the same memory, but leave this block to catch low UC border successfully */ ii = is_include(wc_addr, wc_len); if (ii>=0 && mtrr[ii].cache==MTRRF_WC) mtrr[ii].cache=MTRRF_UC; // only WB and UC allowed in first 4Gb for (ii=0; ii<regs; ii++) if (mtrr[ii].on && mtrr[ii].cache!=MTRRF_UC && mtrr[ii].cache!=MTRRF_WB && mtrr[ii].start<_4GbLL) return OPTERR_UNKCT; // is block intersected with someone? ii = is_intersection(wc_addr, wc_len); if (ii>=0) return OPTERR_INTERSECT; // remove/truncate all above 4Gb (but save it) for (ii=0; ii<regs; ii++) if (mtrr[ii].on) if (mtrr[ii].start<_4GbLL && mtrr[ii].start+mtrr[ii].len>_4GbLL) { u64t newlen = _4GbLL - mtrr[ii].start, remain = mtrr[ii].len - newlen; if (!is_power2(newlen)) return OPTERR_SPLIT4GB; mtrr[ii].len = newlen; // save block if (is_power2(remain)) { save4[sv4idx].start = _4GbLL; save4[sv4idx].len = remain; save4[sv4idx].cache = mtrr[ii].cache; sv4idx++; } else if (is_power2(remain/3)) { } } else if (mtrr[ii].start>=_4GbLL || mtrr[ii].start+mtrr[ii].len>_4GbLL) { save4[sv4idx].start = mtrr[ii].start; save4[sv4idx].len = mtrr[ii].len; save4[sv4idx].cache = mtrr[ii].cache; sv4idx++; clearreg(ii); } u64t wbend = 0, ucstart = FFFF64; // searching for upper WB border for (ii=0; ii<regs; ii++) if (mtrr[ii].on) if (mtrr[ii].cache==MTRRF_WB) if (mtrr[ii].start+mtrr[ii].len > wbend) wbend = mtrr[ii].start+mtrr[ii].len; // searching for lower UC border (but ignore small blocks) for (ii=0; ii<regs; ii++) if (mtrr[ii].on) if 
(mtrr[ii].cache==MTRRF_UC) { int pwr = bsf64(mtrr[ii].len); if (pwr>=27) { if (ucstart>mtrr[ii].start) ucstart = mtrr[ii].start; clearreg(ii); } } // pass #2 - removing small blocks above selected border for (ii=0; ii<regs; ii++) if (mtrr[ii].on) if (mtrr[ii].cache==MTRRF_UC && ucstart<=mtrr[ii].start) clearreg(ii); // if no UC entries - use the end of WB as border if (ucstart>wbend) ucstart = wbend; // this can occur on small video memory size (<128Mb) if (wc_addr<ucstart) return OPTERR_BELOWUC; // build new WB list if (ucstart<wbend) { if (ucstart<_1GbLL) return OPTERR_LOWUC; for (ii=0; ii<regs; ii++) if (mtrr[ii].on) if (mtrr[ii].cache==MTRRF_WB) clearreg(ii); int regsfree = regsavail() - sv4idx - 1; log_it(2, "regs free: %i \n", regsfree); // force 3 registers (some memory above 4Gb can be lost) if (regsfree<3) regsfree = 3; // split memory to list u64t nextpos = 0; ii = 0; for (u64t size=_2GbLL; size>=_64MbLL; size>>=1) { if (ucstart>=size) { if (!is_regavail(®)) return OPTERR_NOREG; mtrr[reg].start = nextpos; nextpos += (mtrr[reg].len = size); mtrr[reg].cache = MTRRF_WB; mtrr[reg].on = 1; ucstart -= size; // use only 3 mtrr regs if (++ii==regsfree) break; } } // save memlimit value *memlimit = nextpos>>20; /** and again removing small blocks above selected border... 
splitted blocks sum can be smaller than previously selected UC border and some blocks can be cleared here */ for (ii=0; ii<regs; ii++) if (mtrr[ii].on) if (mtrr[ii].cache==MTRRF_UC && nextpos<=mtrr[ii].start) clearreg(ii); } // final check if (is_included(wc_addr,wc_len)>=0 || is_intersection(wc_addr,wc_len)>=0 || is_include(wc_addr,wc_len)>=0) return OPTERR_OPTERR; // add entry if (is_regavail(®)) { mtrr[reg].start = wc_addr; mtrr[reg].len = wc_len; mtrr[reg].cache = MTRRF_WC; mtrr[reg].on = 1; } // restore some of above 4Gb memory blocks if (sv4idx && regsavail()>0) { for (ii=0; ii<sv4idx; ii++) { if (!is_regavail(®)) break; mtrr[reg].start = save4[ii].start; mtrr[reg].len = save4[ii].len; mtrr[reg].cache = save4[ii].cache; mtrr[reg].on = 1; } // check lost items for included UC entries while (ii<sv4idx) { if (mtrr[ii].cache==MTRRF_UC) { int idx = is_included(save4[ii].start,save4[ii].len); if (idx<0) idx = is_intersection(save4[ii].start,save4[ii].len); // check it multiple times (for intersection) if (idx>=0) { clearreg(idx); continue; } } ii++; } } return 0; }
/// return 0/1 static int is_power2(u64t length) { if (length) if ((u64t)1<<bsf64(length)==length) return 1; return 0; }