inline void avx2_hexid_to_ringid_segid_runid(
  const __m256i hexid, __m256i& ringid, __m256i& segid, __m256i& runid)
{
  // if(hexid==0) { ringid = segid = runid = 0; return; }
  // return positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
  avx2_positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  ringid = _mm256_andnot_si256(mask, ringid);
  segid = _mm256_andnot_si256(mask, segid);
  runid = _mm256_andnot_si256(mask, runid);
}
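// For reference, a scalar sketch of the ring/segment/run decomposition that the
// commented-out lines above describe. The centred-hexagonal site count (ring r and
// everything inside it holds 3r(r+1)+1 sites) and the helper names below are
// assumptions for illustration, not taken from the library:
inline unsigned ringid_to_nsites_contained(unsigned ringid)
{
  return 3*ringid*(ringid+1) + 1;  // centred hexagonal number
}

inline void hexid_to_ringid_segid_runid(unsigned hexid,
  unsigned& ringid, unsigned& segid, unsigned& runid)
{
  // Site 0 is the centre; ring r > 0 holds 6*r sites split into 6 segments of r sites.
  if(hexid == 0) { ringid = segid = runid = 0; return; }
  unsigned r = 1;
  while(ringid_to_nsites_contained(r) <= hexid) ++r;         // enclosing ring
  unsigned iring = hexid - ringid_to_nsites_contained(r-1);  // index within ring
  ringid = r;
  segid  = iring / r;   // which of the 6 sides (0..5)
  runid  = iring % r;   // position along that side
}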
inline void avx2_hexid_to_uv_ccw(const __m256i hexid, __m256i& u, __m256i& v)
{
  // if(hexid==0) { u = v = 0; return; }
  // unsigned ringid;
  // unsigned segid;
  // unsigned runid;
  // positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
  // switch(segid)
  // {
  //   case 0: u = ringid-runid; v = runid;        break;
  //   case 1: u = -runid;       v = ringid;       break;
  //   case 2: u = -ringid;      v = ringid-runid; break;
  //   case 3: u = runid-ringid; v = -runid;       break;
  //   case 4: u = runid;        v = -ringid;      break;
  //   case 5: u = ringid;       v = runid-ringid; break;
  //   default: assert(0);
  // }
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = avx2_positive_hexid_to_ringid(hexid);
  __m256i iring = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));

  u = ringid;
  v = _mm256_setzero_si256();

  __m256i irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  v = _mm256_add_epi32(v, iring);

  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  u = _mm256_andnot_si256(mask, u);
  v = _mm256_andnot_si256(mask, v);
}
inline void avx2_hexid_to_uv_cw(const __m256i hexid, __m256i& u, __m256i& v)
{
#if 0 // This code is correct but it's not worth maintaining two versions
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = avx2_positive_hexid_to_ringid(hexid);
  __m256i iring = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));

  u = ringid;
  v = _mm256_setzero_si256();

  __m256i irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  u = _mm256_add_epi32(u, irun);
  v = _mm256_add_epi32(v, iring);

  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  u = _mm256_andnot_si256(mask, u);
  v = _mm256_andnot_si256(mask, v);
#else
  // hexid_to_uv_ccw(hexid, u, v);
  // u += v;
  // v = -v;
  avx2_hexid_to_uv_ccw(hexid, u, v);
  u = _mm256_add_epi32(u, v);
  v = _mm256_sign_epi32(v, _mm256_cmpeq_epi32(v, v)); // negate v (multiply by all-ones = -1)
#endif
}
inline __m256i avx2_ringid_segid_runid_to_hexid(
  const __m256i ringid, const __m256i segid, const __m256i runid)
{
  // return (ringid==0) ? 0 :
  //     positive_ringid_segid_runid_to_hexid(ringid, segid, runid);
  const __m256i mask = _mm256_cmpeq_epi32(ringid, _mm256_setzero_si256());
  return _mm256_andnot_si256(mask,
    avx2_positive_ringid_segid_runid_to_hexid(ringid, segid, runid));
}
__SIMDi _SIMD_abs_epi32(__SIMDi a)
{
#ifdef USE_SSE
    // SSE2 integer abs: abs(a) = (a ^ (a >> 31)) - (a >> 31)
    __SIMDi sign = _mm_srai_epi32(a, 31);
    return _mm_sub_epi32(_mm_xor_si128(a, sign), sign);
#elif defined USE_AVX
    return _mm256_abs_epi32(a);
#elif defined USE_IBM
    return vec_abs(a);
#endif
}
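// A minimal scalar cross-check of the SSE path above, assuming __SIMDi is
// __m128i when USE_SSE is defined (that typedef is not shown here):
#include <emmintrin.h>
#include <stdio.h>
#include <stdlib.h>

int main(void)
{
    int in[4] = { -5, 0, 7, -123456 };
    int out[4];
    __m128i a = _mm_loadu_si128((const __m128i *)in);
    /* abs(a) = (a ^ (a >> 31)) - (a >> 31), the same idiom as the SSE path above */
    __m128i sign = _mm_srai_epi32(a, 31);
    __m128i r = _mm_sub_epi32(_mm_xor_si128(a, sign), sign);
    _mm_storeu_si128((__m128i *)out, r);
    for (int i = 0; i < 4; i++)
        printf("%d -> %d (expect %d)\n", in[i], out[i], abs(in[i]));
    return 0;
}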
inline avx_m256_t newcos_ps(avx_m256_t x) {
	x = _mm256_and_ps(x, _ps_inv_sign_mask);
	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	emm2 = _mm256_add_epi32(emm2, _pi32_1);
	emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	y = _mm256_cvtepi32_ps(emm2);

	emm2 = _mm256_sub_epi32(emm2, _pi32_2);
	avx_m256i_t emm0 = _mm256_andnot_si256(emm2, _pi32_4);
	emm0 = _mm256_slli_epi32(emm0, 29);
	emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());

	avx_m256_t sign_bit = _mm256_castsi256_ps(emm0);
	avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);

	avx_m256_t temp = _ps_minus_cephes_DP123;
	temp = _mm256_mul_ps(y, temp);
	x = _mm256_add_ps(x, temp);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;

	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);

	temp = _mm256_mul_ps(x2, _ps_0p5);
	temp = _mm256_sub_ps(temp, _ps_1);
	y = _mm256_sub_ps(y, temp);
	y2 = _mm256_add_ps(y2, x);

	y = _mm256_andnot_ps(poly_mask, y);
	y2 = _mm256_and_ps(poly_mask, y2);
	y = _mm256_add_ps(y, y2);

	y = _mm256_xor_ps(y, sign_bit);

	return y;
} // newcos_ps()
static INLINE void quantize(const __m256i *qp, __m256i *c,
                            const int16_t *iscan_ptr, int log_scale,
                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
                            __m256i *eob) {
  const __m256i abs_coeff = _mm256_abs_epi32(*c);
  __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);

  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
  __m256i q_hi = _mm256_srli_epi64(q, 32);
  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
  q_hi = _mm256_slli_epi64(q_hi, 32);
  q = _mm256_or_si256(q_lo, q_hi);
  const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
  q = _mm256_andnot_si256(mask, q);

  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
  dq = _mm256_srai_epi32(dq, log_scale);
  q = _mm256_sign_epi32(q, *c);
  dq = _mm256_sign_epi32(dq, *c);

  _mm256_storeu_si256((__m256i *)qcoeff, q);
  _mm256_storeu_si256((__m256i *)dqcoeff, dq);

  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
  const __m128i zr = _mm_setzero_si128();
  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
  const __m256i iscan =
      _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);

  const __m256i zero = _mm256_setzero_si256();
  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
  cur_eob = _mm256_and_si256(cur_eob, nz);
  *eob = _mm256_max_epi32(cur_eob, *eob);
}
template <bool align> SIMD_INLINE void EdgeBackgroundShiftRangeMasked(const uint8_t * value,
    uint8_t * background, const uint8_t * mask, size_t offset)
{
    const __m256i _value = Load<align>((__m256i*)(value + offset));
    const __m256i _background = Load<align>((__m256i*)(background + offset));
    const __m256i _mask = Load<align>((const __m256i*)(mask + offset));
    Store<align>((__m256i*)(background + offset),
        _mm256_or_si256(_mm256_and_si256(_mask, _value), _mm256_andnot_si256(_mask, _background)));
}
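// The or/and/andnot combination above is the usual bitwise select,
// (mask & value) | (~mask & background). A scalar sketch of the same update,
// assuming mask bytes are either 0x00 or 0xFF; the function name is illustrative:
inline void EdgeBackgroundShiftRangeMaskedScalar(const uint8_t * value,
    uint8_t * background, const uint8_t * mask, size_t size)
{
    for (size_t i = 0; i < size; ++i)
        background[i] = (uint8_t)((mask[i] & value[i]) | (~mask[i] & background[i]));
}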
__m256i test_mm256_andnot_si256(__m256i a, __m256i b) {
  // CHECK: xor <4 x i64>
  // CHECK: and <4 x i64>
  return _mm256_andnot_si256(a, b);
}
		15, 14, 13, 12));

	/* The bits have now been shifted to the right locations;
	 * translate their values 0..63 to the Base64 alphabet.
	 * Because AVX2 can only compare 'greater than', start from end of alphabet: */

	/* set 5: 63, "/" */
	s5mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(63));
	blockmask = s5mask;

	/* set 4: 62, "+" */
	s4mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(62));
	blockmask = _mm256_or_si256(blockmask, s4mask);

	/* set 3: 52..61, "0123456789" */
	s3mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(51)));
	blockmask = _mm256_or_si256(blockmask, s3mask);

	/* set 2: 26..51, "abcdefghijklmnopqrstuvwxyz" */
	s2mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(25)));
	blockmask = _mm256_or_si256(blockmask, s2mask);

	/* set 1: 0..25, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	 * Everything that is not blockmasked */

	/* Create the masked character sets: */
	str = _mm256_and_si256(_mm256_set1_epi8('/'), s5mask);
	str = _mm256_blendv_epi8(str, _mm256_set1_epi8('+'), s4mask);
	str = _mm256_blendv_epi8(str, _mm256_add_epi8(res, _mm256_set1_epi8('0' - 52)), s3mask);
	str = _mm256_blendv_epi8(str, _mm256_add_epi8(res, _mm256_set1_epi8('a' - 26)), s2mask);
	str = _mm256_blendv_epi8(_mm256_add_epi8(res, _mm256_set1_epi8('A')), str, blockmask);
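	/* For comparison, a scalar sketch of the same 6-bit value to Base64 character
	 * mapping that the mask-and-blend sequence above computes 32 bytes at a time
	 * (standard alphabet; the helper name is illustrative): */
	static inline char base64_char(unsigned v)   /* v in 0..63 */
	{
		if (v < 26)  return 'A' + v;          /* set 1: 0..25  */
		if (v < 52)  return 'a' + (v - 26);   /* set 2: 26..51 */
		if (v < 62)  return '0' + (v - 52);   /* set 3: 52..61 */
		if (v == 62) return '+';              /* set 4: 62     */
		return '/';                           /* set 5: 63     */
	}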
__m256 mm256_cos_ps(__m256 x) {
  __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  __m256i emm0, emm2;

  /* take the absolute value */
  x = _mm256_and_ps(x, *(__m256*)m256_ps_inv_sign_mask);

  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm256_add_epi32(emm2, *(__m256i*)m256_pi32_1);
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_inv1);
  y = _mm256_cvtepi32_ps(emm2);

  emm2 = _mm256_sub_epi32(emm2, *(__m256i*)m256_pi32_2);

  /* get the swap sign flag */
  emm0 = _mm256_andnot_si256(emm2, *(__m256i*)m256_pi32_4);
  emm0 = _mm256_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_2);
  emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());

  __m256 sign_bit = _mm256_castsi256_ps(emm0);
  __m256 poly_mask = _mm256_castsi256_ps(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m256*)m256_ps_minus_cephes_DP1;
  xmm2 = *(__m256*)m256_ps_minus_cephes_DP2;
  xmm3 = *(__m256*)m256_ps_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynom (0 <= x <= Pi/4) */
  y = *(__m256*)m256_ps_coscof_p0;
  __m256 z = _mm256_mul_ps(x, x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  __m256 tmp = _mm256_mul_ps(z, *(__m256*)m256_ps_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_1);

  /* Evaluate the second polynom (Pi/4 <= x <= 0) */
  __m256 y2 = *(__m256*)m256_ps_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y, y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  _mm256_zeroupper();
  return y;
}
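/* A small harness comparing the vectorised result against libm, assuming
 * mm256_cos_ps and its m256_* constant tables are compiled into the same
 * translation unit (an assumption; they are not shown here): */
#include <immintrin.h>
#include <math.h>
#include <stdio.h>

int main(void)
{
  float in[8]  = { 0.0f, 0.5f, 1.0f, 1.5707963f, 3.1415926f, -2.0f, 10.0f, -0.25f };
  float out[8];
  _mm256_storeu_ps(out, mm256_cos_ps(_mm256_loadu_ps(in)));
  for (int i = 0; i < 8; i++)
    printf("cos(% .7f) = % .7f   (libm % .7f)\n", in[i], out[i], cosf(in[i]));
  return 0;
}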
void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val,
	int8_t missing, int8_t missing_substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)out & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p+=2)
	{
		*out ++ = ((p[0] == missing) || (p[1] == missing)) ?
			missing_substitute :
			(p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
	}

	// body, SSE2
	const __m128i val16  = _mm_set1_epi8(val);
	const __m128i miss16 = _mm_set1_epi8(missing);
	const __m128i sub16  = _mm_set1_epi8(missing_substitute);
	const __m128i mask   = _mm_set1_epi16(0x00FF);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)out & 0x10))
	{
		__m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
		__m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

		__m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
		__m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

		__m128i c = _mm_setzero_si128();
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

		w1 = _mm_cmpeq_epi8(v1, miss16);
		w2 = _mm_cmpeq_epi8(v2, miss16);
		__m128i w = _mm_or_si128(w1, w2);
		c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

		_mm_store_si128((__m128i *)out, c);
		n -= 16; out += 16;
	}

	const __m256i val32  = _mm256_set1_epi8(val);
	const __m256i miss32 = _mm256_set1_epi8(missing);
	const __m256i sub32  = _mm256_set1_epi8(missing_substitute);
	const __m256i mask2  = _mm256_set1_epi16(0x00FF);

	for (; n >= 32; n-=32)
	{
		__m256i w1 = MM_LOADU_256((__m256i const*)p); p += 32;
		__m256i w2 = MM_LOADU_256((__m256i const*)p); p += 32;

		__m256i v1 = _mm256_packus_epi16(_mm256_and_si256(w1, mask2), _mm256_and_si256(w2, mask2));
		__m256i v2 = _mm256_packus_epi16(_mm256_srli_epi16(w1, 8), _mm256_srli_epi16(w2, 8));

		__m256i c = _mm256_setzero_si256();
		c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v1, val32));
		c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v2, val32));

		w1 = _mm256_cmpeq_epi8(v1, miss32);
		w2 = _mm256_cmpeq_epi8(v2, miss32);
		__m256i w = _mm256_or_si256(w1, w2);
		c = _mm256_or_si256(_mm256_and_si256(w, sub32), _mm256_andnot_si256(w, c));

		c = _mm256_permute4x64_epi64(c, 0xD8);
		_mm256_store_si256((__m256i *)out, c);
		out += 32;
	}

#   endif

	// SSE2 only
	for (; n >= 16; n-=16)
	{
		__m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
		__m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

		__m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
		__m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

		__m128i c = _mm_setzero_si128();
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

		w1 = _mm_cmpeq_epi8(v1, miss16);
		w2 = _mm_cmpeq_epi8(v2, miss16);
		__m128i w = _mm_or_si128(w1, w2);
		c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

		_mm_store_si128((__m128i *)out, c);
		out += 16;
	}

#endif

	// tail
	for (; n > 0; n--, p+=2)
	{
		*out ++ = ((p[0] == missing) || (p[1] == missing)) ?
			missing_substitute :
			(p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
	}
}
void vec_i8_replace(int8_t *p, size_t n, int8_t val, int8_t substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p == val) *p = substitute;

	// body, SSE2
	const __m128i mask = _mm_set1_epi8(val);
	const __m128i sub  = _mm_set1_epi8(substitute);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)p & 0x10))
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
		{
			_mm_store_si128((__m128i *)p,
				_mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v)));
		}
		n -= 16; p += 16;
	}

	const __m256i mask2 = _mm256_set1_epi8(val);
	const __m256i sub32 = _mm256_set1_epi8(substitute);
	const __m256i zero  = _mm256_setzero_si256();
	const __m256i ones  = _mm256_cmpeq_epi64(zero, zero);

	for (; n >= 32; n-=32, p+=32)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p);
		__m256i c = _mm256_cmpeq_epi8(v, mask2);
		if (_mm256_movemask_epi8(c))
		{
			// TODO
			_mm256_store_si256((__m256i *)p,
				_mm256_or_si256(_mm256_and_si256(c, sub32), _mm256_andnot_si256(c, v)));
		}
	}

#   endif

	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
			_mm_maskmoveu_si128(sub, c, (char*)p);
	}

#endif

	// tail
	for (; n > 0; n--, p++)
		if (*p == val) *p = substitute;
}
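// A minimal usage sketch (the buffer and values here are made up for
// illustration; the alignment header loops inside the function handle
// arbitrary pointers and lengths):
#include <stdint.h>
#include <string.h>

void example_replace(void)
{
	int8_t geno[256];
	memset(geno, 3, sizeof(geno));
	geno[10] = geno[200] = 1;
	vec_i8_replace(geno, sizeof(geno), /*val=*/3, /*substitute=*/9);
	// every 3 in geno is now 9; other values are untouched
}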
inline __m256i avx2_hexid_to_ringid(const __m256i hexid)
{
  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  return _mm256_andnot_si256(mask, avx2_positive_hexid_to_ringid(hexid));
}
inline __m256i avx2_uv_to_hexid_ccw(const __m256i u, const __m256i v)
{
  // if(u==0 and v==0)return 0;
  // int ringid = uv_to_ringid(u,v);
  // unsigned segid;
  // int runid;
  // int upv = u+v;
  // if(upv==ringid and v!=ringid)          { segid=0; runid=v; }
  // else if(v==ringid and u!=-ringid)      { segid=1; runid=-u; }
  // else if(u==-ringid and upv!=-ringid)   { segid=2; runid=ringid-v; }
  // else if(u+v==-ringid and v!=-ringid)   { segid=3; runid=-v; }
  // else if(v==-ringid and u!=ringid)      { segid=4; runid=u; }
  // else /*if(u==ringid and upv!=ringid)*/ { segid=5; runid=ringid+v; }
  // return positive_ringid_segid_runid_to_hexid(ringid, segid, runid);

  const __m256i one = _mm256_set1_epi32(1);
  const __m256i minus_one = _mm256_set1_epi32(-1);
  const __m256i ringid = avx2_uv_to_ringid(u,v);
  const __m256i minus_ringid = _mm256_sign_epi32(ringid, minus_one);
  const __m256i upv = _mm256_add_epi32(u, v);

  __m256i not_found_mask = minus_one;
  __m256i hexid = avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one));

  // Seg ID = 0
  // if(upv==ringid and v!=ringid) { segid=0; runid=v; }
  __m256i here_mask = _mm256_cmpeq_epi32(upv, ringid);
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(ringid, v, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_add_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, v),
  //   _mm256_and_si256(not_found_mask, ringid)));

  // Seg ID = 1
  // else if(v==ringid and u!=-ringid) { segid=1; runid=-u; }
  here_mask = _mm256_cmpeq_epi32(v, ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, u, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, u),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 2
  // else if(u==-ringid and upv!=-ringid) { segid=2; runid=ringid-v; }
  here_mask = _mm256_cmpeq_epi32(u, minus_ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, upv, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, upv),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 3
  // else if(u+v==-ringid and v!=-ringid) { segid=3; runid=-v; }
  here_mask = _mm256_cmpeq_epi32(upv, minus_ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, v, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, v),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 4
  // else if(v==-ringid and u!=ringid) { segid=4; runid=u; }
  here_mask = _mm256_cmpeq_epi32(v, minus_ringid);
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(ringid, u, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_add_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, u),
  //   _mm256_and_si256(not_found_mask, ringid)));

  // Seg ID = 5
  // else /*if(u==ringid and upv!=ringid)*/ { segid=5; runid=ringid+v; }
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask, upv));

  const __m256i mask = _mm256_cmpeq_epi32(ringid, _mm256_setzero_si256());
  hexid = _mm256_andnot_si256(mask, hexid);
  return hexid;
}
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer.
// If any of the rasterized AABB pixels passes the depth test, exit early and mark the occludee
// as visible. If all rasterized AABB pixels are occluded then the occludee is culled.
//-----------------------------------------------------------------------------------------
bool TransformedAABBoxAVX::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels, const __m128 pXformedPos[], UINT idx)
{
	// Set DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	// Denormals are zero (DAZ) is bit 6 and Flush to zero (FZ) is bit 15,
	// so to enable both we have to set bits 6 and 15: 1000 0000 0100 0000 = 0x8040
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m256i colOffset = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3);
	__m256i rowOffset = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1);

	float* pDepthBuffer = (float*)pRenderTargetPixels;

	// Rasterize the AABB triangles 4 at a time
	for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
	{
		vFloat4 xformedPos[3];
		Gather(xformedPos, i, pXformedPos, idx);

		// use fixed-point only for X and Y.  Avoid work for Z and W.
		__m128i fxPtX[3], fxPtY[3];
		for(int m = 0; m < 3; m++)
		{
			fxPtX[m] = _mm_cvtps_epi32(xformedPos[m].X);
			fxPtY[m] = _mm_cvtps_epi32(xformedPos[m].Y);
		}

		// Fab(x, y) = Ax + By + C = 0
		// Fab(x, y) = (ya - yb)x + (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(B2, A1);
		triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = _mm_rcp_ps(_mm_cvtepi32_ps(triArea));

		__m128 Z[3];
		Z[0] = xformedPos[0].Z;
		Z[1] = _mm_mul_ps(_mm_sub_ps(xformedPos[1].Z, Z[0]), oneOverTriArea);
		Z[2] = _mm_mul_ps(_mm_sub_ps(xformedPos[2].Z, Z[0]), oneOverTriArea);

		// Use bounding box traversal strategy to determine which pixels to rasterize
		//__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~3));
		__m128i endX = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(SCREENW - 1));

		__m128i startY = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i endY = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(SCREENH - 1));

		// Now we have 4 triangles set up.  Rasterize them each individually.
		for(int lane=0; lane < SSE; lane++)
		{
			// Skip triangle if area is zero or negative (back-facing)
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			// Extract this triangle's properties from the SIMD versions
			__m256 zz[3];
			for (int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm256_set1_ps(Z[vv].m128_f32[lane]);
			}

			int startXx = startX.m128i_i32[lane];
			int endXx = endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy = endY.m128i_i32[lane];

			__m256i aa0 = _mm256_set1_epi32(A0.m128i_i32[lane]);
			__m256i aa1 = _mm256_set1_epi32(A1.m128i_i32[lane]);
			__m256i aa2 = _mm256_set1_epi32(A2.m128i_i32[lane]);

			__m256i bb0 = _mm256_set1_epi32(B0.m128i_i32[lane]);
			__m256i bb1 = _mm256_set1_epi32(B1.m128i_i32[lane]);
			__m256i bb2 = _mm256_set1_epi32(B2.m128i_i32[lane]);

			__m256i aa0Inc = _mm256_slli_epi32(aa0, 2);
			__m256i aa1Inc = _mm256_slli_epi32(aa1, 2);
			__m256i aa2Inc = _mm256_slli_epi32(aa2, 2);

			__m256i bb0Inc = _mm256_slli_epi32(bb0, 1);
			__m256i bb1Inc = _mm256_slli_epi32(bb1, 1);
			__m256i bb2Inc = _mm256_slli_epi32(bb2, 1);

			__m256i row, col;

			// Traverse pixels in 2x4 blocks and store 2x4 pixel quad depths contiguously in memory ==> 2*X
			// This method provides better performance
			int rowIdx = (startYy * SCREENW + 2 * startXx);

			col = _mm256_add_epi32(colOffset, _mm256_set1_epi32(startXx));
			__m256i aa0Col = _mm256_mullo_epi32(aa0, col);
			__m256i aa1Col = _mm256_mullo_epi32(aa1, col);
			__m256i aa2Col = _mm256_mullo_epi32(aa2, col);

			row = _mm256_add_epi32(rowOffset, _mm256_set1_epi32(startYy));
			__m256i bb0Row = _mm256_add_epi32(_mm256_mullo_epi32(bb0, row), _mm256_set1_epi32(C0.m128i_i32[lane]));
			__m256i bb1Row = _mm256_add_epi32(_mm256_mullo_epi32(bb1, row), _mm256_set1_epi32(C1.m128i_i32[lane]));
			__m256i bb2Row = _mm256_add_epi32(_mm256_mullo_epi32(bb2, row), _mm256_set1_epi32(C2.m128i_i32[lane]));

			__m256i sum0Row = _mm256_add_epi32(aa0Col, bb0Row);
			__m256i sum1Row = _mm256_add_epi32(aa1Col, bb1Row);
			__m256i sum2Row = _mm256_add_epi32(aa2Col, bb2Row);

			__m256 zx = _mm256_mul_ps(_mm256_cvtepi32_ps(aa1Inc), zz[1]);
			zx = _mm256_add_ps(zx, _mm256_mul_ps(_mm256_cvtepi32_ps(aa2Inc), zz[2]));

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for (int r = startYy; r < endYy; r += 2,
				rowIdx += 2 * SCREENW,
				sum0Row = _mm256_add_epi32(sum0Row, bb0Inc),
				sum1Row = _mm256_add_epi32(sum1Row, bb1Inc),
				sum2Row = _mm256_add_epi32(sum2Row, bb2Inc))
			{
				// Compute barycentric coordinates
				int index = rowIdx;
				__m256i alpha = sum0Row;
				__m256i beta = sum1Row;
				__m256i gama = sum2Row;

				// Compute barycentric-interpolated depth
				__m256 depth = zz[0];
				depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(beta), zz[1]));
				depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(gama), zz[2]));
				__m256i anyOut = _mm256_setzero_si256();

				for (int c = startXx; c < endXx; c += 4,
					index += 8,
					alpha = _mm256_add_epi32(alpha, aa0Inc),
					beta = _mm256_add_epi32(beta, aa1Inc),
					gama = _mm256_add_epi32(gama, aa2Inc),
					depth = _mm256_add_ps(depth, zx))
				{
					// Test pixel inside triangle
					__m256i mask = _mm256_or_si256(_mm256_or_si256(alpha, beta), gama);

					__m256 previousDepthValue = _mm256_loadu_ps(&pDepthBuffer[index]);
					__m256 depthMask = _mm256_cmp_ps(depth, previousDepthValue, 0x1D);
					__m256i finalMask = _mm256_andnot_si256(mask, _mm256_castps_si256(depthMask));
					anyOut = _mm256_or_si256(anyOut, finalMask);
				} // for each column

				if (!_mm256_testz_si256(anyOut, _mm256_set1_epi32(0x80000000)))
				{
					return true; // early exit
				}
			} // for each row
		} // for each triangle
	} // for each set of SIMD# triangles

	return false;
}