Example #1
0
inline void avx2_hexid_to_ringid_segid_runid(
  const __m256i hexid, __m256i& ringid, __m256i& segid, __m256i& runid)
{
  // if(hexid==0) { ringid = segid = runid = 0; return; }
  // return positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
  avx2_positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  ringid = _mm256_andnot_si256(mask, ringid);
  segid = _mm256_andnot_si256(mask, segid);
  runid = _mm256_andnot_si256(mask, runid);
}
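A standalone sketch of the same zero-masking idiom (illustration only, not part of the library above; assumes nothing beyond <immintrin.h>): _mm256_cmpeq_epi32 against zero builds an all-ones mask in the hexid==0 lanes, and _mm256_andnot_si256 then clears exactly those lanes of the result.

#include <immintrin.h>
#include <stdio.h>

int main(void)
{
  const __m256i x    = _mm256_setr_epi32(0, 1, 2, 0, 4, 5, 0, 7);
  const __m256i mask = _mm256_cmpeq_epi32(x, _mm256_setzero_si256());    // all-ones where x == 0
  const __m256i y    = _mm256_andnot_si256(mask, _mm256_set1_epi32(42)); // 42 where x != 0, else 0
  int out[8];
  _mm256_storeu_si256((__m256i*)out, y);
  for (int i = 0; i < 8; i++) printf("%d ", out[i]);  // prints: 0 42 42 0 42 42 0 42
  printf("\n");
  return 0;
}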
Example #2
0
inline void avx2_hexid_to_uv_ccw(const __m256i hexid, __m256i& u, __m256i& v)
{
  // if(hexid==0) { u = v = 0; return; }
  // unsigned ringid;
  // unsigned segid;
  // unsigned runid;
  // positive_hexid_to_ringid_segid_runid(hexid, ringid, segid, runid);
  // switch(segid)
  // {
  //   case 0: u = ringid-runid; v = runid;        break;
  //   case 1: u = -runid;       v = ringid;       break;
  //   case 2: u = -ringid;      v = ringid-runid; break;
  //   case 3: u = runid-ringid; v = -runid;       break;
  //   case 4: u = runid;        v = -ringid;      break;
  //   case 5: u = ringid;       v = runid-ringid; break;
  //   default: assert(0);
  // }
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = avx2_positive_hexid_to_ringid(hexid);
  __m256i iring = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));

  u = ringid;
  v = _mm256_setzero_si256();

  __m256i irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  v = _mm256_add_epi32(v, iring);

  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  u = _mm256_andnot_si256(mask, u);
  v = _mm256_andnot_si256(mask, v);
}
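For reference, a scalar sketch of the mapping that the SIMD code above unrolls branch-free (illustration only; the helper name hexid_to_uv_ccw_scalar is ours). It follows the commented-out reference code, using the fact that ring r >= 1 holds 6r sites starting at hexid 1 + 3r(r-1).

#include <stdio.h>

/* Illustrative scalar equivalent of avx2_hexid_to_uv_ccw. */
static void hexid_to_uv_ccw_scalar(unsigned hexid, int *u, int *v)
{
  if (hexid == 0) { *u = 0; *v = 0; return; }
  unsigned ringid = 1;
  while (3 * ringid * (ringid + 1) < hexid) ++ringid;        /* last site of ring r is 3r(r+1) */
  unsigned iring = hexid - (1 + 3 * ringid * (ringid - 1));  /* index within the ring */
  unsigned segid = iring / ringid;
  int run = (int)(iring % ringid), r = (int)ringid;
  switch (segid) {
    case 0: *u = r - run;  *v = run;      break;
    case 1: *u = -run;     *v = r;        break;
    case 2: *u = -r;       *v = r - run;  break;
    case 3: *u = run - r;  *v = -run;     break;
    case 4: *u = run;      *v = -r;       break;
    case 5: *u = r;        *v = run - r;  break;
  }
}

int main(void)
{
  for (unsigned hexid = 0; hexid < 8; hexid++) {
    int u, v;
    hexid_to_uv_ccw_scalar(hexid, &u, &v);
    printf("hexid %u -> (%d,%d)\n", hexid, u, v);
  }
  return 0;
}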
Example #3
0
inline void avx2_hexid_to_uv_cw(const __m256i hexid, __m256i& u, __m256i& v)
{
#if 0 // This code is correct but it's not worth maintaining two versions
  const __m256i one = _mm256_set1_epi32(1);
  __m256i ringid = avx2_positive_hexid_to_ringid(hexid);
  __m256i iring = _mm256_sub_epi32(hexid,
    avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid,one)));

  u = ringid;
  v = _mm256_setzero_si256();

  __m256i irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_sub_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_sub_epi32(u, irun);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  v = _mm256_add_epi32(v, irun);
  iring = _mm256_sub_epi32(iring, irun);

  irun = _mm256_min_epu32(iring, ringid);
  u = _mm256_add_epi32(u, irun);
  iring = _mm256_sub_epi32(iring, irun);

  u = _mm256_add_epi32(u, iring);
  v = _mm256_sub_epi32(v, iring);

  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  u = _mm256_andnot_si256(mask, u);
  v = _mm256_andnot_si256(mask, v);
#else
  // hexid_to_uv_ccw(hexid, u, v);
  // u += v;
  // v = -v;
  avx2_hexid_to_uv_ccw(hexid, u, v);
  u = _mm256_add_epi32(u, v);
  v = _mm256_sign_epi32(v, _mm256_cmpeq_epi32(v, v));
#endif
}
Example #4
0
inline __m256i avx2_ringid_segid_runid_to_hexid(
  const __m256i ringid, const __m256i segid, const __m256i runid)
{
  // return (ringid==0) ? 0 :
  //     positive_ringid_segid_runid_to_hexid(ringid, segid, runid);
  const __m256i mask = _mm256_cmpeq_epi32(ringid, _mm256_setzero_si256());
  return _mm256_andnot_si256(mask,
    avx2_positive_ringid_segid_runid_to_hexid(ringid, segid, runid));
}
Example #5
0
__SIMDi _SIMD_abs_epi32(__SIMDi a)
{
#ifdef  USE_SSE
  return _mm_abs_epi32(a);      /* SSSE3: lane-wise absolute value */
#elif defined USE_AVX
  return _mm256_abs_epi32(a);   /* AVX2: lane-wise absolute value */
#elif defined USE_IBM
  return vec_abs(a);
#endif
}
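For contrast, the clear-the-sign-bit/andnot trick does compute absolute value for IEEE floats, where the sign lives in a single bit. A minimal sketch (the helper name avx_abs_ps is ours; assumes only <immintrin.h>):

#include <immintrin.h>

/* Illustrative helper: |a| for packed floats. -0.0f has only the sign bit set,
   so andnot clears the sign bit of every lane. */
static inline __m256 avx_abs_ps(__m256 a)
{
  return _mm256_andnot_ps(_mm256_set1_ps(-0.0f), a);
}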
Example #6
0
inline avx_m256_t newcos_ps(avx_m256_t x) {
	x = _mm256_and_ps(x, _ps_inv_sign_mask);
	avx_m256_t y = _mm256_mul_ps(x, _ps_cephes_FOPI);

	avx_m256i_t emm2 = _mm256_cvttps_epi32(y);
	emm2 = _mm256_add_epi32(emm2, _pi32_1);
	emm2 = _mm256_and_si256(emm2, _pi32_inv1);
	y = _mm256_cvtepi32_ps(emm2);

	emm2 = _mm256_sub_epi32(emm2, _pi32_2);

	avx_m256i_t emm0 = _mm256_andnot_si256(emm2, _pi32_4);
	emm0 = _mm256_slli_epi32(emm0, 29);

	emm2 = _mm256_and_si256(emm2, _pi32_2);
	emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
	
	avx_m256_t sign_bit = _mm256_castsi256_ps(emm0);
	avx_m256_t poly_mask = _mm256_castsi256_ps(emm2);
	
	avx_m256_t temp = _ps_minus_cephes_DP123;
	temp = _mm256_mul_ps(y, temp);
	x = _mm256_add_ps(x, temp);

	avx_m256_t x2 = _mm256_mul_ps(x, x);
	avx_m256_t x3 = _mm256_mul_ps(x2, x);
	avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	y = _ps_coscof_p0;
	avx_m256_t y2 = _ps_sincof_p0;
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p1);
	y2 = _mm256_add_ps(y2, _ps_sincof_p1);
	y = _mm256_mul_ps(y, x2);
	y2 = _mm256_mul_ps(y2, x2);
	y = _mm256_add_ps(y, _ps_coscof_p2);
	y2 = _mm256_add_ps(y2, _ps_sincof_p2);
	y = _mm256_mul_ps(y, x4);
	y2 = _mm256_mul_ps(y2, x3);
	temp = _mm256_mul_ps(x2, _ps_0p5);
	temp = _mm256_sub_ps(temp, _ps_1);
	y = _mm256_sub_ps(y, temp);
	y2 = _mm256_add_ps(y2, x);

	y = _mm256_andnot_ps(poly_mask, y);
	y2 = _mm256_and_ps(poly_mask, y2);
	y = _mm256_add_ps(y, y2);

	y = _mm256_xor_ps(y, sign_bit);

	return y;
} // newcos_ps()
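Example #7
0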
static INLINE void quantize(const __m256i *qp, __m256i *c,
                            const int16_t *iscan_ptr, int log_scale,
                            tran_low_t *qcoeff, tran_low_t *dqcoeff,
                            __m256i *eob) {
  const __m256i abs_coeff = _mm256_abs_epi32(*c);
  __m256i q = _mm256_add_epi32(abs_coeff, qp[0]);

  __m256i q_lo = _mm256_mul_epi32(q, qp[1]);
  __m256i q_hi = _mm256_srli_epi64(q, 32);
  const __m256i qp_hi = _mm256_srli_epi64(qp[1], 32);
  q_hi = _mm256_mul_epi32(q_hi, qp_hi);
  q_lo = _mm256_srli_epi64(q_lo, 16 - log_scale);
  q_hi = _mm256_srli_epi64(q_hi, 16 - log_scale);
  q_hi = _mm256_slli_epi64(q_hi, 32);
  q = _mm256_or_si256(q_lo, q_hi);
  const __m256i abs_s = _mm256_slli_epi32(abs_coeff, 1 + log_scale);
  const __m256i mask = _mm256_cmpgt_epi32(qp[2], abs_s);
  q = _mm256_andnot_si256(mask, q);

  __m256i dq = _mm256_mullo_epi32(q, qp[2]);
  dq = _mm256_srai_epi32(dq, log_scale);
  q = _mm256_sign_epi32(q, *c);
  dq = _mm256_sign_epi32(dq, *c);

  _mm256_storeu_si256((__m256i *)qcoeff, q);
  _mm256_storeu_si256((__m256i *)dqcoeff, dq);

  const __m128i isc = _mm_loadu_si128((const __m128i *)iscan_ptr);
  const __m128i zr = _mm_setzero_si128();
  const __m128i lo = _mm_unpacklo_epi16(isc, zr);
  const __m128i hi = _mm_unpackhi_epi16(isc, zr);
  const __m256i iscan =
      _mm256_insertf128_si256(_mm256_castsi128_si256(lo), hi, 1);

  const __m256i zero = _mm256_setzero_si256();
  const __m256i zc = _mm256_cmpeq_epi32(dq, zero);
  const __m256i nz = _mm256_cmpeq_epi32(zc, zero);
  __m256i cur_eob = _mm256_sub_epi32(iscan, nz);
  cur_eob = _mm256_and_si256(cur_eob, nz);
  *eob = _mm256_max_epi32(cur_eob, *eob);
}
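Example #8
0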
		template <bool align> SIMD_INLINE void EdgeBackgroundShiftRangeMasked(const uint8_t * value, uint8_t * background, const uint8_t * mask, size_t offset)
		{
			const __m256i _value = Load<align>((__m256i*)(value + offset));
			const __m256i _background = Load<align>((__m256i*)(background + offset));
			const __m256i _mask = Load<align>((const __m256i*)(mask + offset));
			Store<align>((__m256i*)(background + offset), _mm256_or_si256(_mm256_and_si256(_mask, _value), _mm256_andnot_si256(_mask, _background)));
		}
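The and/andnot/or combination above is the classic branch-free select, (mask & value) | (~mask & background). A standalone sketch (the helper name select_si256 is ours); for byte-granular all-ones/all-zeros masks, _mm256_blendv_epi8 gives the same result:

#include <immintrin.h>

/* Illustrative helper: returns value where mask lanes are all-ones,
   background where they are all-zeros. */
static inline __m256i select_si256(__m256i mask, __m256i value, __m256i background)
{
  return _mm256_or_si256(_mm256_and_si256(mask, value),
                         _mm256_andnot_si256(mask, background));
  /* equivalent for byte-wise masks: _mm256_blendv_epi8(background, value, mask) */
}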
Example #9
0
__m256i test_mm256_andnot_si256(__m256i a, __m256i b) {
  // CHECK: xor <4 x i64>
  // CHECK: and <4 x i64>
  return _mm256_andnot_si256(a, b);
}
Example #10
0
			15, 14, 13, 12));

	/* The bits have now been shifted to the right locations;
	 * translate their values 0..63 to the Base64 alphabet.
	 * Because AVX2 can only compare 'greater than', start from end of alphabet: */

	/* set 5: 63, "/" */
	s5mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(63));
	blockmask = s5mask;

	/* set 4: 62, "+" */
	s4mask = _mm256_cmpeq_epi8(res, _mm256_set1_epi8(62));
	blockmask = _mm256_or_si256(blockmask, s4mask);

	/* set 3: 52..61, "0123456789" */
	s3mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(51)));
	blockmask = _mm256_or_si256(blockmask, s3mask);

	/* set 2: 26..51, "abcdefghijklmnopqrstuvwxyz" */
	s2mask = _mm256_andnot_si256(blockmask, _mm256_cmpgt_epi8(res, _mm256_set1_epi8(25)));
	blockmask = _mm256_or_si256(blockmask, s2mask);

	/* set 1: 0..25, "ABCDEFGHIJKLMNOPQRSTUVWXYZ"
	 * Everything that is not blockmasked */

	/* Create the masked character sets: */
	str = _mm256_and_si256(_mm256_set1_epi8('/'), s5mask);
	str = _mm256_blendv_epi8(str, _mm256_set1_epi8('+'), s4mask);
	str = _mm256_blendv_epi8(str, _mm256_add_epi8(res, _mm256_set1_epi8('0' - 52)), s3mask);
	str = _mm256_blendv_epi8(str, _mm256_add_epi8(res, _mm256_set1_epi8('a' - 26)), s2mask);
	str = _mm256_blendv_epi8(_mm256_add_epi8(res, _mm256_set1_epi8('A')), str, blockmask);
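A scalar sketch of the same top-down range mapping that the masked sets above build (illustration only; the helper name b64_char is ours): values 0..63 are resolved from the end of the alphabet first, exactly as the comments describe.

/* Illustrative scalar equivalent of the masked character-set construction. */
static char b64_char(unsigned v)
{
	if (v == 63) return '/';                      /* set 5 */
	if (v == 62) return '+';                      /* set 4 */
	if (v >= 52) return (char)('0' + (v - 52));   /* set 3: "0123456789" */
	if (v >= 26) return (char)('a' + (v - 26));   /* set 2: lower-case letters */
	return (char)('A' + v);                       /* set 1: upper-case letters */
}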
Example #11
0
__m256 mm256_cos_ps(__m256 x) {
  __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  __m256i emm0, emm2;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(__m256*)m256_ps_inv_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_FOPI);

  /* store the integer part of y in mm0 */
  emm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm256_add_epi32(emm2, *(__m256i*)m256_pi32_1);
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_inv1);
  y = _mm256_cvtepi32_ps(emm2);

  emm2 = _mm256_sub_epi32(emm2, *(__m256i*)m256_pi32_2);
  
  /* get the swap sign flag */
  emm0 = _mm256_andnot_si256(emm2, *(__m256i*)m256_pi32_4);
  emm0 = _mm256_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_2);
  emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
  
  __m256 sign_bit = _mm256_castsi256_ps(emm0);
  __m256 poly_mask = _mm256_castsi256_ps(emm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(__m256*)m256_ps_minus_cephes_DP1;
  xmm2 = *(__m256*)m256_ps_minus_cephes_DP2;
  xmm3 = *(__m256*)m256_ps_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);
  
  /* Evaluate the first polynom  (0 <= x <= Pi/4) */
  y = *(__m256*)m256_ps_coscof_p0;
  __m256 z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  __m256 tmp = _mm256_mul_ps(z, *(__m256*)m256_ps_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_1);
  
  /* Evaluate the second polynom  (Pi/4 <= x <= 0) */

  __m256 y2 = *(__m256*)m256_ps_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms */  
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  _mm256_zeroupper();
  return y;
}
Example #12
0
void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val,
	int8_t missing, int8_t missing_substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)out & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p+=2)
	{
		*out ++ = ((p[0] == missing) || (p[1] == missing)) ?
			missing_substitute :
			(p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
	}

	// body, SSE2
	const __m128i val16  = _mm_set1_epi8(val);
	const __m128i miss16 = _mm_set1_epi8(missing);
	const __m128i sub16  = _mm_set1_epi8(missing_substitute);
	const __m128i mask   = _mm_set1_epi16(0x00FF);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)out & 0x10))
	{
		__m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
		__m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

		__m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
		__m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

		__m128i c = _mm_setzero_si128();
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

		w1 = _mm_cmpeq_epi8(v1, miss16);
		w2 = _mm_cmpeq_epi8(v2, miss16);
		__m128i w  = _mm_or_si128(w1, w2);
		c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

		_mm_store_si128((__m128i *)out, c);
		n -= 16; out += 16;
	}

	const __m256i val32  = _mm256_set1_epi8(val);
	const __m256i miss32 = _mm256_set1_epi8(missing);
	const __m256i sub32  = _mm256_set1_epi8(missing_substitute);
	const __m256i mask2  = _mm256_set1_epi16(0x00FF);

	for (; n >= 32; n-=32)
	{
		__m256i w1 = MM_LOADU_256((__m256i const*)p); p += 32;
		__m256i w2 = MM_LOADU_256((__m256i const*)p); p += 32;

		__m256i v1 = _mm256_packus_epi16(_mm256_and_si256(w1, mask2), _mm256_and_si256(w2, mask2));
		__m256i v2 = _mm256_packus_epi16(_mm256_srli_epi16(w1, 8), _mm256_srli_epi16(w2, 8));

		__m256i c = _mm256_setzero_si256();
		c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v1, val32));
		c = _mm256_sub_epi8(c, _mm256_cmpeq_epi8(v2, val32));

		w1 = _mm256_cmpeq_epi8(v1, miss32);
		w2 = _mm256_cmpeq_epi8(v2, miss32);
		__m256i w = _mm256_or_si256(w1, w2);
		c = _mm256_or_si256(_mm256_and_si256(w, sub32), _mm256_andnot_si256(w, c));

		c = _mm256_permute4x64_epi64(c, 0xD8);
		_mm256_store_si256((__m256i *)out, c);
		out += 32;
	}

#   endif

	// SSE2 only
	for (; n >= 16; n-=16)
	{
		__m128i w1 = MM_LOADU_128((__m128i const*)p); p += 16;
		__m128i w2 = MM_LOADU_128((__m128i const*)p); p += 16;

		__m128i v1 = _mm_packus_epi16(_mm_and_si128(w1, mask), _mm_and_si128(w2, mask));
		__m128i v2 = _mm_packus_epi16(_mm_srli_epi16(w1, 8), _mm_srli_epi16(w2, 8));

		__m128i c = _mm_setzero_si128();
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v1, val16));
		c = _mm_sub_epi8(c, _mm_cmpeq_epi8(v2, val16));

		w1 = _mm_cmpeq_epi8(v1, miss16);
		w2 = _mm_cmpeq_epi8(v2, miss16);
		__m128i w = _mm_or_si128(w1, w2);
		c = _mm_or_si128(_mm_and_si128(w, sub16), _mm_andnot_si128(w, c));

		_mm_store_si128((__m128i *)out, c);
		out += 16;
	}

#endif

	// tail
	for (; n > 0; n--, p+=2)
	{
		*out ++ = ((p[0] == missing) || (p[1] == missing)) ?
			missing_substitute :
			(p[0]==val ? 1 : 0) + (p[1]==val ? 1 : 0);
	}
}
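A hypothetical call, just to show the parameter roles: p holds two int8 allele codes per sample, out receives one dosage per sample, and missing_substitute is written wherever either allele carries the missing code. The sample values below are ours.

#include <stdint.h>
#include <stddef.h>

void vec_i8_cnt_dosage2(const int8_t *p, int8_t *out, size_t n, int8_t val,
	int8_t missing, int8_t missing_substitute);

int main(void)
{
	int8_t p[8] = { 1, 1,   0, 1,   -1, 0,   0, 0 };  /* 4 samples, 2 allele codes each */
	int8_t out[4];
	vec_i8_cnt_dosage2(p, out, 4, /*val=*/1, /*missing=*/-1, /*missing_substitute=*/-3);
	/* out == { 2, 1, -3, 0 } */
	return 0;
}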
Example #13
0
void vec_i8_replace(int8_t *p, size_t n, int8_t val, int8_t substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p == val) *p = substitute;

	// body, SSE2
	const __m128i mask = _mm_set1_epi8(val);
	const __m128i sub  = _mm_set1_epi8(substitute);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)p & 0x10))
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
		{
			_mm_store_si128((__m128i *)p,
				_mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v)));
		}
		n -= 16; p += 16;
	}

	const __m256i mask2 = _mm256_set1_epi8(val);
	const __m256i sub32 = _mm256_set1_epi8(substitute);
	const __m256i zero = _mm256_setzero_si256();
	const __m256i ones = _mm256_cmpeq_epi64(zero, zero);

	for (; n >= 32; n-=32, p+=32)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p);
		__m256i c = _mm256_cmpeq_epi8(v, mask2);
		if (_mm256_movemask_epi8(c))
		{
			// TODO
			_mm256_store_si256((__m256i *)p,
				_mm256_or_si256(_mm256_and_si256(c, sub32),
				_mm256_andnot_si256(c, v)));
		}
	}

#   endif

	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
			_mm_maskmoveu_si128(sub, c, (char*)p);
	}

#endif

	// tail
	for (; n > 0; n--, p++)
		if (*p == val) *p = substitute;
}
Example #14
0
inline __m256i avx2_hexid_to_ringid(const __m256i hexid)
{
  const __m256i mask = _mm256_cmpeq_epi32(hexid, _mm256_setzero_si256());
  return _mm256_andnot_si256(mask, avx2_positive_hexid_to_ringid(hexid));
}
Example #15
0
inline __m256i avx2_uv_to_hexid_ccw(const __m256i u, const __m256i v)
{
  // if(u==0 and v==0)return 0;
  // int ringid = uv_to_ringid(u,v);
  // unsigned segid;
  // int runid;
  // int upv = u+v;
  // if(upv==ringid and v!=ringid)         { segid=0; runid=v; }
  // else if(v==ringid and u!=-ringid)     { segid=1; runid=-u; }
  // else if(u==-ringid and upv!=-ringid)  { segid=2; runid=ringid-v; }
  // else if(u+v==-ringid and v!=-ringid)  { segid=3; runid=-v; }
  // else if(v==-ringid and u!=ringid)     { segid=4; runid=u; }
  // else /*if(u==ringid and upv!=ringid)*/{ segid=5; runid=ringid+v; }
  // return positive_ringid_segid_runid_to_hexid(ringid, segid, runid);
  const __m256i one = _mm256_set1_epi32(1);
  const __m256i minus_one = _mm256_set1_epi32(-1);
  const __m256i ringid = avx2_uv_to_ringid(u,v);
  const __m256i minus_ringid = _mm256_sign_epi32(ringid, minus_one);
  const __m256i upv = _mm256_add_epi32(u, v);
  __m256i not_found_mask = minus_one;
  __m256i hexid = avx2_ringid_to_nsites_contained(_mm256_sub_epi32(ringid, one));

  // Seg ID = 0
  // if(upv==ringid and v!=ringid)         { segid=0; runid=v; }
  __m256i here_mask = _mm256_cmpeq_epi32(upv, ringid);
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(ringid, v, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_add_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, v),
  //   _mm256_and_si256(not_found_mask, ringid)));

  // Seg ID = 1
  // else if(v==ringid and u!=-ringid)     { segid=1; runid=-u; }
  here_mask = _mm256_cmpeq_epi32(v, ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, u, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, u),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 2
  // else if(u==-ringid and upv!=-ringid)  { segid=2; runid=ringid-v; }
  here_mask = _mm256_cmpeq_epi32(u, minus_ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, upv, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, upv),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 3
  // else if(u+v==-ringid and v!=-ringid)  { segid=3; runid=-v; }
  here_mask = _mm256_cmpeq_epi32(upv, minus_ringid);
  hexid = _mm256_sub_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(minus_ringid, v, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_sub_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, v),
  //   _mm256_and_si256(not_found_mask, minus_ringid)));

  // Seg ID = 4
  // else if(v==-ringid and u!=ringid)     { segid=4; runid=u; }
  here_mask = _mm256_cmpeq_epi32(v, minus_ringid);
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask,
    _mm256_blendv_epi8(ringid, u, here_mask)));
  not_found_mask = _mm256_andnot_si256(here_mask, not_found_mask);
  // hexid = _mm256_add_epi32(hexid, _mm256_or_si256(
  //   _mm256_and_si256(here_mask, u),
  //   _mm256_and_si256(not_found_mask, ringid)));

  // Seg ID = 5
  // else /*if(u==ringid and upv!=ringid)*/{ segid=5; runid=ringid+v; }
  hexid = _mm256_add_epi32(hexid, _mm256_and_si256(not_found_mask, upv));

  const __m256i mask = _mm256_cmpeq_epi32(ringid, _mm256_setzero_si256());
  hexid = _mm256_andnot_si256(mask, hexid);
  return hexid;
}
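Example #16
0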
//-----------------------------------------------------------------------------------------
// Rasterize the occludee AABB and depth test it against the CPU rasterized depth buffer
// If any of the rasterized AABB pixels passes the depth test, exit early and mark the occludee
// as visible. If all rasterized AABB pixels are occluded, the occludee is culled.
//-----------------------------------------------------------------------------------------
bool TransformedAABBoxAVX::RasterizeAndDepthTestAABBox(UINT *pRenderTargetPixels, const __m128 pXformedPos[], UINT idx)
{
	// Set the DAZ and FZ MXCSR bits to flush denormals to zero (i.e., make it faster)
	// Denormals are zero (DAZ) is bit 6 and flush to zero (FZ) is bit 15,
	// so to enable both we set bits 6 and 15: 1000 0000 0100 0000 = 0x8040
	_mm_setcsr( _mm_getcsr() | 0x8040 );

	__m256i colOffset = _mm256_setr_epi32(0, 1, 2, 3, 0, 1, 2, 3);
	__m256i rowOffset = _mm256_setr_epi32(0, 0, 0, 0, 1, 1, 1, 1);
	float* pDepthBuffer = (float*)pRenderTargetPixels;
	
	// Rasterize the AABB triangles 4 at a time
	for(UINT i = 0; i < AABB_TRIANGLES; i += SSE)
	{
		vFloat4 xformedPos[3];
		Gather(xformedPos, i, pXformedPos, idx);

		// use fixed-point only for X and Y.  Avoid work for Z and W.
        __m128i fxPtX[3], fxPtY[3];
		for(int m = 0; m < 3; m++)
		{
			fxPtX[m] = _mm_cvtps_epi32(xformedPos[m].X);
			fxPtY[m] = _mm_cvtps_epi32(xformedPos[m].Y);
		}

		// Fab(x, y) =     Ax       +       By     +      C              = 0
		// Fab(x, y) = (ya - yb)x   +   (xb - xa)y + (xa * yb - xb * ya) = 0
		// Compute A = (ya - yb) for the 3 line segments that make up each triangle
		__m128i A0 = _mm_sub_epi32(fxPtY[1], fxPtY[2]);
		__m128i A1 = _mm_sub_epi32(fxPtY[2], fxPtY[0]);
		__m128i A2 = _mm_sub_epi32(fxPtY[0], fxPtY[1]);

		// Compute B = (xb - xa) for the 3 line segments that make up each triangle
		__m128i B0 = _mm_sub_epi32(fxPtX[2], fxPtX[1]);
		__m128i B1 = _mm_sub_epi32(fxPtX[0], fxPtX[2]);
		__m128i B2 = _mm_sub_epi32(fxPtX[1], fxPtX[0]);

		// Compute C = (xa * yb - xb * ya) for the 3 line segments that make up each triangle
		__m128i C0 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[1], fxPtY[2]), _mm_mullo_epi32(fxPtX[2], fxPtY[1]));
		__m128i C1 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[2], fxPtY[0]), _mm_mullo_epi32(fxPtX[0], fxPtY[2]));
		__m128i C2 = _mm_sub_epi32(_mm_mullo_epi32(fxPtX[0], fxPtY[1]), _mm_mullo_epi32(fxPtX[1], fxPtY[0]));

		// Compute triangle area
		__m128i triArea = _mm_mullo_epi32(B2, A1);
		triArea = _mm_sub_epi32(triArea, _mm_mullo_epi32(B1, A2));
		__m128 oneOverTriArea = _mm_rcp_ps(_mm_cvtepi32_ps(triArea));

		__m128 Z[3];
		Z[0] = xformedPos[0].Z;
		Z[1] = _mm_mul_ps(_mm_sub_ps(xformedPos[1].Z, Z[0]), oneOverTriArea);
		Z[2] = _mm_mul_ps(_mm_sub_ps(xformedPos[2].Z, Z[0]), oneOverTriArea);
		
		// Use bounding box traversal strategy to determine which pixels to rasterize 
		//__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i startX = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~3));
		__m128i endX = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtX[0], fxPtX[1]), fxPtX[2]), _mm_set1_epi32(SCREENW - 1));

		__m128i startY = _mm_and_si128(HelperSSE::Max(HelperSSE::Min(HelperSSE::Min(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(0)), _mm_set1_epi32(~1));
		__m128i endY = HelperSSE::Min(HelperSSE::Max(HelperSSE::Max(fxPtY[0], fxPtY[1]), fxPtY[2]), _mm_set1_epi32(SCREENH - 1));

		// Now we have 4 triangles set up.  Rasterize them each individually.
        for(int lane=0; lane < SSE; lane++)
        {
			// Skip triangle if area is zero 
			if(triArea.m128i_i32[lane] <= 0)
			{
				continue;
			}

			// Extract this triangle's properties from the SIMD versions
			__m256 zz[3];
			for (int vv = 0; vv < 3; vv++)
			{
				zz[vv] = _mm256_set1_ps(Z[vv].m128_f32[lane]);
			}

			int startXx = startX.m128i_i32[lane];
			int endXx = endX.m128i_i32[lane];
			int startYy = startY.m128i_i32[lane];
			int endYy = endY.m128i_i32[lane];

			__m256i aa0 = _mm256_set1_epi32(A0.m128i_i32[lane]);
			__m256i aa1 = _mm256_set1_epi32(A1.m128i_i32[lane]);
			__m256i aa2 = _mm256_set1_epi32(A2.m128i_i32[lane]);

			__m256i bb0 = _mm256_set1_epi32(B0.m128i_i32[lane]);
			__m256i bb1 = _mm256_set1_epi32(B1.m128i_i32[lane]);
			__m256i bb2 = _mm256_set1_epi32(B2.m128i_i32[lane]);

			__m256i aa0Inc = _mm256_slli_epi32(aa0, 2);
			__m256i aa1Inc = _mm256_slli_epi32(aa1, 2);
			__m256i aa2Inc = _mm256_slli_epi32(aa2, 2);

			__m256i bb0Inc = _mm256_slli_epi32(bb0, 1);
			__m256i bb1Inc = _mm256_slli_epi32(bb1, 1);
			__m256i bb2Inc = _mm256_slli_epi32(bb2, 1);

			__m256i row, col;

			// Traverse pixels in 2x4 blocks and store 2x4 pixel quad depths contiguously in memory ==> 2*X
			// This method provides better performance
			int	rowIdx = (startYy * SCREENW + 2 * startXx);

			col = _mm256_add_epi32(colOffset, _mm256_set1_epi32(startXx));
			__m256i aa0Col = _mm256_mullo_epi32(aa0, col);
			__m256i aa1Col = _mm256_mullo_epi32(aa1, col);
			__m256i aa2Col = _mm256_mullo_epi32(aa2, col);

			row = _mm256_add_epi32(rowOffset, _mm256_set1_epi32(startYy));
			__m256i bb0Row = _mm256_add_epi32(_mm256_mullo_epi32(bb0, row), _mm256_set1_epi32(C0.m128i_i32[lane]));
			__m256i bb1Row = _mm256_add_epi32(_mm256_mullo_epi32(bb1, row), _mm256_set1_epi32(C1.m128i_i32[lane]));
			__m256i bb2Row = _mm256_add_epi32(_mm256_mullo_epi32(bb2, row), _mm256_set1_epi32(C2.m128i_i32[lane]));

			__m256i sum0Row = _mm256_add_epi32(aa0Col, bb0Row);
			__m256i sum1Row = _mm256_add_epi32(aa1Col, bb1Row);
			__m256i sum2Row = _mm256_add_epi32(aa2Col, bb2Row);

			__m256 zx = _mm256_mul_ps(_mm256_cvtepi32_ps(aa1Inc), zz[1]);
			zx = _mm256_add_ps(zx, _mm256_mul_ps(_mm256_cvtepi32_ps(aa2Inc), zz[2]));

			// Incrementally compute Fab(x, y) for all the pixels inside the bounding box formed by (startX, endX) and (startY, endY)
			for (int r = startYy; r < endYy; r += 2,
				rowIdx += 2 * SCREENW,
				sum0Row = _mm256_add_epi32(sum0Row, bb0Inc),
				sum1Row = _mm256_add_epi32(sum1Row, bb1Inc),
				sum2Row = _mm256_add_epi32(sum2Row, bb2Inc))
			{
				// Compute barycentric coordinates 
				int index = rowIdx;
				__m256i alpha = sum0Row;
				__m256i beta = sum1Row;
				__m256i gama = sum2Row;

				//Compute barycentric-interpolated depth
				__m256 depth = zz[0];
				depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(beta), zz[1]));
				depth = _mm256_add_ps(depth, _mm256_mul_ps(_mm256_cvtepi32_ps(gama), zz[2]));
				__m256i anyOut = _mm256_setzero_si256();

				for (int c = startXx; c < endXx; c += 4,
					index += 8,
					alpha = _mm256_add_epi32(alpha, aa0Inc),
					beta = _mm256_add_epi32(beta, aa1Inc),
					gama = _mm256_add_epi32(gama, aa2Inc),
					depth = _mm256_add_ps(depth, zx))
				{
					//Test Pixel inside triangle
					__m256i mask = _mm256_or_si256(_mm256_or_si256(alpha, beta), gama);

					__m256 previousDepthValue = _mm256_loadu_ps(&pDepthBuffer[index]);
					__m256 depthMask = _mm256_cmp_ps(depth, previousDepthValue, 0x1D);
					__m256i finalMask = _mm256_andnot_si256(mask, _mm256_castps_si256(depthMask));
					anyOut = _mm256_or_si256(anyOut, finalMask);
				}//for each column	

				if (!_mm256_testz_si256(anyOut, _mm256_set1_epi32(0x80000000)))
				{
					return true; //early exit
				}
			}// for each row
		}// for each triangle
	}// for each set of SIMD# triangles

	return false;
}