Exemplo n.º 1
0
// Eight-lane single-precision sine.
//
// The routine works on |x| and re-applies the sign at the end: the magnitude is
// scaled by _ps_cephes_FOPI, turned into an even integer index j, reduced by
// j * _ps_minus_cephes_DP123, and then either the sine or the cosine polynomial
// is kept per lane depending on j.
// NOTE(review): the values of the _ps_* / _pi32_* constants are defined outside
// this block; their names suggest the Cephes sinf constants — confirm.
inline avx_m256_t newsin_ps(avx_m256_t x) {
	// Split the argument into sign bit and magnitude.
	const avx_m256_t input_sign = _mm256_and_ps(x, _ps_sign_mask);
	const avx_m256_t magnitude = _mm256_and_ps(x, _ps_inv_sign_mask);

	// j = (trunc(|x| * FOPI) + _pi32_1) & _pi32_inv1
	const avx_m256_t scaled = _mm256_mul_ps(magnitude, _ps_cephes_FOPI);
	const avx_m256i_t octant = _mm256_and_si256(
		_mm256_add_epi32(_mm256_cvttps_epi32(scaled), _pi32_1), _pi32_inv1);
	const avx_m256_t octant_f = _mm256_cvtepi32_ps(octant);

	// (j & _pi32_4) << 29 is XORed into the sign of the result
	// (presumably moves bit 2 of j into the float sign bit — confirm _pi32_4 == 4).
	const avx_m256_t sign_flip = _mm256_castsi256_ps(
		_mm256_slli_epi32(_mm256_and_si256(octant, _pi32_4), 29));
	const avx_m256_t result_sign = _mm256_xor_ps(input_sign, sign_flip);

	// All-ones in the lanes where (j & _pi32_2) == 0; those lanes keep the sine polynomial.
	const avx_m256_t take_sin = _mm256_castsi256_ps(
		_mm256_cmpeq_epi32(_mm256_and_si256(octant, _pi32_2), _mm256_setzero_si256()));

	// Range reduction: r = |x| + j * _ps_minus_cephes_DP123.
	const avx_m256_t r = _mm256_add_ps(magnitude,
		_mm256_mul_ps(octant_f, _ps_minus_cephes_DP123));

	const avx_m256_t r2 = _mm256_mul_ps(r, r);
	const avx_m256_t r3 = _mm256_mul_ps(r2, r);
	const avx_m256_t r4 = _mm256_mul_ps(r2, r2);

	// Cosine branch: ((p0*r2 + p1)*r2 + p2)*r4 - (0.5*r2 - 1)
	avx_m256_t cos_poly = _mm256_mul_ps(_ps_coscof_p0, r2);
	cos_poly = _mm256_add_ps(cos_poly, _ps_coscof_p1);
	cos_poly = _mm256_mul_ps(cos_poly, r2);
	cos_poly = _mm256_add_ps(cos_poly, _ps_coscof_p2);
	cos_poly = _mm256_mul_ps(cos_poly, r4);
	const avx_m256_t half_r2_minus_one = _mm256_sub_ps(_mm256_mul_ps(r2, _ps_0p5), _ps_1);
	cos_poly = _mm256_sub_ps(cos_poly, half_r2_minus_one);

	// Sine branch: ((p0*r2 + p1)*r2 + p2)*r3 + r
	avx_m256_t sin_poly = _mm256_mul_ps(_ps_sincof_p0, r2);
	sin_poly = _mm256_add_ps(sin_poly, _ps_sincof_p1);
	sin_poly = _mm256_mul_ps(sin_poly, r2);
	sin_poly = _mm256_add_ps(sin_poly, _ps_sincof_p2);
	sin_poly = _mm256_mul_ps(sin_poly, r3);
	sin_poly = _mm256_add_ps(sin_poly, r);

	// Per-lane blend: one operand of the add is always zeroed by the mask.
	const avx_m256_t blended = _mm256_add_ps(
		_mm256_andnot_ps(take_sin, cos_poly),
		_mm256_and_ps(take_sin, sin_poly));

	// Apply the combined sign.
	return _mm256_xor_ps(blended, result_sign);
} // newsin_ps()
Exemplo n.º 2
0
// Compares mrank against the ranks held in the queue registers _rv[0] .. _rv[4] (signed 32-bit
// comparisons).  Returns -1 if the rank already exists in the queue or if no stored rank compares
// greater than it.
// Otherwise, returns the index of the register in which the value should be inserted.
// gtmask receives the greater-than mask of the last register group that was examined; the mask is
// replicated to both lanes, so it can be used for both value and rank lane.
// NOTE(review): `field` is not touched in the part of the function visible here, and the visible
// text ends before the final return — consult the complete definition for both.
int PriorityQueue_AVX2::compare(__m256i mrank, int &field, __m256i &gtmask)
{
    // All-ones in the four low 32-bit elements, zero in the four high ones.
    static const __m256i eq4mask = _mm256_set_epi32(0, 0, 0, 0, -1, -1, -1, -1);
    __m256i eq, eq4;
    int reg, mask;

    // Because items are sorted in ascending order within each (double) register, the mask after GT
    // comparison must be of the form 000...1111, which is one less than a power of two.
    {
        // Low 128 bits of _rv[1] go to the lower lane, low 128 bits of _rv[0] to the upper lane.
        __m256i r0_7 = _mm256_permute2x128_si256(_rv[1], _rv[0], 0x20);		// [0 .. 7]
        gtmask = _mm256_cmpgt_epi32(r0_7, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_cmpeq_epi32(r0_7, mrank);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 1;
    }

    // Nothing greater found so far: try the next pair of registers.
    if (!mask) {
        __m256i r8_15 = _mm256_permute2x128_si256(_rv[3], _rv[2], 0x20);	// [8 .. 15]
        gtmask = _mm256_cmpgt_epi32(r8_15, mrank);
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask));
        eq = _mm256_or_si256(eq, _mm256_cmpeq_epi32(r8_15, mrank));     // accumulate equality hits
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 3;
    }

    // Still nothing: only the low four elements of the last register take part.
    if (!mask) {
        gtmask = _mm256_cmpgt_epi32(_rv[4], mrank);							// [16 .. 19]; don't care about value
        eq4 = _mm256_and_si256(eq4mask, _mm256_cmpeq_epi32(mrank, _rv[4])); // .. ditto
        mask = _mm256_movemask_ps(_mm256_castsi256_ps(gtmask)) & 0xF;       // ignore comparison with values
        eq = _mm256_or_si256(eq, eq4);
        _ASSERTE(((mask + 1) & mask) == 0);
        reg = 4;
    }

    // Any equality hit in an examined register means the rank is already present.
    if (_mm256_movemask_ps(_mm256_castsi256_ps(eq)) != 0)
        mask = 0;
    if (!mask)
        return -1;

    // Adjust register according to mask (higher 128 bits in the double register: one register lower).
    // There is no "previous" register to test against for equality if we need to insert in the
    // very first register.  Also duplicate the same mask to both lanes.

    if (mask > 0xF) {
        mask >>= 4;
        --reg;
        gtmask = _mm256_permute2x128_si256(gtmask, gtmask, 0x11);           // replicate high lane to both
    }
Exemplo n.º 3
0
/* Eight-lane single-precision exp(x).
 *
 * x is clamped to [m256_ps_exp_lo, m256_ps_exp_hi], split as x = g + n*log(2)
 * with integer n, exp(g) is evaluated with a polynomial and the result is
 * scaled by 2^n built directly in the float exponent field.
 *
 * No _mm256_zeroupper() is issued before returning: the return value is a live
 * 256-bit register, so VZEROUPPER at that point either forces the compiler to
 * spill/reload the result or (when the clobber is not modelled) wipes its
 * upper four lanes.
 *
 * NOTE(review): the m256_ps_* / m256_pi32_* tables are defined outside this
 * block; their names suggest the Cephes expf constants — confirm.
 */
__m256 mm256_exp_ps(__m256 x) {
  __m256 fx;
  __m256i emm0;
  __m256 one = *(__m256*)m256_ps_1;

  /* clamp the argument */
  x = _mm256_min_ps(x, *(__m256*)m256_ps_exp_hi);
  x = _mm256_max_ps(x, *(__m256*)m256_ps_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(__m256*)m256_ps_0p5);

  /* floorf without a rounding instruction: */
  /* step 1 : truncate to int */
  emm0 = _mm256_cvttps_epi32(fx);
  /* step 2 : convert back to float */
  __m256 tmp = _mm256_cvtepi32_ps(emm0);

  /* step 3 : truncation rounds toward zero, so subtract 1 where it rounded up */
  __m256 mask = _mm256_cmp_ps( tmp, fx, _CMP_GT_OS );
  mask = _mm256_and_ps(mask, one);
  fx = _mm256_sub_ps(tmp, mask);

  /* g = x - n*C1 - n*C2 */
  tmp = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C1);
  __m256 z = _mm256_mul_ps(fx, *(__m256*)m256_ps_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);

  /* Horner evaluation: y = (((((p0*x + p1)*x + p2)*x + p3)*x + p4)*x + p5)*x^2 + x + 1 */
  __m256 y = *(__m256*)m256_ps_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n: (n + m256_pi32_0x7f) shifted into the exponent field */
  emm0 = _mm256_cvttps_epi32(fx);
  emm0 = _mm256_add_epi32(emm0, *(__m256i*)m256_pi32_0x7f);
  emm0 = _mm256_slli_epi32(emm0, 23);
  __m256 pow2n = _mm256_castsi256_ps(emm0);

  y = _mm256_mul_ps(y, pow2n);
  return y;
}
Exemplo n.º 4
0
/* Eight-lane single-precision exp(x).
 *
 * x is clamped to [_ps256_exp_lo, _ps256_exp_hi], split as x = g + n*log(2)
 * with n = floor(x*LOG2EF + 0.5), exp(g) is evaluated with a polynomial and the
 * result is scaled by 2^n built directly in the float exponent field.
 *
 * _mm256_floor_ps already rounds toward minus infinity, so no "subtract 1 if
 * the rounded value is greater" correction is needed after it (that comparison
 * can never be true).
 *
 * NOTE(review): the _ps256_* / _pi32_256_* tables are defined outside this
 * block; their names suggest the Cephes expf constants — confirm.
 */
v8sf exp256_ps(v8sf x) {
  v8sf fx;
  v8si imm0;
  v8sf one = *(v8sf*)_ps256_1;

  /* clamp the argument */
  x = _mm256_min_ps(x, *(v8sf*)_ps256_exp_hi);
  x = _mm256_max_ps(x, *(v8sf*)_ps256_exp_lo);

  /* express exp(x) as exp(g + n*log(2)) */
  fx = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_LOG2EF);
  fx = _mm256_add_ps(fx, *(v8sf*)_ps256_0p5);

  /* n = floor(fx), kept as float */
  fx = _mm256_floor_ps(fx);

  /* g = x - n*C1 - n*C2 */
  v8sf tmp = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C1);
  v8sf z = _mm256_mul_ps(fx, *(v8sf*)_ps256_cephes_exp_C2);
  x = _mm256_sub_ps(x, tmp);
  x = _mm256_sub_ps(x, z);

  z = _mm256_mul_ps(x,x);

  /* Horner evaluation: y = (((((p0*x + p1)*x + p2)*x + p3)*x + p4)*x + p5)*x^2 + x + 1 */
  v8sf y = *(v8sf*)_ps256_cephes_exp_p0;
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p1);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p2);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p3);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p4);
  y = _mm256_mul_ps(y, x);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_cephes_exp_p5);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, x);
  y = _mm256_add_ps(y, one);

  /* build 2^n */
  imm0 = _mm256_cvttps_epi32(fx);
  /* another two AVX2 instructions: add _pi32_256_0x7f, shift into the exponent field */
  imm0 = _mm256_add_epi32(imm0, *(v8si*)_pi32_256_0x7f);
  imm0 = _mm256_slli_epi32(imm0, 23);
  v8sf pow2n = _mm256_castsi256_ps(imm0);
  y = _mm256_mul_ps(y, pow2n);
  return y;
}
    __m256 _inner_mm256_exp_ps1(__m256 arg)
    {
        arg = _mm256_mul_ps(arg, _mm256_set1_ps(1.4426950408889634073599246810018921374266459541529859f));

        __m256i e = _mm256_add_epi32(
            _mm256_castps_si256(_mm256_cmp_ps(arg, _mm256_set1_ps(0.0f), _CMP_LT_OQ)),
            _mm256_cvttps_epi32(arg));

        arg = _mm256_sub_ps(arg, _mm256_cvtepi32_ps(e));

        __m256 intermediate_result;
        intermediate_result = _mm256_fmadd_ps(_mm256_set1_ps(0.0136779459179717f), arg, _mm256_set1_ps(0.0517692205767896f));
        intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.241554388295527f));
        intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.692998430056128f));
        intermediate_result = _mm256_fmadd_ps(intermediate_result, arg, _mm256_set1_ps(0.999999804292074f));
        arg = intermediate_result;

        __m256 res = _mm256_castsi256_ps(_mm256_slli_epi32(_mm256_add_epi32(e, _mm256_set1_epi32(127)), 23));

        res = _mm256_mul_ps(res, arg);

        return res;
    }
Exemplo n.º 6
0
// Computes sine and cosine of two independent eight-lane vectors in one pass:
//   *s1 = sin(x1), *c1 = cos(x1), *s2 = sin(x2), *c2 = cos(x2).
//
// The two computations are interleaved statement by statement.  All 256-bit
// integer operations are emulated: bitwise ops go through the float domain,
// integer add/sub go through float arithmetic plus conversion, and the 32-bit
// shifts are done on the two 128-bit halves separately (the integer forms each
// step stands for are given in the comments).
//
// NOTE(review): the values of the _ps_* / _pi32_* constants are defined outside
// this block; their names suggest the Cephes sinf/cosf constants — confirm.
// NOTE(review): the "== 0" tests below compare the integer patterns 0 / 2 as
// floats, i.e. 2 is a denormal; this presumably breaks if denormals-are-zero
// mode is enabled — confirm the MXCSR settings of callers.
inline void newsincos_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t *s1, avx_m256_t *s2,
						avx_m256_t *c1, avx_m256_t *c2) {
	// Separate sign and magnitude of both inputs.
	avx_m256_t tempa = _ps_sign_mask;
	avx_m256_t tempb = _ps_inv_sign_mask;
	avx_m256_t sign_bit1 = _mm256_and_ps(x1, tempa);
	avx_m256_t sign_bit2 = _mm256_and_ps(x2, tempa);
	x1 = _mm256_and_ps(x1, tempb);
	x2 = _mm256_and_ps(x2, tempb);

	// Scale by FOPI.
	tempa = _ps_cephes_FOPI;
	avx_m256_t y1 = _mm256_mul_ps(x1, tempa);
	avx_m256_t y2 = _mm256_mul_ps(x2, tempa);

	// emm2 = trunc(y) + 1, with the +1 done in float before the conversion.
	avx_m256i_t emm21 = _mm256_cvttps_epi32(_mm256_add_ps(y1, _ps_1));
	avx_m256i_t emm22 = _mm256_cvttps_epi32(_mm256_add_ps(y2, _ps_1));

	// emm2 &= _pi32_inv1
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21), _mm256_castsi256_ps(_pi32_inv1)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22), _mm256_castsi256_ps(_pi32_inv1)));

	y1 = _mm256_cvtepi32_ps(emm21);
	y2 = _mm256_cvtepi32_ps(emm22);

	// cos_emm2 = emm2 - 2, computed in float and converted back.
	avx_m256i_t cos_emm21 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm21), _ps_2));
	avx_m256i_t cos_emm22 = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(emm22), _ps_2));

	// emm0 = emm2 & _pi32_4  (sign-swap flag of the sine)
	avx_m256i_t emm01 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
											_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t emm02 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
											_mm256_castsi256_ps(_pi32_4)));

	// cos_emm0 = ~cos_emm2 & _pi32_4  (sign flag of the cosine)
	avx_m256i_t cos_emm01 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm21),
											_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t cos_emm02 = _mm256_castps_si256(_mm256_andnot_ps(_mm256_castsi256_ps(cos_emm22),
											_mm256_castsi256_ps(_pi32_4)));

	// emm01 <<= 29, one 128-bit half at a time.
	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 1);
	emm0lo1 = _mm_slli_epi32(emm0lo1, 29);
	emm0hi1 = _mm_slli_epi32(emm0hi1, 29);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 1);

	// emm02 <<= 29.  The shifted halves of emm02 itself must be written back here;
	// inserting the halves of emm01 would give *s2 the sign swap belonging to x1.
	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 1);
	emm0lo2 = _mm_slli_epi32(emm0lo2, 29);
	emm0hi2 = _mm_slli_epi32(emm0hi2, 29);
	emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 1);

	// cos_emm01 <<= 29
	__m128i cos_emm0lo1 = _mm256_extractf128_si256(cos_emm01, 0);
	__m128i cos_emm0hi1 = _mm256_extractf128_si256(cos_emm01, 1);
	cos_emm0lo1 = _mm_slli_epi32(cos_emm0lo1, 29);
	cos_emm0hi1 = _mm_slli_epi32(cos_emm0hi1, 29);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0lo1, 0);
	cos_emm01 = _mm256_insertf128_si256(cos_emm01, cos_emm0hi1, 1);

	// cos_emm02 <<= 29
	__m128i cos_emm0lo2 = _mm256_extractf128_si256(cos_emm02, 0);
	__m128i cos_emm0hi2 = _mm256_extractf128_si256(cos_emm02, 1);
	cos_emm0lo2 = _mm_slli_epi32(cos_emm0lo2, 29);
	cos_emm0hi2 = _mm_slli_epi32(cos_emm0hi2, 29);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0lo2, 0);
	cos_emm02 = _mm256_insertf128_si256(cos_emm02, cos_emm0hi2, 1);

	// emm2 &= _pi32_2
	emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm21),
											_mm256_castsi256_ps(_pi32_2)));
	emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(emm22),
											_mm256_castsi256_ps(_pi32_2)));

	// cos_emm2 &= _pi32_2
	cos_emm21 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm21),
											_mm256_castsi256_ps(_pi32_2)));
	cos_emm22 = _mm256_castps_si256(_mm256_and_ps(_mm256_castsi256_ps(cos_emm22),
											_mm256_castsi256_ps(_pi32_2)));

	// emm2 = (emm2 == 0) ? all-ones : 0   (polynomial selection mask of the sine)
	emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));

	// cos_emm2 = (cos_emm2 == 0) ? all-ones : 0   (polynomial selection mask of the cosine)
	cos_emm21 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm21), _mm256_setzero_ps(), _CMP_EQ_UQ));
	cos_emm22 = _mm256_castps_si256(_mm256_cmp_ps(_mm256_castsi256_ps(cos_emm22), _mm256_setzero_ps(), _CMP_EQ_UQ));
	
	avx_m256_t emm0f1 = _mm256_castsi256_ps(emm01);
	avx_m256_t emm0f2 = _mm256_castsi256_ps(emm02);
	avx_m256_t emm2f1 = _mm256_castsi256_ps(emm21);
	avx_m256_t emm2f2 = _mm256_castsi256_ps(emm22);
	avx_m256_t cos_emm0f1 = _mm256_castsi256_ps(cos_emm01);
	avx_m256_t cos_emm0f2 = _mm256_castsi256_ps(cos_emm02);
	avx_m256_t cos_emm2f1 = _mm256_castsi256_ps(cos_emm21);
	avx_m256_t cos_emm2f2 = _mm256_castsi256_ps(cos_emm22);

	// Fold the sine sign-swap flag into the input sign.
	sign_bit1 = _mm256_xor_ps(sign_bit1, emm0f1);
	sign_bit2 = _mm256_xor_ps(sign_bit2, emm0f2);

	// Range reduction: x += y * _ps_minus_cephes_DP123.
	tempa = _ps_minus_cephes_DP123;
	tempb = _mm256_mul_ps(y2, tempa);
	tempa = _mm256_mul_ps(y1, tempa);
	x2 = _mm256_add_ps(x2, tempb);
	x1 = _mm256_add_ps(x1, tempa);

	// Powers of the reduced argument.
	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);

	// Both polynomials are evaluated for both inputs:
	//   y1/y2   (cosine form): ((c0*x^2 + c1)*x^2 + c2)*x^4 - (0.5*x^2 - 1)
	//   y21/y22 (sine form)  : ((s0*x^2 + s1)*x^2 + s2)*x^3 + x
	tempa = _ps_coscof_p0;
	tempb = _ps_sincof_p0;

	y1 = _mm256_mul_ps(x21, tempa);
	y2 = _mm256_mul_ps(x22, tempa);
	avx_m256_t y21 = _mm256_mul_ps(x21, tempb);
	avx_m256_t y22 = _mm256_mul_ps(x22, tempb);
	tempa = _ps_coscof_p1;
	tempb = _ps_sincof_p1;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x21);
	y2 = _mm256_mul_ps(y2, x22);
	y21 = _mm256_mul_ps(y21, x21);
	y22 = _mm256_mul_ps(y22, x22);
	tempa = _ps_coscof_p2;
	tempb = _ps_sincof_p2;
	y1 = _mm256_add_ps(y1, tempa);
	y2 = _mm256_add_ps(y2, tempa);
	y21 = _mm256_add_ps(y21, tempb);
	y22 = _mm256_add_ps(y22, tempb);
	y1 = _mm256_mul_ps(y1, x41);
	y2 = _mm256_mul_ps(y2, x42);
	y21 = _mm256_mul_ps(y21, x31);
	y22 = _mm256_mul_ps(y22, x32);
	tempa = _ps_0p5;
	tempb = _ps_1;
	avx_m256_t temp_21 = _mm256_mul_ps(x21, tempa);
	avx_m256_t temp_22 = _mm256_mul_ps(x22, tempa);
	y21 = _mm256_add_ps(y21, x1);
	y22 = _mm256_add_ps(y22, x2);
	temp_21 = _mm256_sub_ps(temp_21, tempb);
	temp_22 = _mm256_sub_ps(temp_22, tempb);
	y1 = _mm256_sub_ps(y1, temp_21);
	y2 = _mm256_sub_ps(y2, temp_22);

	// Per-lane selection: the sine result and the cosine result each pick one of the
	// two polynomial values according to their own mask (the other operand of the add
	// is zeroed).
	avx_m256_t cos_y1 = y1;
	avx_m256_t cos_y2 = y2;
	avx_m256_t cos_y21 = y21;
	avx_m256_t cos_y22 = y22;
	y1 = _mm256_andnot_ps(emm2f1, y1);
	y2 = _mm256_andnot_ps(emm2f2, y2);
	cos_y1 = _mm256_andnot_ps(cos_emm2f1, cos_y1);
	cos_y2 = _mm256_andnot_ps(cos_emm2f2, cos_y2);
	y21 = _mm256_and_ps(emm2f1, y21);
	y22 = _mm256_and_ps(emm2f2, y22);
	cos_y21 = _mm256_and_ps(cos_emm2f1, cos_y21);
	cos_y22 = _mm256_and_ps(cos_emm2f2, cos_y22);
	y1 = _mm256_add_ps(y1, y21);
	y2 = _mm256_add_ps(y2, y22);
	cos_y1 = _mm256_add_ps(cos_y1, cos_y21);
	cos_y2 = _mm256_add_ps(cos_y2, cos_y22);

	// Apply the signs and store the four results.
	*s1 = _mm256_xor_ps(y1, sign_bit1);
	*s2 = _mm256_xor_ps(y2, sign_bit2);
	*c1 = _mm256_xor_ps(cos_y1, cos_emm0f1);
	*c2 = _mm256_xor_ps(cos_y2, cos_emm0f2);
} // newsincos_ps_dual()
Exemplo n.º 7
0
/* since sin256_ps and cos256_ps are almost identical, sincos256_ps could replace both of them..
   it is almost as fast, and gives you a free cosine with your sine.

   Computes *s = sin(x) and *c = cos(x) for eight floats.  With __AVX2__ the
   integer steps use 256-bit integer intrinsics; otherwise they are carried out
   on the two 128-bit halves with SSE2 intrinsics via the COPY_IMM_TO_XMM /
   COPY_XMM_TO_IMM helpers.

   The AVX2 path uses the standard _mm256_and_si256 / _mm256_andnot_si256
   intrinsics for the 256-bit bitwise operations (there is no Intel intrinsic
   named _mm256_and_si128 or _mm256_andnot_si128). */
void sincos256_ps(v8sf x, v8sf *s, v8sf *c) {

  v8sf xmm1, xmm2, xmm3 = _mm256_setzero_ps(), sign_bit_sin, y;
  v8si imm0, imm2, imm4;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
  v4si imm4_1, imm4_2;
#endif

  sign_bit_sin = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit_sin = _mm256_and_ps(sign_bit_sin, *(v8sf*)_ps256_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

#ifdef __AVX2__    
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);

  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);

  y = _mm256_cvtepi32_ps(imm2);
  /* keep j for the cosine sign computed further down */
  imm4 = imm2;

  /* get the swap sign flag for the sine */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);

  /* get the polynom selection mask for the sine*/
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2, *(v8si*)_pi32_256_0);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  /* j=(j+1) & (~1), on each half */
  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);
  
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  /* keep j for the cosine sign computed further down */
  imm4_1 = imm2_1;
  imm4_2 = imm2_2;

  /* swap sign flag for the sine */
  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);
  
  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  /* polynom selection mask for the sine */
  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
  v8sf swap_sign_bit_sin = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);

  /* The magic pass: "Extended precision modular arithmetic"
     x = ((x - y * DP1) - y * DP2) - y * DP3; */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* sign flag for the cosine: (~(j - 2) & 4) << 29 */
#ifdef __AVX2__
  imm4 = _mm256_sub_epi32(imm4, *(v8si*)_pi32_256_2);
  imm4 = _mm256_andnot_si256(imm4, *(v8si*)_pi32_256_4);
  imm4 = _mm256_slli_epi32(imm4, 29);
#else
  imm4_1 = _mm_sub_epi32(imm4_1, *(v4si*)_pi32avx_2);
  imm4_2 = _mm_sub_epi32(imm4_2, *(v4si*)_pi32avx_2);

  imm4_1 = _mm_andnot_si128(imm4_1, *(v4si*)_pi32avx_4);
  imm4_2 = _mm_andnot_si128(imm4_2, *(v4si*)_pi32avx_4);
  
  imm4_1 = _mm_slli_epi32(imm4_1, 29);
  imm4_2 = _mm_slli_epi32(imm4_2, 29);

  COPY_XMM_TO_IMM(imm4_1, imm4_2, imm4);
#endif

  v8sf sign_bit_cos = _mm256_castsi256_ps(imm4);

  sign_bit_sin = _mm256_xor_ps(sign_bit_sin, swap_sign_bit_sin);
  
  /* Evaluate the first polynom (cosine form) on the reduced argument */
  v8sf z = _mm256_mul_ps(x,x);
  y = *(v8sf*)_ps256_coscof_p0;

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
  
  /* Evaluate the second polynom (sine form) on the same reduced argument */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms:
     the sine takes the masked parts, the cosine takes what is left over */  
  xmm3 = poly_mask;
  v8sf ysin2 = _mm256_and_ps(xmm3, y2);
  v8sf ysin1 = _mm256_andnot_ps(xmm3, y);
  y2 = _mm256_sub_ps(y2,ysin2);
  y = _mm256_sub_ps(y, ysin1);

  xmm1 = _mm256_add_ps(ysin1,ysin2);
  xmm2 = _mm256_add_ps(y,y2);
 
  /* update the sign */
  *s = _mm256_xor_ps(xmm1, sign_bit_sin);
  *c = _mm256_xor_ps(xmm2, sign_bit_cos);
}
Exemplo n.º 8
0
// Computes *s = sin(x) and *c = cos(x) for eight floats in one pass.
//
// 256-bit integer operations are avoided throughout: bitwise steps run in the
// float domain, integer add/sub are done as float arithmetic plus conversion,
// and the 32-bit shifts are applied to the two 128-bit halves separately.
// NOTE(review): the values of the _ps_* / _pi32_* constants are defined outside
// this block; their names suggest the Cephes sinf/cosf constants — confirm.
inline void newsincos_ps(avx_m256_t x, avx_m256_t *s, avx_m256_t *c) {
	// Sign and magnitude of the input.
	avx_m256_t sin_sign = _mm256_and_ps(x, _ps_sign_mask);
	x = _mm256_and_ps(x, _ps_inv_sign_mask);

	// j = (trunc(|x| * FOPI + 1)) & _pi32_inv1
	const avx_m256_t scaled = _mm256_mul_ps(x, _ps_cephes_FOPI);
	const avx_m256i_t j_plus_one = _mm256_cvttps_epi32(_mm256_add_ps(scaled, _ps_1));
	const avx_m256_t j_bits = _mm256_and_ps(_mm256_castsi256_ps(j_plus_one),
											_mm256_castsi256_ps(_pi32_inv1));
	const avx_m256i_t j = _mm256_castps_si256(j_bits);
	const avx_m256_t j_f = _mm256_cvtepi32_ps(j);

	// Cosine index: j - 2, computed in float and converted back to integer.
	const avx_m256i_t j_cos = _mm256_cvtps_epi32(_mm256_sub_ps(_mm256_cvtepi32_ps(j), _ps_2));
	const avx_m256_t j_cos_bits = _mm256_castsi256_ps(j_cos);

	// Sign flags before shifting: sine uses j & _pi32_4, cosine uses ~(j - 2) & _pi32_4.
	avx_m256i_t sin_flip = _mm256_castps_si256(_mm256_and_ps(j_bits,
											_mm256_castsi256_ps(_pi32_4)));
	avx_m256i_t cos_flip = _mm256_castps_si256(_mm256_andnot_ps(j_cos_bits,
											_mm256_castsi256_ps(_pi32_4)));

	// sin_flip <<= 29, one 128-bit half at a time.
	__m128i sin_half0 = _mm_slli_epi32(_mm256_extractf128_si256(sin_flip, 0), 29);
	__m128i sin_half1 = _mm_slli_epi32(_mm256_extractf128_si256(sin_flip, 1), 29);
	sin_flip = _mm256_insertf128_si256(sin_flip, sin_half0, 0);
	sin_flip = _mm256_insertf128_si256(sin_flip, sin_half1, 1);

	// cos_flip <<= 29, same scheme.
	__m128i cos_half0 = _mm_slli_epi32(_mm256_extractf128_si256(cos_flip, 0), 29);
	__m128i cos_half1 = _mm_slli_epi32(_mm256_extractf128_si256(cos_flip, 1), 29);
	cos_flip = _mm256_insertf128_si256(cos_flip, cos_half0, 0);
	cos_flip = _mm256_insertf128_si256(cos_flip, cos_half1, 1);

	// Polynomial selection masks: all-ones where (index & _pi32_2) == 0.
	// The comparison is a float compare of the masked bit patterns against 0.0f.
	const avx_m256_t sin_bit2 = _mm256_and_ps(j_bits, _mm256_castsi256_ps(_pi32_2));
	const avx_m256_t cos_bit2 = _mm256_and_ps(j_cos_bits, _mm256_castsi256_ps(_pi32_2));
	const avx_m256_t sin_select = _mm256_cmp_ps(sin_bit2, _mm256_setzero_ps(), _CMP_EQ_UQ);
	const avx_m256_t cos_select = _mm256_cmp_ps(cos_bit2, _mm256_setzero_ps(), _CMP_EQ_UQ);

	const avx_m256_t sin_flip_f = _mm256_castsi256_ps(sin_flip);
	const avx_m256_t cos_flip_f = _mm256_castsi256_ps(cos_flip);

	// Combine the input sign with the sine sign flag.
	sin_sign = _mm256_xor_ps(sin_sign, sin_flip_f);

	// Range reduction: x += j * _ps_minus_cephes_DP123.
	x = _mm256_add_ps(x, _mm256_mul_ps(j_f, _ps_minus_cephes_DP123));

	const avx_m256_t sq = _mm256_mul_ps(x, x);
	const avx_m256_t cube = _mm256_mul_ps(sq, x);
	const avx_m256_t quad = _mm256_mul_ps(sq, sq);

	// Cosine-form polynomial: ((c0*sq + c1)*sq + c2)*quad - (0.5*sq - 1)
	avx_m256_t poly_c = _mm256_mul_ps(_ps_coscof_p0, sq);
	poly_c = _mm256_add_ps(poly_c, _ps_coscof_p1);
	poly_c = _mm256_mul_ps(poly_c, sq);
	poly_c = _mm256_add_ps(poly_c, _ps_coscof_p2);
	poly_c = _mm256_mul_ps(poly_c, quad);
	const avx_m256_t half_sq_minus_one = _mm256_sub_ps(_mm256_mul_ps(sq, _ps_0p5), _ps_1);
	poly_c = _mm256_sub_ps(poly_c, half_sq_minus_one);

	// Sine-form polynomial: ((s0*sq + s1)*sq + s2)*cube + x
	avx_m256_t poly_s = _mm256_mul_ps(_ps_sincof_p0, sq);
	poly_s = _mm256_add_ps(poly_s, _ps_sincof_p1);
	poly_s = _mm256_mul_ps(poly_s, sq);
	poly_s = _mm256_add_ps(poly_s, _ps_sincof_p2);
	poly_s = _mm256_mul_ps(poly_s, cube);
	poly_s = _mm256_add_ps(poly_s, x);

	// Each output blends the two polynomial values with its own mask
	// (one operand of every add is zeroed by the mask).
	const avx_m256_t sin_val = _mm256_add_ps(_mm256_andnot_ps(sin_select, poly_c),
											_mm256_and_ps(sin_select, poly_s));
	const avx_m256_t cos_val = _mm256_add_ps(_mm256_andnot_ps(cos_select, poly_c),
											_mm256_and_ps(cos_select, poly_s));

	// Apply the signs.
	*s = _mm256_xor_ps(sin_val, sin_sign);
	*c = _mm256_xor_ps(cos_val, cos_flip_f);
} // newsincos_ps()
Exemplo n.º 9
0
 /* Bitwise XOR of two avxi values, carried out with the float-domain XOR
    instruction on reinterpreted (not converted) bits. */
 INLINE const avxi operator  ^( const avxi& a, const avxi& b ) {
   const __m256 lhs_bits = _mm256_castsi256_ps(a);
   const __m256 rhs_bits = _mm256_castsi256_ps(b);
   return _mm256_castps_si256(_mm256_xor_ps(lhs_bits, rhs_bits));
 }
Exemplo n.º 10
0
// Computes *exp1 = exp(x1) and *exp2 = exp(x2) for two independent eight-lane
// vectors, with the two computations interleaved.
//
// Each input is clamped to [_ps_exp_lo, _ps_exp_hi], split as x = g + n*log(2)
// with n = floor(x*LOG2EF + 0.5), exp(g) is evaluated as
//   1 + g + p5*g^2 + p4*g^3 + (p0*g^3 + p1*g^2 + p2*g + p3)*g^4
// and the result is multiplied by 2^n, built by writing n + _pi32_0x7f into the
// float exponent field.
//
// 256-bit integer add/shift are not used; they are performed on the two
// 128-bit halves.  The exponent bias must be added with a real integer add:
// a float add on the raw bit patterns leaves every negative n unchanged
// (its bit pattern is a NaN), which corrupts 2^n.
//
// NOTE(review): the _ps_* / _pi32_* constants are defined outside this block;
// their names suggest the Cephes expf constants, with _ps_cephes_exp_C12
// presumably the combined C1 + C2 — confirm.
inline void newexp_ps_dual(avx_m256_t x1, avx_m256_t x2, avx_m256_t* exp1, avx_m256_t* exp2) {
	avx_m256_t one = _ps_1;
	avx_m256_t zero = _ps_0;

	// Clamp the arguments.
	x1 = _mm256_min_ps(x1, _ps_exp_hi);
	x2 = _mm256_min_ps(x2, _ps_exp_hi);
	x1 = _mm256_max_ps(x1, _ps_exp_lo);
	x2 = _mm256_max_ps(x2, _ps_exp_lo);

	// fx = x * LOG2EF + 0.5
	avx_m256_t temp_21 = _mm256_mul_ps(x1, _ps_cephes_LOG2EF);
	avx_m256_t temp_22 = _mm256_mul_ps(x2, _ps_cephes_LOG2EF);
	temp_21 = _mm256_add_ps(temp_21, _ps_0p5);
	temp_22 = _mm256_add_ps(temp_22, _ps_0p5);

	// floor(fx): truncate, then subtract 1 where truncation rounded up.
	avx_m256i_t emm01 = _mm256_cvttps_epi32(temp_21);
	avx_m256i_t emm02 = _mm256_cvttps_epi32(temp_22);
	avx_m256_t temp_11 = _mm256_cvtepi32_ps(emm01);
	avx_m256_t temp_12 = _mm256_cvtepi32_ps(emm02);
	avx_m256_t temp_31 = _mm256_sub_ps(temp_11, temp_21);
	avx_m256_t temp_32 = _mm256_sub_ps(temp_12, temp_22);
	avx_m256_t mask1 = _mm256_cmp_ps(temp_31, zero, _CMP_GT_OQ);
	avx_m256_t mask2 = _mm256_cmp_ps(temp_32, zero, _CMP_GT_OQ);

	mask1 = _mm256_and_ps(mask1, one);
	mask2 = _mm256_and_ps(mask2, one);
	temp_21 = _mm256_sub_ps(temp_11, mask1);
	temp_22 = _mm256_sub_ps(temp_12, mask2);
	// n as integer, used below to build 2^n.
	emm01 = _mm256_cvttps_epi32(temp_21);
	emm02 = _mm256_cvttps_epi32(temp_22);

	// g = x - n * C12
	temp_11 = _mm256_mul_ps(temp_21, _ps_cephes_exp_C12);
	temp_12 = _mm256_mul_ps(temp_22, _ps_cephes_exp_C12);
	x1 = _mm256_sub_ps(x1, temp_11);
	x2 = _mm256_sub_ps(x2, temp_12);

	// Powers of g.
	avx_m256_t x21 = _mm256_mul_ps(x1, x1);
	avx_m256_t x22 = _mm256_mul_ps(x2, x2);
	avx_m256_t x31 = _mm256_mul_ps(x21, x1);
	avx_m256_t x32 = _mm256_mul_ps(x22, x2);
	avx_m256_t x41 = _mm256_mul_ps(x21, x21);
	avx_m256_t x42 = _mm256_mul_ps(x22, x22);
 
	// Low-order part: 1 + g + p5*g^2 + p4*g^3
	temp_11 = _mm256_add_ps(x1, one);
	temp_12 = _mm256_add_ps(x2, one);
	temp_21 = _mm256_mul_ps(x21, _ps_cephes_exp_p5);
	temp_22 = _mm256_mul_ps(x22, _ps_cephes_exp_p5);
	temp_31 = _mm256_mul_ps(x31, _ps_cephes_exp_p4);
	temp_32 = _mm256_mul_ps(x32, _ps_cephes_exp_p4);
	temp_11 = _mm256_add_ps(temp_11, temp_21);
	temp_12 = _mm256_add_ps(temp_12, temp_22);

	// High-order part, first term: p0*g^3
	temp_21 = _mm256_mul_ps(x31, _ps_cephes_exp_p0);
	temp_22 = _mm256_mul_ps(x32, _ps_cephes_exp_p0);

	temp_11 = _mm256_add_ps(temp_11, temp_31);
	temp_12 = _mm256_add_ps(temp_12, temp_32);

	// p2*g and p1*g^2
	avx_m256_t temp_41 = _mm256_mul_ps(x1, _ps_cephes_exp_p2);
	avx_m256_t temp_42 = _mm256_mul_ps(x2, _ps_cephes_exp_p2);
	temp_31 = _mm256_mul_ps(x21, _ps_cephes_exp_p1);
	temp_32 = _mm256_mul_ps(x22, _ps_cephes_exp_p1);

	// p0*g^3 + p1*g^2
	temp_21 = _mm256_add_ps(temp_21, temp_31);
	temp_22 = _mm256_add_ps(temp_22, temp_32);
	// p2*g + p3  (the p3 term supplies the g^4 coefficient; p1*g^2 is already in temp_2x)
	temp_31 = _mm256_add_ps(temp_41, _ps_cephes_exp_p3);
	temp_32 = _mm256_add_ps(temp_42, _ps_cephes_exp_p3);

	// Build 2^n: (n + _pi32_0x7f) << 23, as integer operations on each 128-bit half.
	const __m128i bias_lo = _mm256_extractf128_si256(_pi32_0x7f, 0);
	const __m128i bias_hi = _mm256_extractf128_si256(_pi32_0x7f, 1);

	__m128i emm0lo1 = _mm256_extractf128_si256(emm01, 0);
	__m128i emm0hi1 = _mm256_extractf128_si256(emm01, 1);
	emm0lo1 = _mm_slli_epi32(_mm_add_epi32(emm0lo1, bias_lo), 23);
	emm0hi1 = _mm_slli_epi32(_mm_add_epi32(emm0hi1, bias_hi), 23);
	emm01 = _mm256_insertf128_si256(emm01, emm0lo1, 0);
	emm01 = _mm256_insertf128_si256(emm01, emm0hi1, 1);

	__m128i emm0lo2 = _mm256_extractf128_si256(emm02, 0);
	__m128i emm0hi2 = _mm256_extractf128_si256(emm02, 1);
	emm0lo2 = _mm_slli_epi32(_mm_add_epi32(emm0lo2, bias_lo), 23);
	emm0hi2 = _mm_slli_epi32(_mm_add_epi32(emm0hi2, bias_hi), 23);
	emm02 = _mm256_insertf128_si256(emm02, emm0lo2, 0);
	emm02 = _mm256_insertf128_si256(emm02, emm0hi2, 1);

	avx_m256_t pow2n1 = _mm256_castsi256_ps(emm01);
	avx_m256_t pow2n2 = _mm256_castsi256_ps(emm02);

	// (p0*g^3 + p1*g^2 + p2*g + p3) * g^4
	temp_21 = _mm256_add_ps(temp_21, temp_31);
	temp_22 = _mm256_add_ps(temp_22, temp_32);
	temp_21 = _mm256_mul_ps(temp_21, x41);
	temp_22 = _mm256_mul_ps(temp_22, x42);

	// exp(g) = low-order part + high-order part
	avx_m256_t y1 = _mm256_add_ps(temp_11, temp_21);
	avx_m256_t y2 = _mm256_add_ps(temp_12, temp_22);

	// exp(x) = exp(g) * 2^n
	*exp1 = _mm256_mul_ps(y1, pow2n1);
	*exp2 = _mm256_mul_ps(y2, pow2n2);
} // newexp_ps_dual()
Exemplo n.º 11
0
// Converts eight input values into a LUT position: an integer table index plus
// a fractional part, for the logarithmic mapper.
//
// The magnitude of each value is classified by comparing its raw bit pattern
// against two thresholds built from LOGLUT_MIN_L2 / LOGLUT_MAX_L2:
//   - below the lower threshold: index 0, fraction = |val| / val_min;
//   - below the upper threshold: index and fraction taken from the exponent and
//     mantissa bits;
//   - otherwise: index_max with fraction 1.
// Negative inputs (sign bit set) are then mirrored: index = (LOGLUT_HSIZE - 1) - i
// and fraction = 1 - f; non-negative inputs use index = LOGLUT_HSIZE + i.
//
// val_arr is read with aligned 256-bit loads.
// NOTE(review): fstb::ToolsAvx2::select (mask, a, b) presumably yields `a` where
// the mask is set and `b` elsewhere — confirm against its definition.
void	TransLut_FindIndexAvx2 <TransLut::MapperLog>::find_index (const TransLut::FloatIntMix val_arr [8], __m256i &index, __m256 &frac)
{
	assert (val_arr != 0);

	// Float bit-layout constants and the quantities derived from the LUT parameters
	static const int      mantissa_bits = 23;
	static const int      exponent_bias = 127;
	static const int      frac_bits     = mantissa_bits - LOGLUT_RES_L2;
	static const uint32_t frac_bitmask  = (1 << frac_bits) - 1;
	static const uint32_t range_start   = (exponent_bias + LOGLUT_MIN_L2) << mantissa_bits;
	static const float    smallest_val  = 1.0f / (int64_t (1) << -LOGLUT_MIN_L2);

	const __m256   f_zero = _mm256_setzero_ps ();
	const __m256   f_one  = _mm256_set1_ps (1);
	const __m256i  i_zero = _mm256_setzero_si256 ();

	// Inputs, viewed both as floats and as raw 32-bit integers, with the sign removed
	const __m256   abs_mask_f = _mm256_load_ps (
		reinterpret_cast <const float *> (fstb::ToolsAvx2::_mask_abs)
	);
	const __m256   in_f  = _mm256_load_ps (reinterpret_cast <const float *> (val_arr));
	const __m256i  in_i  = _mm256_load_si256 (reinterpret_cast <const __m256i *> (val_arr));
	const __m256   mag_f = _mm256_and_ps (in_f, abs_mask_f);
	const __m256i  mag_i = _mm256_and_si256 (in_i, _mm256_set1_epi32 (0x7FFFFFFF));

	// Standard path: index from the upper bits, fraction from the lower mantissa bits
	__m256i        idx_std = _mm256_sub_epi32 (mag_i, _mm256_set1_epi32 (int (range_start)));
	idx_std = _mm256_srli_epi32 (idx_std, frac_bits);
	idx_std = _mm256_add_epi32 (idx_std, _mm256_set1_epi32 (1));
	const __m256i  low_bits = _mm256_and_si256 (mag_i, _mm256_set1_epi32 (frac_bitmask));
	const __m256   frac_std = _mm256_mul_ps (
		_mm256_cvtepi32_ps (low_bits),
		_mm256_set1_ps (1.0f / (1 << frac_bits))
	);

	// Epsilon path: fraction proportional to the magnitude
	const __m256   frac_eps = _mm256_mul_ps (
		_mm256_max_ps (mag_f, f_zero),
		_mm256_set1_ps (1.0f / smallest_val)
	);

	// Range classification on the raw bit patterns
	const __m256i  below_min_i = _mm256_cmpgt_epi32 (
		_mm256_set1_epi32 ((LOGLUT_MIN_L2 + exponent_bias) << mantissa_bits), mag_i
	);
	const __m256i  below_max_i = _mm256_cmpgt_epi32 (
		_mm256_set1_epi32 ((LOGLUT_MAX_L2 + exponent_bias) << mantissa_bits), mag_i
	);
	const __m256   below_min_f = _mm256_castsi256_ps (below_min_i);
	const __m256   below_max_f = _mm256_castsi256_ps (below_max_i);

	__m256i        idx_unsigned = fstb::ToolsAvx2::select (
		below_max_i,
		idx_std,
		_mm256_set1_epi32 ((LOGLUT_MAX_L2 - LOGLUT_MIN_L2) << LOGLUT_RES_L2)
	);
	__m256         frac_unsigned =
		fstb::ToolsAvx2::select (below_max_f, frac_std, f_one);
	idx_unsigned  = fstb::ToolsAvx2::select (below_min_i, i_zero, idx_unsigned);
	frac_unsigned = fstb::ToolsAvx2::select (below_min_f, frac_eps, frac_unsigned);

	// Sign handling: an arithmetic shift spreads the sign bit over the whole element
	const __m256i  is_neg_i = _mm256_srai_epi32 (in_i, 31);
	const __m256   is_neg_f = _mm256_castsi256_ps (is_neg_i);
	const __m256i  idx_mirrored =
		_mm256_sub_epi32 (_mm256_set1_epi32 (LOGLUT_HSIZE - 1), idx_unsigned);
	const __m256i  idx_shifted  =
		_mm256_add_epi32 (_mm256_set1_epi32 (LOGLUT_HSIZE), idx_unsigned);
	const __m256   frac_mirrored = _mm256_sub_ps (f_one, frac_unsigned);

	index = fstb::ToolsAvx2::select (is_neg_i, idx_mirrored, idx_shifted);
	frac  = fstb::ToolsAvx2::select (is_neg_f, frac_mirrored, frac_unsigned);
}
Exemplo n.º 12
0
// ADMM decoding loop specialised for variable-node degrees {2, 3, 6} and
// check-node degrees {6, 7}, vectorised with AVX/AVX2 intrinsics.
//
// Each outer iteration runs two kernels:
//   1. VN kernel: for every group of 8 variable nodes, sums
//      (zReplica + Lambda) over the node's edges (indexed through t_row),
//      combines it with the LLR and the per-degree penalty, clamps the result
//      to [0, 1] and writes it to OutputFromDecoder.
//   2. CN kernel: for every check node, gathers its inputs from
//      OutputFromDecoder (indexed through t_col1), accumulates the parity of
//      the hard decisions into allVerified, and updates Lambda / zReplica via
//      mp.projection_deg6 / mp.projection_deg7.
// The loop stops early, setting mAlgorithmConverge and mValidCodeword, when
// every check has even parity (allVerified == 0).
//
// NOTE(review): the vectorised VN path handles nodes j..j+7 using the degree
// of node j only, so it assumes degrees come in runs that are multiples of 8
// and that _LogLikelihoodRatio / OutputFromDecoder have 8 readable/writable
// floats from j -- confirm against the code layout.
void Decoder::ADMMDecoder_deg_6_7_2_3_6()
{
	int maxIter          = maxIteration;
	float mu             = 5.5f; 
	float tableau[12]    = { 0.0f };

    // Penalty (mu) and per-degree coefficients (tableau[degree]) are selected
    // from the code dimensions; any other size falls back to generic values.
    if ((mBlocklength == 576) && (mNChecks == 288))
    {
     	mu          = 3.37309f;//penalty
        tableau[2]  = 0.00001f;
	tableau[3]  = 2.00928f;
	tableau[6]  = 4.69438f;

    }
    else if((mBlocklength == 2304) && (mNChecks == 1152) )
    {
    	mu          = 3.81398683f;//penalty
        tableau[2]  = 0.29669288f; 
	tableau[3]  = 0.46964023f;
	tableau[6]  = 3.19548154f;
    }
    else
    {
    	mu          = 5.5;//penalty
        tableau[2]  = 0.8f;
	tableau[3]  = 0.8f;
	tableau[6]  = 0.8f;
    }

    const float rho      = 1.9f;    //over relaxation parameter;
    const float un_m_rho = 1.0 - rho;
    const auto  _rho      = _mm256_set1_ps(      rho );
    const auto  _un_m_rho = _mm256_set1_ps( un_m_rho );
    float tableaX[12];

    //
    // PRECOMPUTE THE CONSTANTS
    // (tableaX is only read by the scalar "#else" variants below; entries
    //  7..11 are left uninitialised)
    //
	#pragma  unroll
    for (int i = 0; i < 7; i++)
    {
        tableaX[i] = tableau[ i ] / mu;
    }
	const auto t_mu    = _mm256_set1_ps ( mu );

	// tableau[d] / mu, broadcast, for d = 2, 3, 6
	const auto t2_amu  = _mm256_set1_ps (        tableau[ 2 ] / mu   );
	const auto t3_amu  = _mm256_set1_ps (        tableau[ 3 ] / mu   );
	const auto t6_amu  = _mm256_set1_ps (        tableau[ 6 ] / mu   );

	// 2 * tableau[d] / mu, broadcast, for d = 2, 3, 6
	const auto t2_2amu = _mm256_set1_ps ( 2.0f * tableau[ 2 ] / mu );
	const auto t3_2amu = _mm256_set1_ps ( 2.0f * tableau[ 3 ] / mu );
	const auto t6_2amu = _mm256_set1_ps ( 2.0f * tableau[ 6 ] / mu );

	// The node degrees themselves, broadcast as floats
	const auto t2_deg  = _mm256_set1_ps ( 2.0f );
	const auto t3_deg  = _mm256_set1_ps ( 3.0f );
	const auto t6_deg  = _mm256_set1_ps ( 6.0f );

	const auto zero    = _mm256_set1_ps ( 0.0f );
	const auto un      = _mm256_set1_ps ( 1.0f );
    const __m256 a     = _mm256_set1_ps ( 0.0f );
    const __m256 b     = _mm256_set1_ps ( 0.5f );

    //////////////////////////////////////////////////////////////////////////////////////
    // State initialisation: Lambda = 0, zReplica = latestProjVector = 0.5.
    // Uses aligned stores, 8 floats per step.
	#pragma  unroll
	for( int j = 0; j < _mPCheckMapSize; j+=8 )
    {
	_mm256_store_ps(&Lambda  [j],         a);
        _mm256_store_ps(&zReplica[j],         b);
        _mm256_store_ps(&latestProjVector[j], b);
    }
    //////////////////////////////////////////////////////////////////////////////////////

	for(int i = 0; i < maxIter; i++)
	{
        // ptr walks t_row edge by edge across the whole VN kernel
        int ptr    = 0;
		mIteration = i + 1;

    	//
    	// MEASURE OF THE VN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				const auto start = timer();
		#endif

        //
		// VN processing kernel
		//
		#pragma  unroll
		for (int j = 0; j < _mBlocklength; j++)
        {
            const int degVn = VariableDegree[j];
            float M[8] __attribute__((aligned(64)));

            if( degVn == 2 ){
#if 1
            	// Vector path: accumulate the edge sums of 8 consecutive nodes
            	// into M, then compute all 8 outputs at once.
            	const int dVN = 2;
            	for(int qq = 0; qq < 8; qq++) 
		{
    				M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    				#pragma  unroll
    				for(int k = 1; k < dVN; k++) 
				{
    					M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    				}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		// xx = (m - llr/mu - t/mu) / (deg - 2*t/mu), then clamp to [0, 1]
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t2_amu), _mm256_sub_ps(t2_deg, t2_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		// 8 nodes were handled; the loop's own j++ supplies the 8th step
    		j += 7;
#else
            	const int degVN = 2;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
            }else if( degVn == 3 ){
#if 1
            	// Same vector path as degree 2, with the degree-3 constants
            	const int dVN = 3;
            	for(int qq = 0; qq < 8; qq++) 
		{
    			M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    			#pragma  unroll
    			for(int k = 1; k < dVN; k++) 
			{
    				M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    			}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t3_amu), _mm256_sub_ps(t3_deg, t3_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		j += 7;
#else
    		const int degVN = 3;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
		}else if( degVn == 6 ){
#if 1
            	// Same vector path as degree 2, with the degree-6 constants
            	const int dVN = 6;
            	for(int qq = 0; qq < 8; qq++) 
		{
    			M[qq] = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);      ptr   += 1;
    			#pragma  unroll
    			for(int k = 1; k < dVN; k++) 
			{
    				M[qq] += (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]); ptr   += 1;
    			}
    		}
    		const auto m      = _mm256_loadu_ps( M );
    		const auto llr    = _mm256_loadu_ps( &_LogLikelihoodRatio[j] );
    		const auto t1     = _mm256_sub_ps(m, _mm256_div_ps(llr, t_mu));
    		const auto xx     = _mm256_div_ps(_mm256_sub_ps(t1, t6_amu), _mm256_sub_ps(t6_deg, t6_2amu));
    		const auto vMin   = _mm256_max_ps(_mm256_min_ps(xx, un) , zero);
    		_mm256_storeu_ps(&OutputFromDecoder[j], vMin);
    		j += 7;
#else
    		const int degVN = 6;
                float temp = (zReplica[ t_row[ptr] ] + Lambda[ t_row[ptr] ]);
		#pragma unroll
		for(int k = 1; k < degVN; k++)
			temp += (zReplica[ t_row[ptr + k] ] + Lambda[ t_row[ptr + k] ]);
		ptr  += degVN;
	        const float _amu_    = tableaX[ degVN ];
	        const float _2_amu_  = _amu_+ _amu_;
	        const float llr  = _LogLikelihoodRatio[j];
	        const float t    = temp - llr / mu;
	        const float xx   = (t  -  _amu_)/(degVn - _2_amu_);
	        const float vMax = std::min(xx,   1.0f);
	        const float vMin = std::max(vMax, 0.0f);
		OutputFromDecoder[j] = vMin;
#endif
            }
            // Any other degree is silently skipped (no output written, ptr unchanged)
        }

    	//
    	// MEASURE OF THE VN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				t_vn   += (timer() - start);
		#endif

		//
		// CN processing kernel
		//
	int CumSumCheckDegree = 0; // cumulative position of current edge in factor graph
        int allVerified       = 0;
	// NOTE(review): vector_before_proj is never referenced in this function
	float vector_before_proj[8] __attribute__((aligned(64)));

        // This 'zero' shadows the function-scope constant of the same name
        const auto zero    = _mm256_set1_ps ( 0.0f    );
        // _mm256_set_epi32 lists lanes from 7 down to 0, so mask_6 / mask_7
        // enable the low 6 / low 7 lanes respectively
        const auto mask_6  = _mm256_set_epi32(0x00000000, 0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto mask_7  = _mm256_set_epi32(0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF);
        const auto  dot5   = _mm256_set1_ps(     0.5f );

    	//
    	// MEASURE OF THE CN EXECUTION TIME
    	//
		#ifdef PROFILE_ON
				const auto starT = timer();
		#endif

    	// Projection is skipped when no active lane moved by more than this
    	const auto seuilProj = _mm256_set1_ps( 1e-5f );
        for(int j = 0; j < _mNChecks; j++)
		{
            if( CheckDegree[j] == 6 ){
            	const int  cDeg6       = 0x3F;
                // Gather the 6 variable-node outputs feeding this check;
                // masked-off lanes take the value of 'zero'
                const auto offsets    = _mm256_loadu_si256  ((const __m256i*)&t_col1  [CumSumCheckDegree]);
                const auto xpred      = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_6), 4);
    		// Hard decision (> 0.5) per lane; popcount of the low 6 bits
    		const auto synd       = _mm256_cmp_ps( xpred, dot5,   _CMP_GT_OS );
    		int test              = (_mm256_movemask_ps( synd ) & cDeg6);  // deg 6
    		const auto syndrom    = _mm_popcnt_u32( test );
    		// These loads read 8 floats; only the low 6 lanes are stored back
    		const auto _Replica   = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
    		const auto _ambda     = _mm256_loadu_ps( &Lambda  [CumSumCheckDegree]);
    		// vect_proj = rho * xpred + (1 - rho) * zReplica - Lambda
    		const auto v1         = _mm256_mul_ps  (xpred,      _rho );
    		const auto v2         = _mm256_mul_ps  ( _Replica, _un_m_rho );
    		const auto v3         = _mm256_add_ps  ( v1, v2 );
    		const auto vect_proj  = _mm256_sub_ps  ( v3, _ambda );

                //
                // PERFORM THE PROJECTION
                // (odd parity of the hard decisions counts as an unsatisfied check)
                //
                allVerified       += ( syndrom & 0x01 );

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
                #ifdef PROFILE_ON
					const auto START = timer();
		#endif
    		// |vect_proj - latestProjVector| compared lane-wise with seuilProj
    		const auto latest    = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
    		const auto different = _mm256_sub_ps ( vect_proj, latest );
    		const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    		const auto absolute  = _mm256_and_ps ( different, maskAbsol );
    	        const auto despass   = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
    	        int skip = (_mm256_movemask_ps( despass ) & cDeg6) == 0x00; // degree 6

    	        if( skip == false )
    	        {
    	        	// Lambda += rho * (z - xpred) + (1 - rho) * (z - zReplica); zReplica = z
    	        	const auto _ztemp  = mp.projection_deg6( vect_proj );
    	    		const auto _ztemp1 = _mm256_sub_ps(_ztemp,    xpred );
    	    		const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
	    	    	const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
	    	    	const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
    			const auto nLambda = _mm256_add_ps( _ambda,  _ztemp3 );
    			const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
    	    		_mm256_maskstore_ps(&  Lambda[CumSumCheckDegree],         mask_6,   mLambda);
    	    		_mm256_maskstore_ps(&zReplica[CumSumCheckDegree],         mask_6,    _ztemp);
    	        }
	    	_mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_6, vect_proj);

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
    	        #ifdef PROFILE_ON
					t_pj   += (timer() - START);
		#endif
                CumSumCheckDegree += 6;

            }else if( CheckDegree[j] == 7 )
	    {
            	// Same processing as the degree-6 branch, over the low 7 lanes
            	const int  cDeg7       = 0x7F;
                const auto offsets    = _mm256_loadu_si256  ((const __m256i*)&t_col1  [CumSumCheckDegree]);
                const auto xpred      = _mm256_mask_i32gather_ps (zero, OutputFromDecoder, offsets, _mm256_castsi256_ps(mask_7), 4);
    		const auto synd       = _mm256_cmp_ps( xpred, dot5,   _CMP_GT_OS );
    		const int  test       = (_mm256_movemask_ps( synd ) & cDeg7); // deg 7
    		const auto syndrom    = _mm_popcnt_u32( test );
    		const auto _Replica   = _mm256_loadu_ps( &zReplica[CumSumCheckDegree]);
    		const auto _ambda     = _mm256_loadu_ps( &Lambda  [CumSumCheckDegree]);
    		const auto v1         = _mm256_mul_ps  ( xpred,    _rho );
    		const auto v2         = _mm256_mul_ps  ( _Replica, _un_m_rho );
    		const auto v3         = _mm256_add_ps  ( v1, v2 );
    		const auto vect_proj  = _mm256_sub_ps  ( v3, _ambda );

                //
                // PERFORM THE PROJECTION
                //
                allVerified         += ( syndrom & 0x01 );

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
                #ifdef PROFILE_ON
					const auto START = timer();
		#endif
    		const auto latest    = _mm256_loadu_ps(&latestProjVector[CumSumCheckDegree]);
    		const auto different = _mm256_sub_ps ( vect_proj, latest );
    		const auto maskAbsol = _mm256_castsi256_ps(_mm256_set1_epi32(0x7FFFFFFF));
    		const auto absolute  = _mm256_and_ps ( different, maskAbsol );
    	        const auto despass   = _mm256_cmp_ps( absolute, seuilProj, _CMP_GT_OS );
    	        int skip = (_mm256_movemask_ps( despass ) & cDeg7) == 0x00; // degree 7

    	        if( skip == false )
    	        {
			const auto _ztemp  = mp.projection_deg7( vect_proj );
    	    		const auto _ztemp1 = _mm256_sub_ps(_ztemp,    xpred );
    	    		const auto _ztemp2 = _mm256_sub_ps(_ztemp, _Replica );
    	    		const auto _ztemp3 = _mm256_mul_ps(_ztemp1, _rho);
    	    		const auto _ztemp4 = _mm256_mul_ps(_ztemp2, _un_m_rho);
    			const auto nLambda = _mm256_add_ps( _ambda,  _ztemp3 );
    			const auto mLambda = _mm256_add_ps( nLambda, _ztemp4 );
    	    		_mm256_maskstore_ps(&  Lambda        [CumSumCheckDegree], mask_7,   mLambda);
    	    		_mm256_maskstore_ps(&zReplica        [CumSumCheckDegree], mask_7,    _ztemp);
    	        }
	    	_mm256_maskstore_ps(&latestProjVector[CumSumCheckDegree], mask_7, vect_proj);

    	    	//
    	    	// MEASURE OF THE PROJECTION EXECUTION TIME
    	    	//
    	        #ifdef PROFILE_ON
							t_pj   += (timer() - START);
		#endif

                CumSumCheckDegree += 7;

            }else{
                // Unsupported check degree: the whole process is terminated
                // with exit status 0 and no diagnostic
                exit( 0 );
            }
        }

    	//
    	// MEASURE OF THE CN LOOP EXECUTION TIME
    	//
        #ifdef PROFILE_ON
				t_cn   += (timer() - starT);
	#endif
	#ifdef PROFILE_ON
	t_ex += 1;
		//FILE *ft=fopen("time.txt","a");
		//fprintf(ft,"%d \n", t_cn/t_ex);
		//fprintf(ft,"%d %d %d \n", t_cn, t_vn, t_pj);
		//fclose(ft);
	#endif
		// Every check had even parity: declare convergence and stop iterating
		if(allVerified == 0)
		{
			mAlgorithmConverge = true;
			mValidCodeword     = true;
			break;
		}
	}

	//
	// MEASURE OF THE NUMBER OF EXECUTION
	//
//	#ifdef PROFILE_ON
//		t_ex += 1;
//	#endif

}
Exemplo n.º 13
0
// Out-of-class definitions of the GSVector4 static constants.
const GSVector4 GSVector4::m_ps0123(0.0f, 1.0f, 2.0f, 3.0f);
const GSVector4 GSVector4::m_ps4567(4.0f, 5.0f, 6.0f, 7.0f);
const GSVector4 GSVector4::m_half(0.5f);
const GSVector4 GSVector4::m_one(1.0f);
const GSVector4 GSVector4::m_two(2.0f);
const GSVector4 GSVector4::m_four(4.0f);
// 0x4b000000 is the IEEE-754 single-precision bit pattern of 2^23 (8388608.0f)
const GSVector4 GSVector4::m_x4b000000(_mm_castsi128_ps(_mm_set1_epi32(0x4b000000)));
// 0x4f800000 is the IEEE-754 single-precision bit pattern of 2^32 (4294967296.0f)
const GSVector4 GSVector4::m_x4f800000(_mm_castsi128_ps(_mm_set1_epi32(0x4f800000)));
const GSVector4 GSVector4::m_max(FLT_MAX);
const GSVector4 GSVector4::m_min(FLT_MIN);

// 256-bit counterparts, compiled only when _M_SSE >= 0x500.
#if _M_SSE >= 0x500

const GSVector8 GSVector8::m_half(0.5f);
const GSVector8 GSVector8::m_one(1.0f);
// Every bit except the sign bit set in each 32-bit lane
const GSVector8 GSVector8::m_x7fffffff(_mm256_castsi256_ps(_mm256_set1_epi32(0x7fffffff)));
// Only the sign bit set in each 32-bit lane
const GSVector8 GSVector8::m_x80000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x80000000)));
// Bit patterns of 2^23 and 2^32, as for GSVector4
const GSVector8 GSVector8::m_x4b000000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4b000000)));
const GSVector8 GSVector8::m_x4f800000(_mm256_castsi256_ps(_mm256_set1_epi32(0x4f800000)));
const GSVector8 GSVector8::m_max(FLT_MAX);
const GSVector8 GSVector8::m_min(FLT_MIN);

#endif

#if _M_SSE >= 0x501

const GSVector8i GSVector8i::m_xff[33] = 
{
	GSVector8i(0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
	GSVector8i(0x000000ff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
	GSVector8i(0x0000ffff, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000, 0x00000000), 
Exemplo n.º 14
0
/* Evaluation of 8 sines at once using AVX intrinsics.

   The code is the exact rewriting of the cephes sinf function.
   Precision is excellent as long as x < 8192 (I did not bother to
   take into account the special handling they have for greater values
   -- it does not return garbage for arguments over 8192, though, but
   the extra precision is missing).

   Note that it is such that sinf((float)M_PI) = 8.74e-8, which is the
   surprising but correct result.

   x       : 8 packed single-precision angles (any value).
   returns : 8 packed single-precision sines, one per input lane.

   When __AVX2__ is defined the integer steps use 256-bit integer
   intrinsics; otherwise they are done on two 128-bit halves with SSE2.
*/
v8sf sin256_ps(v8sf x) { // any x
  v8sf xmm1, xmm2 = _mm256_setzero_ps(), xmm3, sign_bit, y;
  v8si imm0, imm2;

#ifndef __AVX2__
  v4si imm0_1, imm0_2;
  v4si imm2_1, imm2_2;
#endif

  sign_bit = x;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(v8sf*)_ps256_inv_sign_mask);
  /* extract the sign bit (upper one) */
  sign_bit = _mm256_and_ps(sign_bit, *(v8sf*)_ps256_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(v8sf*)_ps256_cephes_FOPI);

  /*
    Here we start a series of integer operations, which are in the
    realm of AVX2.
    If we don't have AVX2, let's perform them using SSE2 directives
  */

#ifdef __AVX2__
  /* store the integer part of y in imm2 */
  imm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  imm2 = _mm256_add_epi32(imm2, *(v8si*)_pi32_256_1);
  /* The 256-bit integer AND is _mm256_and_si256; there is no
     _mm256_and_si128 intrinsic. */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_inv1);
  y = _mm256_cvtepi32_ps(imm2);

  /* get the swap sign flag: bit 2 of j moved to the float sign position */
  imm0 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_4);
  imm0 = _mm256_slli_epi32(imm0, 29);
  /* get the polynom selection mask 
     there is one polynom for 0 <= x <= Pi/4
     and another one for Pi/4<x<=Pi/2

     Both branches will be computed.
  */
  imm2 = _mm256_and_si256(imm2, *(v8si*)_pi32_256_2);
  imm2 = _mm256_cmpeq_epi32(imm2,*(v8si*)_pi32_256_0);
#else
  /* we use SSE2 routines to perform the integer ops */
  COPY_IMM_TO_XMM(_mm256_cvttps_epi32(y),imm2_1,imm2_2);

  imm2_1 = _mm_add_epi32(imm2_1, *(v4si*)_pi32avx_1);
  imm2_2 = _mm_add_epi32(imm2_2, *(v4si*)_pi32avx_1);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_inv1);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_inv1);

  COPY_XMM_TO_IMM(imm2_1,imm2_2,imm2);
  y = _mm256_cvtepi32_ps(imm2);

  imm0_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_4);
  imm0_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_4);

  imm0_1 = _mm_slli_epi32(imm0_1, 29);
  imm0_2 = _mm_slli_epi32(imm0_2, 29);

  COPY_XMM_TO_IMM(imm0_1, imm0_2, imm0);

  imm2_1 = _mm_and_si128(imm2_1, *(v4si*)_pi32avx_2);
  imm2_2 = _mm_and_si128(imm2_2, *(v4si*)_pi32avx_2);

  imm2_1 = _mm_cmpeq_epi32(imm2_1, _mm_setzero_si128());
  imm2_2 = _mm_cmpeq_epi32(imm2_2, _mm_setzero_si128());

  COPY_XMM_TO_IMM(imm2_1, imm2_2, imm2);
#endif
 
  v8sf swap_sign_bit = _mm256_castsi256_ps(imm0);
  v8sf poly_mask = _mm256_castsi256_ps(imm2);
  sign_bit = _mm256_xor_ps(sign_bit, swap_sign_bit);

  /* The magic pass: "Extended precision modular arithmetic" 
     x = ((x - y * DP1) - y * DP2) - y * DP3;
     (the constants are stored negated, hence the additions below) */
  xmm1 = *(v8sf*)_ps256_minus_cephes_DP1;
  xmm2 = *(v8sf*)_ps256_minus_cephes_DP2;
  xmm3 = *(v8sf*)_ps256_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);

  /* Evaluate the first polynom (cosine coefficients):
     y = ((p0*z + p1)*z + p2)*z*z - 0.5*z + 1, with z = x*x */
  y = *(v8sf*)_ps256_coscof_p0;
  v8sf z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  v8sf tmp = _mm256_mul_ps(z, *(v8sf*)_ps256_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(v8sf*)_ps256_1);
  
  /* Evaluate the second polynom (sine coefficients):
     y2 = ((p0*z + p1)*z + p2)*z*x + x */

  v8sf y2 = *(v8sf*)_ps256_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(v8sf*)_ps256_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms:
     lanes where poly_mask is all-ones take y2, the others take y */  
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}
Exemplo n.º 15
0
 // Interleaves the high 32-bit elements of each 128-bit half of a and b,
 // by reinterpreting both operands as packed floats and using unpackhi_ps.
 INLINE avxi unpackhi( const avxi& a, const avxi& b ) {
   const __m256 lhs = _mm256_castsi256_ps(a);
   const __m256 rhs = _mm256_castsi256_ps(b);
   const __m256 interleaved = _mm256_unpackhi_ps(lhs, rhs);
   return _mm256_castps_si256(interleaved);
 }
Exemplo n.º 16
0
 // Broadcasts element 'index' within each 128-bit half of a to all four
 // 32-bit positions of that half (float permute on the reinterpreted bits).
 template<index_t index> INLINE const avxi shuffle( const avxi& a ) {
   const __m256 as_float = _mm256_castsi256_ps(a);
   const __m256 permuted = _mm256_permute_ps(as_float, _MM_SHUFFLE(index, index, index, index));
   return _mm256_castps_si256(permuted);
 }
Exemplo n.º 17
0
 // Per-lane select: returns t where the mask lane's sign bit is set and f
 // elsewhere (blendv_ps takes its second operand when the mask bit is set).
 INLINE const avxi select( const avxb& mask, const avxi& t, const avxi& f ) {
   const __m256 if_false = _mm256_castsi256_ps(f);
   const __m256 if_true  = _mm256_castsi256_ps(t);
   const __m256 blended  = _mm256_blendv_ps(if_false, if_true, mask);
   return _mm256_castps_si256(blended);
 }
Exemplo n.º 18
0
// Overload of _mm256_maskload_ps that accepts an integer-vector mask and
// forwards to the variant taking a float-vector mask.
// NOTE(review): presumably a compatibility shim for toolchains whose
// intrinsic declares the mask as __m256 -- confirm it is only compiled there.
INLINE __m256  _mm256_maskload_ps  (float const *ptr, __m256i mask) {
  const __m256 float_mask = _mm256_castsi256_ps(mask);
  return _mm256_maskload_ps(ptr, float_mask);
}
Exemplo n.º 19
0
/* Evaluation of 8 cosines at once (AVX2 rewriting of the cephes cosf
   scheme: range reduction by multiples of Pi/4, then one of two
   polynomials selected per lane).

   x       : 8 packed single-precision angles.
   returns : 8 packed single-precision cosines, one per input lane.

   Note: no _mm256_zeroupper() is issued before returning. VZEROUPPER
   clears bits 255:128 of every YMM register, and the 256-bit result is
   still live at that point, so the call could only force a spill/reload
   of the result or clear its upper four lanes. */
__m256 mm256_cos_ps(__m256 x) {
  __m256 xmm1, xmm2 = _mm256_setzero_ps(), xmm3, y;
  __m256i emm0, emm2;
  /* take the absolute value */
  x = _mm256_and_ps(x, *(__m256*)m256_ps_inv_sign_mask);
  
  /* scale by 4/Pi */
  y = _mm256_mul_ps(x, *(__m256*)m256_ps_cephes_FOPI);

  /* store the integer part of y in emm2 */
  emm2 = _mm256_cvttps_epi32(y);
  /* j=(j+1) & (~1) (see the cephes sources) */
  emm2 = _mm256_add_epi32(emm2, *(__m256i*)m256_pi32_1);
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_inv1);
  y = _mm256_cvtepi32_ps(emm2);

  emm2 = _mm256_sub_epi32(emm2, *(__m256i*)m256_pi32_2);
  
  /* get the swap sign flag: bit 2 of ~(j-2) moved to the float sign position */
  emm0 = _mm256_andnot_si256(emm2, *(__m256i*)m256_pi32_4);
  emm0 = _mm256_slli_epi32(emm0, 29);
  /* get the polynom selection mask */
  emm2 = _mm256_and_si256(emm2, *(__m256i*)m256_pi32_2);
  emm2 = _mm256_cmpeq_epi32(emm2, _mm256_setzero_si256());
  
  __m256 sign_bit = _mm256_castsi256_ps(emm0);
  __m256 poly_mask = _mm256_castsi256_ps(emm2);

  /* The magic pass: "Extended precision modular arithmetic" 
     x = ((x - y * DP1) - y * DP2) - y * DP3;
     (the constants are stored negated, hence the additions below) */
  xmm1 = *(__m256*)m256_ps_minus_cephes_DP1;
  xmm2 = *(__m256*)m256_ps_minus_cephes_DP2;
  xmm3 = *(__m256*)m256_ps_minus_cephes_DP3;
  xmm1 = _mm256_mul_ps(y, xmm1);
  xmm2 = _mm256_mul_ps(y, xmm2);
  xmm3 = _mm256_mul_ps(y, xmm3);
  x = _mm256_add_ps(x, xmm1);
  x = _mm256_add_ps(x, xmm2);
  x = _mm256_add_ps(x, xmm3);
  
  /* Evaluate the first polynom (cosine coefficients):
     y = ((p0*z + p1)*z + p2)*z*z - 0.5*z + 1, with z = x*x */
  y = *(__m256*)m256_ps_coscof_p0;
  __m256 z = _mm256_mul_ps(x,x);

  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p1);
  y = _mm256_mul_ps(y, z);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_coscof_p2);
  y = _mm256_mul_ps(y, z);
  y = _mm256_mul_ps(y, z);
  __m256 tmp = _mm256_mul_ps(z, *(__m256*)m256_ps_0p5);
  y = _mm256_sub_ps(y, tmp);
  y = _mm256_add_ps(y, *(__m256*)m256_ps_1);
  
  /* Evaluate the second polynom (sine coefficients):
     y2 = ((p0*z + p1)*z + p2)*z*x + x */

  __m256 y2 = *(__m256*)m256_ps_sincof_p0;
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p1);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_add_ps(y2, *(__m256*)m256_ps_sincof_p2);
  y2 = _mm256_mul_ps(y2, z);
  y2 = _mm256_mul_ps(y2, x);
  y2 = _mm256_add_ps(y2, x);

  /* select the correct result from the two polynoms:
     lanes where poly_mask is all-ones take y2, the others take y */  
  xmm3 = poly_mask;
  y2 = _mm256_and_ps(xmm3, y2); //, xmm3);
  y = _mm256_andnot_ps(xmm3, y);
  y = _mm256_add_ps(y,y2);
  /* update the sign */
  y = _mm256_xor_ps(y, sign_bit);

  return y;
}
Exemplo n.º 20
0
// Computes exp(x) for 8 packed floats, following the Cephes expf scheme:
//   1. clamp x to [_ps_exp_lo, _ps_exp_hi];
//   2. n = floor(x * log2(e) + 0.5);
//   3. r = x - n * _ps_cephes_exp_C12 (reduced argument);
//   4. exp(r) ~= 1 + r + p5 r^2 + p4 r^3 + p3 r^4 + p2 r^5 + p1 r^6 + p0 r^7;
//   5. result = exp(r) * 2^n, with 2^n built by placing (n + bias) in the
//      float exponent field.
// Integer add/shift are done on 128-bit halves so only AVX1 + SSE2 integer
// instructions are needed for that step.
inline avx_m256_t newexp_ps(avx_m256_t x) {
	const avx_m256_t one = _ps_1;
	const avx_m256_t zero = _ps_0;

	x = _mm256_min_ps(x, _ps_exp_hi);
	x = _mm256_max_ps(x, _ps_exp_lo);

	// fx = x * log2(e) + 0.5
	avx_m256_t temp_2 = _mm256_mul_ps(x, _ps_cephes_LOG2EF);
	temp_2 = _mm256_add_ps(temp_2, _ps_0p5);

	// floor(fx): truncate, then subtract 1 where truncation rounded up
	avx_m256i_t emm0 = _mm256_cvttps_epi32(temp_2);
	avx_m256_t temp_1 = _mm256_cvtepi32_ps(emm0);
	avx_m256_t temp_3 = _mm256_sub_ps(temp_1, temp_2);
	avx_m256_t mask = _mm256_cmp_ps(temp_3, zero, _CMP_GT_OQ);

	mask = _mm256_and_ps(mask, one);
	temp_2 = _mm256_sub_ps(temp_1, mask);
	emm0 = _mm256_cvttps_epi32(temp_2);

	// reduced argument
	temp_1 = _mm256_mul_ps(temp_2, _ps_cephes_exp_C12);
	x = _mm256_sub_ps(x, temp_1);

	const avx_m256_t x2 = _mm256_mul_ps(x, x);
	const avx_m256_t x3 = _mm256_mul_ps(x2, x);
	const avx_m256_t x4 = _mm256_mul_ps(x2, x2);

	// low-order part: 1 + x + p5 x^2 + p4 x^3
	avx_m256_t low = _mm256_add_ps(x, one);
	low = _mm256_add_ps(low, _mm256_mul_ps(x2, _ps_cephes_exp_p5));
	low = _mm256_add_ps(low, _mm256_mul_ps(x3, _ps_cephes_exp_p4));

	// high-order part: (p3 + p2 x + p1 x^2 + p0 x^3) * x^4
	// The previous code added the p1 x^2 term twice and never added p3.
	// NOTE(review): the literal is the Cephes expf p3 coefficient; this
	// assumes _ps_cephes_exp_p0..p5 follow the Cephes coefficient order.
	// Replace it with the file's own p3 constant if one is defined.
	const avx_m256_t cephes_exp_p3 = _mm256_set1_ps(4.1665795894E-2f);
	avx_m256_t high = _mm256_add_ps(_mm256_mul_ps(x3, _ps_cephes_exp_p0),
	                                _mm256_mul_ps(x2, _ps_cephes_exp_p1));
	const avx_m256_t high_lo = _mm256_add_ps(cephes_exp_p3,
	                                         _mm256_mul_ps(x, _ps_cephes_exp_p2));
	high = _mm256_add_ps(high, high_lo);
	high = _mm256_mul_ps(high, x4);

	// Build 2^n: (n + bias) << 23, computed per 128-bit half with integer
	// instructions. A floating-point add on the raw bit patterns is not
	// equivalent: a negative n is a NaN bit pattern.
	const avx_m256i_t bias = _pi32_0x7f;
	__m128i n_lo = _mm256_extractf128_si256(emm0, 0);
	__m128i n_hi = _mm256_extractf128_si256(emm0, 1);
	n_lo = _mm_add_epi32(n_lo, _mm256_extractf128_si256(bias, 0));
	n_hi = _mm_add_epi32(n_hi, _mm256_extractf128_si256(bias, 1));
	n_lo = _mm_slli_epi32(n_lo, 23);
	n_hi = _mm_slli_epi32(n_hi, 23);
	emm0 = _mm256_insertf128_si256(emm0, n_lo, 0);
	emm0 = _mm256_insertf128_si256(emm0, n_hi, 1);

	const avx_m256_t pow2n = _mm256_castsi256_ps(emm0);

	avx_m256_t y = _mm256_add_ps(low, high);

	y = _mm256_mul_ps(y, pow2n);
	return y;
} // newexp_ps()
Exemplo n.º 21
0
// Specialisation for the <1, 1, 3, 3> pattern: duplicates the odd-indexed
// 32-bit elements, which maps directly onto movehdup_ps.
template<> INLINE const avxi shuffle<1, 1, 3, 3>( const avxi& b ) {
  const __m256 as_float = _mm256_castsi256_ps(b);
  const __m256 odd_dup  = _mm256_movehdup_ps(as_float);
  return _mm256_castps_si256(odd_dup);
}
// Fills the cost (e), weight (w) and root tables stored in the segments_t
// object behind _bst_obj, using the recurrence
//     e[i][j] = min over r of ( e[i][r] + e[r+1][j] + w[i][j] )
// evaluated row by row from i = n-1 down to 0, and returns the last entry of
// the first row of e.
// NOTE(review): from the names and the recurrence this appears to be the
// optimal binary search tree dynamic program -- confirm.
//
//   _bst_obj : pointer to a segments_t providing e, w, r, e_sz and r_sz.
//   p, q     : input arrays; read as p[j-1] and q[j] for j in 1..nn, q[0..nn].
//   nn       : problem size.
//
// Rows of e/w are padded (pad, pad_r) and rows of root are padded (pad_root);
// the vector loop uses aligned 256-bit loads on e and w.
// NOTE(review): the padding is presumably what keeps those loads 32-byte
// aligned -- confirm against the allocation of segments_t.
double bst_compute_129_m256_maskstore_root_aligned( void*_bst_obj, double* p, double* q, size_t nn ) {
    segments_t* mem = (segments_t*) _bst_obj;
    int n, i, r, l_end, j, l_end_pre;
    double t, e_tmp;
    double* e = mem->e, *w = mem->w;
    int* root = mem->r;
    __m256d v_tmp;
    __m256d v00, v01, v02, v03;
    __m256d v10, v11, v12, v13;
    __m256d v20, v21, v22, v23;
    __m256d v30, v31, v32, v33;
    __m256i v_cur_roots;
    __m256 v_rootmask0, v_rootmask1;
    // initialization
    // mem->n = nn;
    n = nn; // signed copy: the index arithmetic below subtracts from n

    int idx1, idx1_root;
    int idx2;
    int idx3, idx3_root;
    int pad_root, pad, pad_r;
    
    idx1      = ((int) mem->e_sz) - 1;
    idx1_root = ((int) mem->r_sz);
    // the convention is that at iteration i, idx1 points to the first element of line i+1
    e[idx1++] = q[n];
    
    // pad contains the padding for row i+1
    // for row n it's always 3
    pad = 3;
    pad_root = 7;
    for (i = n-1; i >= 0; --i) {
        // step back to the start of row i (row length 2*(n-i)+1 plus padding)
        idx1      -= 2*(n-i)+1 + pad;
        idx1_root -= 2*(n-i)+1 + pad_root;
        idx2       = idx1 + 1;
        e[idx1]    = q[i];
        w[idx1]    = q[i];
        // initialise row i: costs start at INFINITY, weights are prefix sums
        for (j = i+1; j < n+1; ++j,++idx2) {
            e[idx2] = INFINITY;
            w[idx2] = w[idx2-1] + p[j-1] + q[j];
        }
        idx2     += pad; // padding of line i+1
        // idx2 now points to the first element of the next line

        idx3      = idx1;
        idx3_root = idx1_root;
        pad_r     = pad;
        for (r = i; r < n; ++r) {
            pad_r     = (pad_r+1)&3; // padding of line r+1
            idx1      = idx3;
            idx1_root = idx3_root;
            l_end     = idx2 + (n-r);
            // l_end points to the first entry after the current row
            e_tmp     = e[idx1++];
            idx1_root++;
            // scalar prologue: handle (n-r) mod 16 entries so that a multiple
            // of 16 doubles is left for the vector loop
            // 16 = 4 vectors of 4 doubles (256-bit)
            l_end_pre = idx2 + ((n-r)&15);
            for( ; (idx2 < l_end_pre) && (idx2 < l_end); ++idx2 ) {
                t = e_tmp + e[idx2] + w[idx1];
                if (t < e[idx1]) {
                    e[idx1] = t;
                    root[idx1_root] = r;
                }
                idx1++;
                idx1_root++;
            }
            
            v_tmp = _mm256_set_pd( e_tmp, e_tmp, e_tmp, e_tmp );
            // vector loop: 16 entries per iteration, as 4 vectors of 4 doubles
            v_cur_roots = _mm256_set_epi32(r, r, r, r, r, r, r, r);
            for( ; idx2 < l_end; idx2 += 16 ) {
                // candidate cost t = w[idx1..] + e_tmp + e[idx2..]
                v01 = _mm256_load_pd( &w[idx1   ] );
                v11 = _mm256_load_pd( &w[idx1+ 4] );
                v21 = _mm256_load_pd( &w[idx1+ 8] );
                v31 = _mm256_load_pd( &w[idx1+12] );

                v00 = _mm256_load_pd( &e[idx2   ] );
                v01 = _mm256_add_pd( v01, v_tmp ); 
                v10 = _mm256_load_pd( &e[idx2+ 4] );
                v11 = _mm256_add_pd( v11, v_tmp );
                v20 = _mm256_load_pd( &e[idx2+ 8] );
                v21 = _mm256_add_pd( v21, v_tmp );
                v30 = _mm256_load_pd( &e[idx2+12] );
                v31 = _mm256_add_pd( v31, v_tmp );

                v01 = _mm256_add_pd( v01, v00 );
                v03 = _mm256_load_pd( &e[idx1   ] );
                v11 = _mm256_add_pd( v11, v10 );
                v13 = _mm256_load_pd( &e[idx1+ 4] );
                v21 = _mm256_add_pd( v21, v20 );
                v23 = _mm256_load_pd( &e[idx1+ 8] );
                v31 = _mm256_add_pd( v31, v30 );
                v33 = _mm256_load_pd( &e[idx1+12] );

                // lane mask: candidate strictly smaller than the stored cost
                v02 = _mm256_cmp_pd( v01, v03, _CMP_LT_OQ );
                v12 = _mm256_cmp_pd( v11, v13, _CMP_LT_OQ );
                v22 = _mm256_cmp_pd( v21, v23, _CMP_LT_OQ );
                v32 = _mm256_cmp_pd( v31, v33, _CMP_LT_OQ );

                // overwrite only the improved entries of e
                _mm256_maskstore_pd( &e[idx1   ],
                        _mm256_castpd_si256( v02 ), v01 );
                _mm256_maskstore_pd( &e[idx1+ 4],
                        _mm256_castpd_si256( v12 ), v11 );

                // narrow two 4x64-bit compare masks into one 8x32-bit mask
                // NOTE(review): this uses the numeric double->float conversion
                // on mask bit patterns and relies on the sign bit of all-ones
                // lanes surviving it (maskstore keys on the top bit) -- confirm
                v_rootmask0 = _mm256_insertf128_ps(
                        _mm256_castps128_ps256(
                            _mm256_cvtpd_ps(v02)),
                            _mm256_cvtpd_ps(v12) , 1
                    );

                _mm256_maskstore_pd( &e[idx1+ 8],
                        _mm256_castpd_si256( v22 ), v21 );
                _mm256_maskstore_pd( &e[idx1+12], 
                        _mm256_castpd_si256( v32 ), v31 );
                v_rootmask1 = _mm256_insertf128_ps(
                        _mm256_castps128_ps256(
                            _mm256_cvtpd_ps(v22)),
                            _mm256_cvtpd_ps(v32) , 1
                    );
                
                // record r as the root for every improved entry
                // (root is an int array written through the float maskstore)
                _mm256_maskstore_ps( &root[idx1_root    ],
                        _mm256_castps_si256( v_rootmask0 ),
                        _mm256_castsi256_ps( v_cur_roots ) );
                _mm256_maskstore_ps( &root[idx1_root + 8],
                        _mm256_castps_si256( v_rootmask1 ),
                        _mm256_castsi256_ps( v_cur_roots ) );
                idx1      += 16;
                idx1_root += 16;
            }
            idx2 += pad_r;
            idx3++;
            idx3_root++;
        }
        pad      = (pad     -1)&3;
        pad_root = (pad_root-1)&7;
    }
    // the index of the last item of the first row is ((n/4)+1)*4-1, due to the padding
    // if n is even, the total number of entries in the first
    // row of the table is odd, so we need padding
    return e[ ((n/4)+1)*4 - 1 ];
}
Exemplo n.º 23
0
template<> INLINE const avxi shuffle<0, 1, 0, 1>( const avxi& b ) {
  // View the integer register as packed doubles so that each pair of adjacent
  // 32-bit lanes travels together as one 64-bit element.
  const __m256d as_pd = _mm256_castps_pd(_mm256_castsi256_ps(b));
  // movedup copies the even-indexed 64-bit element over the odd one in each
  // 128-bit half, i.e. 32-bit lanes (0, 1, 0, 1) per half.
  const __m256d dup_pd = _mm256_movedup_pd(as_pd);
  // Reinterpret back to the integer domain; the casts only change the type.
  return _mm256_castps_si256(_mm256_castpd_ps(dup_pd));
}
Exemplo n.º 24
0
 template<index_t index_0, index_t index_1, index_t index_2, index_t index_3> INLINE const avxi shuffle( const avxi& a, const avxi& b ) {
   // Two-source shuffle of 32-bit lanes, done in the float domain because the
   // shuffle intrinsic used here operates on __m256. The casts only
   // reinterpret the bits.
   const __m256 first_ps  = _mm256_castsi256_ps(a);
   const __m256 second_ps = _mm256_castsi256_ps(b);
   // _MM_SHUFFLE takes its selectors highest-first, hence the reversed order.
   // The selector is kept inline so it stays a compile-time immediate.
   const __m256 picked = _mm256_shuffle_ps(first_ps, second_ps, _MM_SHUFFLE(index_3, index_2, index_1, index_0));
   return _mm256_castps_si256(picked);
 }
Exemplo n.º 25
0
// Overload of _mm256_maskstore_ps that accepts the mask as an integer vector
// and forwards to the variant whose mask argument is a float vector.
// NOTE(review): presumably a compatibility shim for toolchains whose header
// declares the mask parameter as __m256 -- confirm against the target
// compiler, since a header that already provides the __m256i-mask signature
// would conflict with this definition.
INLINE void    _mm256_maskstore_ps (float *ptr, __m256i mask, __m256 data) {
  // Bit-for-bit reinterpretation of the mask; no value conversion happens.
  const __m256 mask_as_ps = _mm256_castsi256_ps(mask);
  _mm256_maskstore_ps(ptr, mask_as_ps, data);
}
    real             *fjptrA,*fjptrB,*fjptrC,*fjptrD,*fjptrE,*fjptrF,*fjptrG,*fjptrH;
    real             scratch[4*DIM];
    __m256           tx,ty,tz,fscal,rcutoff,rcutoff2,jidxall;
    real *           vdwioffsetptr0;
    __m256           ix0,iy0,iz0,fix0,fiy0,fiz0,iq0,isai0;
    int              vdwjidx0A,vdwjidx0B,vdwjidx0C,vdwjidx0D,vdwjidx0E,vdwjidx0F,vdwjidx0G,vdwjidx0H;
    __m256           jx0,jy0,jz0,fjx0,fjy0,fjz0,jq0,isaj0;
    __m256           dx00,dy00,dz00,rsq00,rinv00,rinvsq00,r00,qq00,c6_00,c12_00;
    int              nvdwtype;
    __m256           rinvsix,rvdw,vvdw,vvdw6,vvdw12,fvdw,fvdw6,fvdw12,vvdwsum,sh_vdw_invrcut6;
    int              *vdwtype;
    real             *vdwparam;
    __m256           one_sixth   = _mm256_set1_ps(1.0/6.0);
    __m256           one_twelfth = _mm256_set1_ps(1.0/12.0);
    __m256           dummy_mask,cutoff_mask;
    __m256           signbit = _mm256_castsi256_ps( _mm256_set1_epi32(0x80000000) );
    __m256           one     = _mm256_set1_ps(1.0);
    __m256           two     = _mm256_set1_ps(2.0);
    x                = xx[0];
    f                = ff[0];

    nri              = nlist->nri;
    iinr             = nlist->iinr;
    jindex           = nlist->jindex;
    jjnr             = nlist->jjnr;
    shiftidx         = nlist->shift;
    gid              = nlist->gid;
    shiftvec         = fr->shift_vec[0];
    fshift           = fr->fshift[0];
    nvdwtype         = fr->ntype;
    vdwparam         = fr->nbfp;
Exemplo n.º 27
0
template<> INLINE const avxi shuffle<0, 0, 2, 2>( const avxi& b ) {
  // Reinterpret the integer lanes as floats so the single-instruction
  // duplicate below can be used; the casts do not alter any bits.
  const __m256 as_ps = _mm256_castsi256_ps(b);
  // moveldup copies each even-indexed 32-bit lane over its odd neighbour,
  // i.e. lanes (0, 0, 2, 2) within each 128-bit half.
  const __m256 dup_even = _mm256_moveldup_ps(as_ps);
  return _mm256_castps_si256(dup_even);
}