コード例 #1
0
 /**
  * Returns true when at least one bit is set in val[0] or val[1].
  *
  * The two vectors are OR-ed together first so that a single bit test
  * covers both of them.
  */
 inline
 bool any() const
 {
     // Combine both vectors and view the result as integer lanes.
     const __m512i merged = _mm512_castps_si512(_mm512_or_ps(val[0], val[1]));

     // Testing the value against itself flags every 64-bit lane that is
     // non-zero; the resulting mask converts to true if any lane was flagged.
     return _mm512_test_epi64_mask(merged, merged);
 }
コード例 #2
0
/*
 * REDUCE - horizontal sum of a 512-bit vector of single-precision floats.
 *
 * The sum is built in four butterfly steps: two across the 128-bit lanes
 * (permute4f128) and two inside each 128-bit lane (swizzle).  After the last
 * step every element holds the full total, which is then written to a scalar
 * through _MM_STORE_SS.
 *
 * NOTE(review): _MM_STORE_SS is defined outside this block; it is presumably
 * a single-element store of the vector into *(&retval) - confirm.
 *
 * Declared static inline: a plain `inline` definition in C99/C11 does not
 * provide an external definition, so any call the compiler chooses not to
 * inline would fail to link.
 *
 * val:     vector whose elements are summed
 * returns: the value stored by _MM_STORE_SS after the reduction
 */
static inline float REDUCE(__m512 val)
{
	/* Step 1: exchange neighbouring 128-bit lanes (CDAB) and add. */
	__m512 lanes_swapped = _mm512_castsi512_ps(
		_mm512_permute4f128_epi32(_mm512_castps_si512(val), _MM_PERM_CDAB));
	__m512 sum = _mm512_add_ps(lanes_swapped, val);

	/* Step 2: exchange the two lane pairs (AACC) and add. */
	__m512 pairs_swapped = _mm512_castsi512_ps(
		_mm512_permute4f128_epi32(_mm512_castps_si512(sum), _MM_PERM_AACC));
	sum = _mm512_add_ps(sum, pairs_swapped);

	/* Steps 3 and 4: finish the reduction inside each 128-bit lane. */
	sum = _mm512_add_ps(sum, _mm512_swizzle_ps(sum, _MM_SWIZ_REG_CDAB));
	sum = _mm512_add_ps(sum, _mm512_swizzle_ps(sum, _MM_SWIZ_REG_BADC));

	float retval;
	_MM_STORE_SS(&retval, sum);
	return retval;
}
コード例 #3
0
/*
 * mic_sin_ps - element-wise sine approximation for a 512-bit float vector.
 *
 * Outline of the computation:
 *   1. separate the sign bit from the magnitude of x;
 *   2. scale by _ps_cephes_FOPI and derive an even integer j per element;
 *   3. subtract j * (a constant folded into _ps_minus_cephes_DP123) from |x|;
 *   4. evaluate two polynomials in the reduced argument;
 *   5. pick one polynomial per element from bit 1 of j, and flip the sign
 *      when bit 2 of j is set.
 *
 * NOTE(review): every _ps_* / _pi32_* constant is defined outside this block.
 * Judging by the names they are the Cephes sinf/cosf constants (4/pi, pi/4
 * split, polynomial coefficients) and simple integer masks - confirm against
 * their definitions.
 *
 * x:       input vector (presumably radians - confirm)
 * returns: per-element result vector
 */
static inline mic_m512_t mic_sin_ps(mic_m512_t x) {
	/* Keep the sign aside and continue with the magnitude only. */
	__m512i sign_bit = _mm512_and_epi32(_mm512_castps_si512(x), _pi32_sign_mask);
	x = _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(x), _pi32_inv_sign_mask));

	mic_m512_t y = _mm512_mul_ps(x, _ps_cephes_FOPI);

	/*
	 * j = (trunc(y) + 1) & _pi32_inv1.
	 * The conversion must truncate: with round-to-nearest any fractional part
	 * above 0.5 bumps j to the next even value, so the reduced argument ends
	 * up well outside the interval the polynomials below are meant for and
	 * accuracy suffers.
	 */
	__m512i emm2 = _mm512_cvtfxpnt_round_adjustps_epi32(y, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE);
	emm2 = _mm512_add_epi32(emm2, _pi32_1);
	emm2 = _mm512_and_epi32(emm2, _pi32_inv1);
	/* j is a small non-negative integer here, so this conversion is exact. */
	y = _mm512_cvtfxpnt_round_adjustepu32_ps(emm2, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);

	/* Move the (j & _pi32_4) bit up by 29 positions; it is XOR-ed into the sign below. */
	__m512i emm0 = _mm512_and_epi32(emm2, _pi32_4);
	emm0 = _mm512_slli_epi32(emm0, 29);

	/*
	 * Polynomial selector: elements with (j & _pi32_2) == 0 receive
	 * _pi32_ffff + 0, all others receive 0.
	 */
	emm2 = _mm512_and_epi32(emm2, _pi32_2);
	__mmask16 mask = _mm512_cmp_epi32_mask(emm2, _pi32_0, _MM_CMPINT_EQ);
	emm2 = _mm512_mask_add_epi32(_pi32_0, mask, _pi32_ffff, _pi32_0);

	sign_bit = _mm512_xor_epi32(sign_bit, emm0);

	/* Range reduction: x = |x| + j * _ps_minus_cephes_DP123. */
	mic_m512_t temp = _ps_minus_cephes_DP123;
	temp = _mm512_mul_ps(y, temp);
	x = _mm512_add_ps(x, temp);

	mic_m512_t x2 = _mm512_mul_ps(x, x);
	mic_m512_t x3 = _mm512_mul_ps(x2, x);
	mic_m512_t x4 = _mm512_mul_ps(x2, x2);

	/*
	 * Two interleaved Horner evaluations:
	 *   y  = ((coscof_p0*x2 + coscof_p1)*x2 + coscof_p2)*x4 - (0.5*x2 - 1)
	 *   y2 = ((sincof_p0*x2 + sincof_p1)*x2 + sincof_p2)*x3 + x
	 */
	y = _mm512_mul_ps(_ps_coscof_p0, x2);
	mic_m512_t y2 = _mm512_mul_ps(_ps_sincof_p0, x2);
	y = _mm512_add_ps(y, _ps_coscof_p1);
	y2 = _mm512_add_ps(y2, _ps_sincof_p1);
	y = _mm512_mul_ps(y, x2);
	y2 = _mm512_mul_ps(y2, x2);
	y = _mm512_add_ps(y, _ps_coscof_p2);
	y2 = _mm512_add_ps(y2, _ps_sincof_p2);
	y = _mm512_mul_ps(y, x4);
	y2 = _mm512_mul_ps(y2, x3);
	temp = _mm512_mul_ps(x2, _ps_0point5);
	temp = _mm512_sub_ps(temp, _ps_1);
	y = _mm512_sub_ps(y, temp);
	y2 = _mm512_add_ps(y2, x);

	/* Keep y where the selector is clear and y2 where it is set, then merge. */
	y = _mm512_castsi512_ps(_mm512_andnot_epi32(emm2, _mm512_castps_si512(y)));
	y2 = _mm512_castsi512_ps(_mm512_and_epi32(emm2, _mm512_castps_si512(y2)));

	y = _mm512_add_ps(y, y2);
	/* Apply the combined sign. */
	y = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(y), sign_bit));

	return y;
} // sin_ps()
コード例 #4
0
/*
 * zmatmul - accumulate a 4x4 single-precision matrix product: cout += ain * bin.
 *
 * Each array holds 16 floats.  The index pattern used below corresponds to
 * storing element (r, c) at
 *     (c & 1) | ((r & 1) << 1) | ((c & 2) << 1) | ((r & 2) << 2)
 * i.e. the bits of the row and column numbers are interleaved.
 *
 * With __MIC__ defined the work is done with 512-bit vector loads/stores
 * (_mm512_load_ps / _mm512_store_ps), so all three pointers must satisfy the
 * alignment those intrinsics require.  Otherwise a scalar fallback computes
 * the same sixteen sums.
 *
 * Declared static inline: a plain `inline` definition in C99/C11 provides no
 * external definition, so a call that is not inlined (e.g. at -O0) would fail
 * to link.
 *
 * ain:  left operand, 16 floats (read only)
 * bin:  right operand, 16 floats (read only)
 * cout: accumulator, 16 floats (read and updated in place)
 */
static inline void zmatmul(float *ain, float *bin, float *cout) {
#ifdef __MIC__
  __m512 a,b,c;
  __m512 a0,a1,a2,a3;
  __m512 b0,b1,b2,b3;

  /* paK[i]: which element of a feeds the K-th product of output element i. */
  __m512i pa0={ 0, 0, 2, 2, 0, 0, 2, 2, 8, 8,10,10, 8, 8,10,10};
  __m512i pa1={ 1, 1, 3, 3, 1, 1, 3, 3, 9, 9,11,11, 9, 9,11,11};
  __m512i pa2={ 4, 4, 6, 6, 4, 4, 6, 6,12,12,14,14,12,12,14,14};
  __m512i pa3={ 5, 5, 7, 7, 5, 5, 7, 7,13,13,15,15,13,13,15,15};

  /* pbK[i]: which element of b feeds the K-th product of output element i. */
  __m512i pb0={ 0, 1, 0, 1, 4, 5, 4, 5, 0, 1, 0, 1, 4, 5, 4, 5};
  __m512i pb1={ 2, 3, 2, 3, 6, 7, 6, 7, 2, 3, 2, 3, 6, 7, 6, 7};
  __m512i pb2={ 8, 9, 8, 9,12,13,12,13, 8, 9, 8, 9,12,13,12,13};
  __m512i pb3={10,11,10,11,14,15,14,15,10,11,10,11,14,15,14,15};

  a=_mm512_load_ps(ain);
  b=_mm512_load_ps(bin);
  c=_mm512_load_ps(cout);

  /* Gather the four a-operands and four b-operands for all 16 outputs. */
  a0=_mm512_castsi512_ps(_mm512_permutevar_epi32(pa0,_mm512_castps_si512(a)));
  a1=_mm512_castsi512_ps(_mm512_permutevar_epi32(pa1,_mm512_castps_si512(a)));
  a2=_mm512_castsi512_ps(_mm512_permutevar_epi32(pa2,_mm512_castps_si512(a)));
  a3=_mm512_castsi512_ps(_mm512_permutevar_epi32(pa3,_mm512_castps_si512(a)));

  b0=_mm512_castsi512_ps(_mm512_permutevar_epi32(pb0,_mm512_castps_si512(b)));
  b1=_mm512_castsi512_ps(_mm512_permutevar_epi32(pb1,_mm512_castps_si512(b)));
  b2=_mm512_castsi512_ps(_mm512_permutevar_epi32(pb2,_mm512_castps_si512(b)));
  b3=_mm512_castsi512_ps(_mm512_permutevar_epi32(pb3,_mm512_castps_si512(b)));

  /* c += a0*b0 + a1*b1 + a2*b2 + a3*b3, one fused multiply-add per term. */
  c=_mm512_fmadd_ps(a0,b0,c);
  c=_mm512_fmadd_ps(a1,b1,c);
  c=_mm512_fmadd_ps(a2,b2,c);
  c=_mm512_fmadd_ps(a3,b3,c);

  _mm512_store_ps(cout,c);
#else
  /* Scalar fallback: the same sixteen four-term sums, written out. */
  cout[0] +=ain[0] *bin[0]+ain[1] *bin[2]+ain[4] *bin[8] +ain[5] *bin[10];
  cout[1] +=ain[0] *bin[1]+ain[1] *bin[3]+ain[4] *bin[9] +ain[5] *bin[11];
  cout[2] +=ain[2] *bin[0]+ain[3] *bin[2]+ain[6] *bin[8] +ain[7] *bin[10];
  cout[3] +=ain[2] *bin[1]+ain[3] *bin[3]+ain[6] *bin[9] +ain[7] *bin[11];
  cout[4] +=ain[0] *bin[4]+ain[1] *bin[6]+ain[4] *bin[12]+ain[5] *bin[14];
  cout[5] +=ain[0] *bin[5]+ain[1] *bin[7]+ain[4] *bin[13]+ain[5] *bin[15];
  cout[6] +=ain[2] *bin[4]+ain[3] *bin[6]+ain[6] *bin[12]+ain[7] *bin[14];
  cout[7] +=ain[2] *bin[5]+ain[3] *bin[7]+ain[6] *bin[13]+ain[7] *bin[15];
  cout[8] +=ain[8] *bin[0]+ain[9] *bin[2]+ain[12]*bin[8] +ain[13]*bin[10];
  cout[9] +=ain[8] *bin[1]+ain[9] *bin[3]+ain[12]*bin[9] +ain[13]*bin[11];
  cout[10]+=ain[10]*bin[0]+ain[11]*bin[2]+ain[14]*bin[8] +ain[15]*bin[10];
  cout[11]+=ain[10]*bin[1]+ain[11]*bin[3]+ain[14]*bin[9] +ain[15]*bin[11];
  cout[12]+=ain[8] *bin[4]+ain[9] *bin[6]+ain[12]*bin[12]+ain[13]*bin[14];
  cout[13]+=ain[8] *bin[5]+ain[9] *bin[7]+ain[12]*bin[13]+ain[13]*bin[15];
  cout[14]+=ain[10]*bin[4]+ain[11]*bin[6]+ain[14]*bin[12]+ain[15]*bin[14];
  cout[15]+=ain[10]*bin[5]+ain[11]*bin[7]+ain[14]*bin[13]+ain[15]*bin[15];
#endif

}
コード例 #5
0
/*
 * mic_sincos_ps - element-wise sine and cosine approximation for a 512-bit
 * float vector, sharing one range reduction and one pair of polynomial
 * evaluations between both results.
 *
 * Outline of the computation:
 *   1. separate the sign bit from the magnitude of x;
 *   2. scale by _ps_cephes_FOPI and derive an even integer j per element;
 *   3. reduce: x = |x| + j * _ps_minus_cephes_DP123;
 *   4. evaluate two polynomials (y and y2) in the reduced argument;
 *   5. for the sine, select by bit 1 of j and flip the sign by bit 2 of j;
 *      for the cosine, do the same using j - 2 (with the bit-2 test inverted).
 *
 * NOTE(review): every _ps_* / _pi32_* constant is defined outside this block.
 * Judging by the names they are the Cephes sinf/cosf constants and simple
 * integer masks - confirm against their definitions.
 *
 * Declared static inline to match mic_sin_ps and because a plain `inline`
 * definition in C99/C11 provides no external definition to link against.
 *
 * x: input vector (presumably radians - confirm)
 * s: receives the sine result; must point to writable storage
 * c: receives the cosine result; must point to writable storage
 */
static inline void mic_sincos_ps(mic_m512_t x, mic_m512_t *s, mic_m512_t *c) {
	/* Keep the sign aside and continue with the magnitude only. */
	__m512i sign_bit = _mm512_and_epi32(_mm512_castps_si512(x), _pi32_sign_mask);
	x = _mm512_castsi512_ps(_mm512_and_epi32(_mm512_castps_si512(x), _pi32_inv_sign_mask));

	mic_m512_t y = _mm512_mul_ps(x, _ps_cephes_FOPI);

	/*
	 * j = (trunc(y) + 1) & _pi32_inv1.
	 * The conversion must truncate: with round-to-nearest any fractional part
	 * above 0.5 bumps j to the next even value, so the reduced argument ends
	 * up well outside the interval the polynomials below are meant for and
	 * accuracy suffers.
	 */
	__m512i emm2 = _mm512_cvtfxpnt_round_adjustps_epi32(y, _MM_FROUND_TO_ZERO, _MM_EXPADJ_NONE);
	emm2 = _mm512_add_epi32(emm2, _pi32_1);
	emm2 = _mm512_and_epi32(emm2, _pi32_inv1);
	/* j is a small non-negative integer here, so this conversion is exact. */
	y = _mm512_cvtfxpnt_round_adjustepu32_ps(emm2, _MM_FROUND_TO_NEAREST_INT, _MM_EXPADJ_NONE);

	/* The cosine path works on j - 2. */
	__m512i cos_emm2 = _mm512_sub_epi32(emm2, _pi32_2);

	/* Sign flips: (j & _pi32_4) for sine, (~(j - 2) & _pi32_4) for cosine, shifted up by 29. */
	__m512i emm0 = _mm512_and_epi32(emm2, _pi32_4);
	__m512i cos_emm0 = _mm512_andnot_epi32(cos_emm2, _pi32_4);
	emm0 = _mm512_slli_epi32(emm0, 29);
	cos_emm0 = _mm512_slli_epi32(cos_emm0, 29);
	sign_bit = _mm512_xor_epi32(sign_bit, emm0);

	/*
	 * Polynomial selectors: elements whose (value & _pi32_2) is zero receive
	 * _pi32_ffff + 0, all others receive 0.
	 */
	emm2 = _mm512_and_epi32(emm2, _pi32_2);
	cos_emm2 = _mm512_and_epi32(cos_emm2, _pi32_2);
	__mmask16 mask = _mm512_cmp_epi32_mask(emm2, _pi32_0, _MM_CMPINT_EQ);
	emm2 = _mm512_mask_add_epi32(_pi32_0, mask, _pi32_ffff, _pi32_0);
	__mmask16 cos_mask = _mm512_cmp_epi32_mask(cos_emm2, _pi32_0, _MM_CMPINT_EQ);
	cos_emm2 = _mm512_mask_add_epi32(_pi32_0, cos_mask, _pi32_ffff, _pi32_0);

	/* Range reduction. */
	x = _mm512_fmadd_ps(y, _ps_minus_cephes_DP123, x);

	mic_m512_t x2 = _mm512_mul_ps(x, x);
	mic_m512_t x3 = _mm512_mul_ps(x2, x);
	mic_m512_t x4 = _mm512_mul_ps(x2, x2);

	/* y = ((coscof_p0*x2 + coscof_p1)*x2 + coscof_p2)*x4 - (0.5*x2 - 1) */
	y = _mm512_fmadd_ps(_ps_coscof_p0, x2, _ps_coscof_p1);
	y = _mm512_fmadd_ps(y, x2, _ps_coscof_p2);
	mic_m512_t temp_2 = _mm512_fmsub_ps(x2, _ps_0point5, _ps_1);
	y = _mm512_fmsub_ps(y, x4, temp_2);
	/* y2 = ((sincof_p0*x2 + sincof_p1)*x2 + sincof_p2)*x3 + x */
	mic_m512_t y2 = _mm512_fmadd_ps(_ps_sincof_p0, x2, _ps_sincof_p1);
	y2 = _mm512_fmadd_ps(y2, x2, _ps_sincof_p2);
	y2 = _mm512_fmadd_ps(y2, x3, x);

	/* Both outputs start from the same two polynomial values. */
	mic_m512_t cos_y = y;
	mic_m512_t cos_y2 = y2;

	/* Keep y where the selector is clear and y2 where it is set, per output. */
	y = _mm512_castsi512_ps(_mm512_andnot_epi32(emm2, _mm512_castps_si512(y)));
	cos_y = _mm512_castsi512_ps(_mm512_andnot_epi32(cos_emm2, _mm512_castps_si512(cos_y)));
	y2 = _mm512_castsi512_ps(_mm512_and_epi32(emm2, _mm512_castps_si512(y2)));
	cos_y2 = _mm512_castsi512_ps(_mm512_and_epi32(cos_emm2, _mm512_castps_si512(cos_y2)));

	y = _mm512_add_ps(y, y2);
	cos_y = _mm512_add_ps(cos_y, cos_y2);

	/* Apply the signs and write both results. */
	*s = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(y), sign_bit));
	*c = _mm512_castsi512_ps(_mm512_xor_epi32(_mm512_castps_si512(cos_y), cos_emm0));
} // sincos_ps()