Point2d CameraPinhole::Project(const Point3d& p3d)
{
#ifdef __SSE__
    if(p3d.z == 1.)
    {
        // z == 1: the point is already on the normalized image plane
        __m128d xy = _mm_setr_pd(p3d.x, p3d.y);
        xy = _mm_add_pd(_mm_setr_pd(cx, cy),
                        _mm_mul_pd(xy, _mm_setr_pd(fx, fy)));
        return *(Point2d*)&xy;
    }
    else if(p3d.z > 0)
    {
        double z_inv = 1. / p3d.z;
        return Point2d(fx*z_inv*p3d.x + cx, fy*z_inv*p3d.y + cy);
    }
    else
        return Point2d(-1, -1);   // point behind the camera
#else
    if(p3d.z == 1.)
    {
        return Point2d(fx*p3d.x + cx, fy*p3d.y + cy);
    }
    else if(p3d.z > 0)
    {
        double z_inv = 1. / p3d.z;
        return Point2d(fx*z_inv*p3d.x + cx, fy*z_inv*p3d.y + cy);
    }
    else
        return Point2d(-1, -1);
#endif
}
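// The `return *(Point2d*)&xy;` above type-puns an __m128d through a pointer
// cast, which formally violates strict aliasing. A minimal sketch of a safer
// read-back, assuming Point2d is a plain {double x, y;} pair with an (x, y)
// constructor (an assumption; the original header is not shown):
static inline Point2d ToPoint2d(__m128d v)
{
    double tmp[2];
    _mm_storeu_pd(tmp, v);            // store both lanes to the stack
    return Point2d(tmp[0], tmp[1]);
}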
// compactified (sparse matrix rep) su(2) multiply of,
//
// | a b || M[row(a)] M[row(a)+1] .... |
// | c d || M[row(c)] M[row(c)+1] .... |
//
void shortened_su2_multiply( GLU_complex *w ,
                             const GLU_complex a ,
                             const GLU_complex b ,
                             const size_t su2_index )
{
  register const __m128d A = _mm_setr_pd( creal( a ) , cimag( a ) ) ;
  register const __m128d B = _mm_setr_pd( creal( b ) , cimag( b ) ) ;
  // starting indices of the two rows touched by this su(2) subgroup
  const size_t row_a = NC * (int)( Latt.su2_data[ su2_index ].idx_a / NC ) ;
  const size_t row_c = NC * (int)( Latt.su2_data[ su2_index ].idx_c / NC ) ;
  register __m128d tmp ;
  __m128d *w1 = (__m128d*)( w + row_a ) ;
  __m128d *w2 = (__m128d*)( w + row_c ) ;
  size_t i ;
  for( i = 0 ; i < NC ; i++ ) {
    tmp = *w1 ;
    *w1 = _mm_add_pd( SSE2_MUL( A , *w1 ) , SSE2_MUL( B , *w2 ) ) ;
    *w2 = _mm_sub_pd( SSE2_MULCONJ( A , *w2 ) , SSE2_MULCONJ( B , tmp ) ) ;
    w1++ , w2++ ;
  }
  return ;
}
// compactified M.su(2)^{\dagger} multiply of,
//
// | M[col(a)]    M[col(b)]    | | a b |^{\dagger}
// | M[col(a)+NC] M[col(b)+NC] | | c d |
// | .....        .......      |
//
void shortened_su2_multiply_dag( GLU_complex *U ,
                                 const GLU_complex a ,
                                 const GLU_complex b ,
                                 const size_t su2_index )
{
  // set A and B to be the conjugates of a and b
  register const __m128d A = _mm_setr_pd( creal( a ) , -cimag( a ) ) ;
  register const __m128d B = _mm_setr_pd( creal( b ) , -cimag( b ) ) ;
  // starting indices of the two columns touched by this su(2) subgroup
  const size_t col_a = (int)( Latt.su2_data[ su2_index ].idx_a % NC ) ;
  const size_t col_b = (int)( Latt.su2_data[ su2_index ].idx_b % NC ) ;
  register __m128d tmp ;
  __m128d *U1 = (__m128d*)( U + col_a ) ;
  __m128d *U2 = (__m128d*)( U + col_b ) ;
  size_t i ;
  for( i = 0 ; i < NC ; i++ ) {
    tmp = *U1 ;
    *U1 = _mm_add_pd( SSE2_MUL( tmp , A ) , SSE2_MUL( *U2 , B ) ) ;
    *U2 = _mm_sub_pd( SSE2_MUL_CONJ( *U2 , A ) , SSE2_MUL_CONJ( tmp , B ) ) ;
    // stride NC: walk down the two columns
    U1 += NC ; U2 += NC ;
  }
  return ;
}
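// The two routines above lean on GLU's complex-arithmetic macros, whose
// definitions are not part of this excerpt. A minimal SSE3 sketch of one
// plausible definition, assuming each __m128d packs (real, imag) as the
// _mm_setr_pd calls above do, and assuming the conventions
// SSE2_MUL(a,b) = a*b, SSE2_MULCONJ(a,b) = conj(a)*b,
// SSE2_MUL_CONJ(a,b) = a*conj(b):
static inline __m128d SSE2_MUL_sketch( const __m128d a , const __m128d b )
{
  // (ar*br - ai*bi , ar*bi + ai*br)
  const __m128d t = _mm_mul_pd( _mm_unpacklo_pd( a , a ) , b ) ;
  const __m128d u = _mm_mul_pd( _mm_unpackhi_pd( a , a ) ,
                                _mm_shuffle_pd( b , b , 1 ) ) ;
  return _mm_addsub_pd( t , u ) ;  // SSE3: ( t0-u0 , t1+u1 )
}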
#include <stdio.h>
#include <immintrin.h>

int main(void)
{
    /* _mm_testz_pd returns 1 iff no sign bit of (a AND b) is set */
    __m128d y2 = _mm_setr_pd(1, 2);
    __m128d y3 = _mm_setr_pd(1, 2);
    int r = _mm_testz_pd(y2, y3);
    printf("%d\n", r);          /* 1: all values positive */

    y2 = _mm_setr_pd(-1, -2);
    y3 = _mm_setr_pd(-1, -2);
    r = _mm_testz_pd(y2, y3);
    printf("%d\n", r);          /* 0: sign bits set in both operands */

    __m256d y0 = _mm256_setr_pd(1, 2, 3, 4);
    __m256d y1 = _mm256_setr_pd(1, 2, 3, 4);
    r = _mm256_testz_pd(y0, y1);
    printf("%d\n", r);

    //y1 = _mm256_setr_pd(11,2,3,4);
    y0 = _mm256_setr_pd(-1, -2, -3, -4);
    y1 = _mm256_setr_pd(-1, -2, -3, -4);
    r = _mm256_testz_pd(y0, y1);
    printf("%d\n", r);

    return 0;
}
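/*
 * A scalar model of what the program above exercises (a sketch inferred from
 * the instruction's ZF semantics, not the Intel pseudocode verbatim):
 * _mm_testz_pd ANDs the sign bits of the two operands and returns 1 iff
 * none survive, hence the outputs 1, 0, 1, 0.
 */
#include <math.h>

static int testz_pd_model(double a0, double a1, double b0, double b1)
{
    int s0 = signbit(a0) && signbit(b0);  /* sign bit set in both lane-0 values */
    int s1 = signbit(a1) && signbit(b1);
    return !(s0 || s1);
}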
#include <stdio.h>
#include <immintrin.h>

/* note: the .m256d_f64 / .m128d_f64 members below are MSVC-specific */
int main(void)
{
    //_mm256_permute_pd: each imm8 bit selects within its 128-bit lane
    __m256d da = _mm256_setr_pd(1, 2, 3, 4);
    printf("da: ");
    for (int i = 0; i < sizeof(da) / sizeof(da.m256d_f64[0]); i++)
        printf("%5.1f ", da.m256d_f64[i]);
    printf("\n");

    __m256d dc = _mm256_permute_pd(da, 0x02);
    printf("dc: ");   /* prints 1.0 2.0 3.0 3.0 */
    for (int i = 0; i < sizeof(dc) / sizeof(dc.m256d_f64[0]); i++)
        printf("%5.1f ", dc.m256d_f64[i]);
    printf("\n\n");

    //_mm_permute_pd
    __m128d fa = _mm_setr_pd(1, 2);
    printf("fa: ");
    for (int i = 0; i < sizeof(fa) / sizeof(fa.m128d_f64[0]); i++)
        printf("%5.1f ", fa.m128d_f64[i]);
    printf("\n");

    __m128d fc = _mm_permute_pd(fa, 0x01);
    printf("fc: ");   /* prints 2.0 1.0: the two lanes swapped */
    for (int i = 0; i < sizeof(fc) / sizeof(fc.m128d_f64[0]); i++)
        printf("%5.1f ", fc.m128d_f64[i]);
    printf("\n");

    return 0;
}
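/*
 * The union members used above do not exist under GCC/Clang. A portable way
 * to print the lanes (a sketch) stores through the intrinsics instead:
 */
static void print_m256d(const char *name, __m256d v)
{
    double lanes[4];
    _mm256_storeu_pd(lanes, v);      /* unaligned store, no union needed */
    printf("%s: ", name);
    for (int i = 0; i < 4; i++)
        printf("%5.1f ", lanes[i]);
    printf("\n");
}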
void init_sse_data()
{
#ifdef HAVE_SSE
    if (A_s == 0) {
        posix_memalign((void**)&A_s, 16, (sizeof(__m128)*12));
        A_s[ 0] = _mm_setr_ps(  1.0/6.0, -3.0/6.0,  3.0/6.0, -1.0/6.0 );
        A_s[ 1] = _mm_setr_ps(  4.0/6.0,  0.0/6.0, -6.0/6.0,  3.0/6.0 );
        A_s[ 2] = _mm_setr_ps(  1.0/6.0,  3.0/6.0,  3.0/6.0, -3.0/6.0 );
        A_s[ 3] = _mm_setr_ps(  0.0/6.0,  0.0/6.0,  0.0/6.0,  1.0/6.0 );
        A_s[ 4] = _mm_setr_ps( -0.5,  1.0, -0.5, 0.0 );
        A_s[ 5] = _mm_setr_ps(  0.0, -2.0,  1.5, 0.0 );
        A_s[ 6] = _mm_setr_ps(  0.5,  1.0, -1.5, 0.0 );
        A_s[ 7] = _mm_setr_ps(  0.0,  0.0,  0.5, 0.0 );
        A_s[ 8] = _mm_setr_ps(  1.0, -1.0,  0.0, 0.0 );
        A_s[ 9] = _mm_setr_ps( -2.0,  3.0,  0.0, 0.0 );
        A_s[10] = _mm_setr_ps(  1.0, -3.0,  0.0, 0.0 );
        A_s[11] = _mm_setr_ps(  0.0,  1.0,  0.0, 0.0 );
    }
#endif
#ifdef HAVE_SSE2
    /* double precision: each 4-wide row above is split into two __m128d */
    if (A_d == 0) {
        posix_memalign((void**)&A_d, 16, (sizeof(__m128d)*24));
        A_d[ 0] = _mm_setr_pd(  3.0/6.0, -1.0/6.0 );
        A_d[ 1] = _mm_setr_pd(  1.0/6.0, -3.0/6.0 );
        A_d[ 2] = _mm_setr_pd( -6.0/6.0,  3.0/6.0 );
        A_d[ 3] = _mm_setr_pd(  4.0/6.0,  0.0/6.0 );
        A_d[ 4] = _mm_setr_pd(  3.0/6.0, -3.0/6.0 );
        A_d[ 5] = _mm_setr_pd(  1.0/6.0,  3.0/6.0 );
        A_d[ 6] = _mm_setr_pd(  0.0/6.0,  1.0/6.0 );
        A_d[ 7] = _mm_setr_pd(  0.0/6.0,  0.0/6.0 );
        A_d[ 8] = _mm_setr_pd( -0.5,  0.0 );
        A_d[ 9] = _mm_setr_pd( -0.5,  1.0 );
        A_d[10] = _mm_setr_pd(  1.5,  0.0 );
        A_d[11] = _mm_setr_pd(  0.0, -2.0 );
        A_d[12] = _mm_setr_pd( -1.5,  0.0 );
        A_d[13] = _mm_setr_pd(  0.5,  1.0 );
        A_d[14] = _mm_setr_pd(  0.5,  0.0 );
        A_d[15] = _mm_setr_pd(  0.0,  0.0 );
        A_d[16] = _mm_setr_pd(  0.0,  0.0 );
        A_d[17] = _mm_setr_pd(  1.0, -1.0 );
        A_d[18] = _mm_setr_pd(  0.0,  0.0 );
        A_d[19] = _mm_setr_pd( -2.0,  3.0 );
        A_d[20] = _mm_setr_pd(  0.0,  0.0 );
        A_d[21] = _mm_setr_pd(  1.0, -3.0 );
        A_d[22] = _mm_setr_pd(  0.0,  0.0 );
        A_d[23] = _mm_setr_pd(  0.0,  1.0 );
    }
#endif
}
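/*
 * How the split double-precision rows above can be consumed (a sketch, not
 * the einspline API): comparing A_s with A_d shows that A_d[2k] holds
 * elements 2,3 and A_d[2k+1] holds elements 0,1 of row k of the 4x4 spline
 * matrix. A dot product against a 4-vector split the same way:
 */
static inline double row_dot(__m128d hi, __m128d lo, __m128d t_hi, __m128d t_lo)
{
    __m128d s = _mm_add_pd(_mm_mul_pd(hi, t_hi), _mm_mul_pd(lo, t_lo));
    s = _mm_add_sd(s, _mm_unpackhi_pd(s, s));   /* horizontal add */
    return _mm_cvtsd_f64(s);
}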
Point2d CameraATAN::Project(const Point3d& p3d)
{
    if(p3d.z <= 0) return Point2d(-1, -1);

#ifdef __SSE3__
    if(useDistortion)
    {
        __m128d xy = _mm_setr_pd(p3d.x, p3d.y);
        if(p3d.z != 1.)
        {
            // normalise onto the z=1 plane, matching the scalar path below
            xy = _mm_div_pd(xy, _mm_set1_pd(p3d.z));
        }
        __m128d xy2 = _mm_mul_pd(xy, xy);
        xy2 = _mm_hadd_pd(xy2, xy2);        // x*x + y*y in both lanes
        xy2 = _mm_sqrt_pd(xy2);
        double r = ((Point2d*)&xy2)->x;
        if(r < 0.001 || d == 0.0)
            r = 1.0;
        else
            r = (d_inv * atan(r * tan2w) / r);   // ATAN (FOV) distortion factor

        xy = _mm_mul_pd(_mm_setr_pd(fx, fy), xy);
        xy = _mm_mul_pd(xy, _mm_set1_pd(r));
        xy = _mm_add_pd(xy, _mm_setr_pd(cx, cy));
        return *(Point2d*)&xy;
    }
    else
    {
        if(p3d.z == 1.)
        {
            __m128d xy = _mm_setr_pd(p3d.x, p3d.y);
            xy = _mm_add_pd(_mm_setr_pd(cx, cy),
                            _mm_mul_pd(xy, _mm_setr_pd(fx, fy)));
            return *(Point2d*)&xy;
        }
        else if(p3d.z > 0)
        {
            double z_inv = 1. / p3d.z;
            return Point2d(fx*z_inv*p3d.x + cx, fy*z_inv*p3d.y + cy);
        }
    }
#else
    if(useDistortion)
    {
        double X = p3d.x, Y = p3d.y;
        if(p3d.z != 1.)
        {
            double z_inv = 1. / p3d.z;
            X *= z_inv; Y *= z_inv;
        }
        double r = sqrt(X*X + Y*Y);
        if(r < 0.001 || d == 0.0)
            r = 1.0;
        else
            r = (d_inv * atan(r * tan2w) / r);
        return Point2d(cx + fx*r*X, cy + fy*r*Y);
    }
    else
    {
        if(p3d.z == 1.)
        {
            return Point2d(fx*p3d.x + cx, fy*p3d.y + cy);
        }
        else
        {
            double z_inv = 1. / p3d.z;
            return Point2d(fx*z_inv*p3d.x + cx, fy*z_inv*p3d.y + cy);
        }
    }
#endif
    return Point2d(-1, -1);   // keep the compiler happy
}
// rotate a matrix U = su2_i*U where su2_i is an su2 matrix embedded in suN
void su2_rotate( GLU_complex U[ NCNC ] ,
                 const GLU_complex s0 ,
                 const GLU_complex s1 ,
                 const size_t su2_index )
{
#if NC == 3
  __m128d *u = (__m128d*)U ;
  register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  register __m128d tmp0 , tmp1 , a , b ;
  switch( su2_index%3 ) { // again I don't like this
  case 0 :
    // first one
    a = *( u + 0 ) ; b = *( u + 3 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 0 ) = tmp0 ;
    *( u + 3 ) = tmp1 ;
    // second one
    a = *( u + 1 ) ; b = *( u + 4 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 1 ) = tmp0 ;
    *( u + 4 ) = tmp1 ;
    // third
    a = *( u + 2 ) ; b = *( u + 5 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 2 ) = tmp0 ;
    *( u + 5 ) = tmp1 ;
    break ;
  case 1 :
    // first one
    a = *( u + 3 ) ; b = *( u + 6 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 3 ) = tmp0 ;
    *( u + 6 ) = tmp1 ;
    // second one
    a = *( u + 4 ) ; b = *( u + 7 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 4 ) = tmp0 ;
    *( u + 7 ) = tmp1 ;
    // third
    a = *( u + 5 ) ; b = *( u + 8 ) ;
    tmp0 = _mm_add_pd( SSE2_MUL( sm0 , a ) , SSE2_MUL( sm1 , b ) ) ;
    tmp1 = _mm_sub_pd( SSE2_MULCONJ( sm0 , b ) , SSE2_MULCONJ( sm1 , a ) ) ;
    *( u + 5 ) = tmp0 ;
    *( u + 8 ) = tmp1 ;
    break ;
  case 2 :
    // first one
    a = *( u + 0 ) ; b = *( u + 6 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) , SSE2_MUL( sm1 , a ) ) ;
    *( u + 0 ) = tmp0 ;
    *( u + 6 ) = tmp1 ;
    // second
    a = *( u + 1 ) ; b = *( u + 7 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) , SSE2_MUL( sm1 , a ) ) ;
    *( u + 1 ) = tmp0 ;
    *( u + 7 ) = tmp1 ;
    // third
    a = *( u + 2 ) ; b = *( u + 8 ) ;
    tmp0 = _mm_sub_pd( SSE2_MULCONJ( sm0 , a ) , SSE2_MULCONJ( sm1 , b ) ) ;
    tmp1 = _mm_add_pd( SSE2_MUL( sm0 , b ) , SSE2_MUL( sm1 , a ) ) ;
    *( u + 2 ) = tmp0 ;
    *( u + 8 ) = tmp1 ;
    break ;
  }
#elif NC == 2
  __m128d *u = (__m128d*)U ;
  register const __m128d sm0 = _mm_setr_pd( creal( s0 ) , cimag( s0 ) ) ;
  register const __m128d sm1 = _mm_setr_pd( creal( s1 ) , cimag( s1 ) ) ;
  *( u + 0 ) = _mm_add_pd( SSE2_MUL( sm0 , *( u + 0 ) ) ,
                           SSE2_MUL( sm1 , *( u + 2 ) ) ) ;
  *( u + 1 ) = _mm_add_pd( SSE2_MUL( sm0 , *( u + 1 ) ) ,
                           SSE2_MUL( sm1 , *( u + 3 ) ) ) ;
  *( u + 2 ) = SSE_FLIP( SSE2_CONJ( *( u + 1 ) ) ) ;
  *( u + 3 ) = SSE2_CONJ( *( u + 0 ) ) ;
#else
  // just a call to su2 multiply
  shortened_su2_multiply( U , s0 , s1 , su2_index ) ;
#endif
  return ;
}
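// SSE2_CONJ and SSE_FLIP are not defined in this excerpt. Hypothetical
// definitions consistent with how the NC == 2 branch above uses them, where
// the bottom row of an SU(2) matrix is rebuilt as ( -conj(b) , conj(a) ):
#define SSE2_CONJ_sketch( a ) ( _mm_mul_pd( a , _mm_setr_pd(  1.0 , -1.0 ) ) ) // conj
#define SSE_FLIP_sketch( a )  ( _mm_mul_pd( a , _mm_set1_pd( -1.0 ) ) )        // minus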
inline vector2d(double f0, double f1) : m_value(_mm_setr_pd(f0, f1)) { }
static inline Simd set(double x, double y, double z, double w)
{
    Simd res;
    res.reg[0] = _mm_setr_pd(x, y);
    res.reg[1] = _mm_setr_pd(z, w);
    return res;
}
static inline Simd set(double x, double y)
{
    Simd res;
    res.reg = _mm_setr_pd(x, y);
    return res;
}
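// Minimal context the wrappers above assume (a sketch, not the original
// headers; the two set() overloads are presumably specializations of one
// Simd template): storage is a single __m128d for the 2-wide case and a
// pair of registers for the 4-wide case.
struct Simd2 { __m128d reg; };        // matches set(double, double)
struct Simd4 { __m128d reg[2]; };     // matches set(double, double, double, double)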
mlib_status
__mlib_SignalIIR_Biquad_F32S_F32S(
    mlib_f32 *dst,
    const mlib_f32 *src,
    void *filter,
    mlib_s32 n)
{
    mlib_s32 i;
    mlib_d64 a0, a1, a2, b1, b2;
    mlib_d64 x10, x11, x20, x21, y10, y11, y20, y21;
    __m128d sa0, sa1, sa2, sb1, sb2;
    __m128d sx1, sx2, sy1, sy2, sr0, sx0;
    __m128d stmp1, stmp2, stmp3, stmp4, stmp5;
    mlib_d64 tr0[2], tx1[2], tx2[2], ty1[2], ty2[2];
    mlib_IIR_filt_F32S *pflt = (mlib_IIR_filt_F32S *) filter;

    if (filter == NULL || src == NULL || dst == NULL)
        return (MLIB_NULLPOINTER);

    if (n <= 0)
        return (MLIB_OUTOFRANGE);

    n *= 2;    /* stereo: two interleaved samples per frame */

    a0 = pflt->a0;
    a1 = pflt->a1;
    a2 = pflt->a2;
    b1 = pflt->b1;
    b2 = pflt->b2;
    x10 = pflt->x10;
    x20 = pflt->x20;
    y10 = pflt->y10;
    y20 = pflt->y20;
    x11 = pflt->x11;
    x21 = pflt->x21;
    y11 = pflt->y11;
    y21 = pflt->y21;

    sa0 = _mm_set1_pd(a0);
    sa1 = _mm_set1_pd(a1);
    sa2 = _mm_set1_pd(a2);
    sb1 = _mm_set1_pd(b1);
    sb2 = _mm_set1_pd(b2);

    /* lane 0 carries the left channel, lane 1 the right, so the
     * feedback recurrence is exact per lane */
    sx1 = _mm_setr_pd(x10, x11);
    sx2 = _mm_setr_pd(x20, x21);
    sx0 = _mm_setr_pd(src[0], src[1]);
    sy1 = _mm_setr_pd(y10, y11);
    sy2 = _mm_setr_pd(y20, y21);

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (i = 0; i < n; i += 2) {
        /* y = a0*x + a1*x[-1] + a2*x[-2] + b1*y[-1] + b2*y[-2] */
        stmp1 = _mm_mul_pd(sa0, sx0);
        stmp2 = _mm_mul_pd(sa1, sx1);
        stmp3 = _mm_mul_pd(sa2, sx2);
        stmp4 = _mm_mul_pd(sb2, sy2);
        stmp5 = _mm_mul_pd(sb1, sy1);
        stmp1 = _mm_add_pd(stmp1, stmp2);
        stmp3 = _mm_add_pd(stmp3, stmp4);
        stmp1 = _mm_add_pd(stmp1, stmp3);
        sr0 = _mm_add_pd(stmp1, stmp5);

        /* shift the delay line */
        sx2 = sx1;
        sx1 = sx0;
        sx0 = _mm_setr_pd(src[2], src[3]);
        sy2 = sy1;
        sy1 = sr0;

        _mm_storeu_pd(tr0, sr0);
        dst[i] = tr0[0];
        dst[i + 1] = tr0[1];
        src += 2;
    }

    /* save the filter state */
    _mm_storeu_pd(tx1, sx1);
    _mm_storeu_pd(tx2, sx2);
    _mm_storeu_pd(ty1, sy1);
    _mm_storeu_pd(ty2, sy2);
    pflt->x10 = tx1[0];
    pflt->x11 = tx1[1];
    pflt->x20 = tx2[0];
    pflt->x21 = tx2[1];
    pflt->y10 = ty1[0];
    pflt->y11 = ty1[1];
    pflt->y20 = ty2[0];
    pflt->y21 = ty2[1];

    return (MLIB_SUCCESS);
}
mlib_status
__mlib_SignalIIR_Biquad_F32_F32(
    mlib_f32 *dst,
    const mlib_f32 *src,
    void *filter,
    mlib_s32 n)
{
    mlib_d64 a0, a1, a2, b1, b2, x1, x2, y1, y2, r0, x0;
    mlib_s32 i;
    __m128d sa0, sa1, sa2, sb1, sb2, sx1, sx2, sy1, sy2, sr0, sx0;
    __m128d stmp1, stmp2, stmp3, stmp4, stmp5;
    mlib_d64 tr0[2], tx0[2], tx1[2], tx2[2], ty1[2], ty2[2];
    mlib_IIR_filt_F32 *pflt = (mlib_IIR_filt_F32 *) filter;

    if (filter == NULL || src == NULL || dst == NULL)
        return (MLIB_NULLPOINTER);

    if (n <= 0)
        return (MLIB_OUTOFRANGE);

    a0 = pflt->a0;
    a1 = pflt->a1;
    a2 = pflt->a2;
    b1 = pflt->b1;
    b2 = pflt->b2;
    x1 = pflt->x1;
    x2 = pflt->x2;
    y1 = pflt->y1;
    y2 = pflt->y2;

    /* scalar warm-up: the first two outputs seed the vector delay line */
    for (i = 0; (i < 2) && (i < n); i++) {
        x0 = src[i];
        r0 = a0 * x0 + a1 * x1 + a2 * x2 + b2 * y2 + b1 * y1;
        x2 = x1;
        x1 = x0;
        y2 = y1;
        y1 = r0;
        dst[i] = r0;
    }

    sa0 = _mm_set1_pd(pflt->a0);
    sa1 = _mm_set1_pd(pflt->a1);
    sa2 = _mm_set1_pd(pflt->a2);
    sb1 = _mm_set1_pd(pflt->b1);
    sb2 = _mm_set1_pd(pflt->b2);

    x0 = src[i];
    sx0 = _mm_setr_pd(src[i], src[i + 1]);
    sx1 = _mm_setr_pd(x1, x0);
    sx2 = _mm_setr_pd(x2, x1);
    /* lane 1 omits its b1*y term; it is patched in after the fact */
    sy1 = _mm_setr_pd(y1, 0);
    sy2 = _mm_setr_pd(y2, y1);

    for (; i < n - 1; i += 2) {
        stmp1 = _mm_mul_pd(sa0, sx0);
        stmp2 = _mm_mul_pd(sa1, sx1);
        stmp3 = _mm_mul_pd(sa2, sx2);
        stmp4 = _mm_mul_pd(sb2, sy2);
        stmp5 = _mm_mul_pd(sb1, sy1);
        stmp1 = _mm_add_pd(stmp1, stmp2);
        stmp3 = _mm_add_pd(stmp3, stmp4);
        stmp1 = _mm_add_pd(stmp1, stmp3);
        sr0 = _mm_add_pd(stmp1, stmp5);

        _mm_storeu_pd(tr0, sr0);
        tr0[1] += (b1 * tr0[0]);    /* complete the lane-1 recurrence */

        _mm_storeu_pd(tx0, sx0);
        sx2 = sx0;
        sx1 = _mm_setr_pd(tx0[1], src[i + 2]);
        sx0 = _mm_setr_pd(src[i + 2], src[i + 3]);
        sy1 = _mm_setr_pd(tr0[1], 0);
        sy2 = _mm_setr_pd(tr0[0], tr0[1]);

        dst[i] = tr0[0];
        dst[i + 1] = tr0[1];
    }

    _mm_storeu_pd(tx1, sx1);
    x1 = tx1[0];
    _mm_storeu_pd(tx2, sx2);
    x2 = tx2[0];
    _mm_storeu_pd(ty1, sy1);
    y1 = ty1[0];
    _mm_storeu_pd(ty2, sy2);
    y2 = ty2[0];

    /* scalar tail for an odd sample count */
    for (; i < n; i++) {
        x0 = src[i];
        r0 = a0 * x0 + a1 * x1 + a2 * x2 + b2 * y2 + b1 * y1;
        x2 = x1;
        x1 = x0;
        y2 = y1;
        y1 = r0;
        dst[i] = r0;
    }

    pflt->x1 = x1;
    pflt->x2 = x2;
    pflt->y1 = y1;
    pflt->y2 = y2;

    return (MLIB_SUCCESS);
}
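/*
 * Why the sy1 = {y1, 0} / "tr0[1] += b1 * tr0[0]" pattern in the mono
 * routines works: the biquad recurrence
 *     y[n] = a0*x[n] + a1*x[n-1] + a2*x[n-2] + b1*y[n-1] + b2*y[n-2]
 * makes y[n+1] depend on y[n], so two consecutive outputs cannot be computed
 * fully in parallel. Lane 1 is therefore evaluated with its b1*y[n] term
 * zeroed out, and the missing contribution is added in scalar code once
 * lane 0 (y[n]) is known. The stereo routine above needs no such patch,
 * since its two lanes are independent channels.
 */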
mlib_status
__mlib_SignalIIR_Biquad_S16_S16_Sat(
    mlib_s16 *dst,
    const mlib_s16 *src,
    void *filter,
    mlib_s32 n)
{
#ifndef MLIB_USE_FTOI_CLAMPING
    mlib_s32 d;
#endif /* MLIB_USE_FTOI_CLAMPING */
    mlib_d64 a0, a1, a2, b1, b2, x1, x2, y1, y2, r0, x0;
    mlib_s32 i, j;
    __m128d sa0, sa1, sa2, sb1, sb2, sx1, sx2, sy1, sy2, sr0, sx0;
    __m128d stmp1, stmp2, stmp3, stmp4, stmp5;
    mlib_d64 tr0[2], tx0[2], tx1[2], tx2[2], ty1[2], ty2[2];
    mlib_IIR_filt_S16 *pflt = (mlib_IIR_filt_S16 *) filter;

    if (filter == NULL || src == NULL || dst == NULL)
        return (MLIB_NULLPOINTER);

    if (n <= 0)
        return (MLIB_OUTOFRANGE);

    a0 = pflt->a0;
    a1 = pflt->a1;
    a2 = pflt->a2;
    b1 = pflt->b1;
    b2 = pflt->b2;
    x1 = pflt->x1;
    x2 = pflt->x2;
    y1 = pflt->y1;
    y2 = pflt->y2;

    /* scalar warm-up: the first two outputs seed the vector delay line */
    for (j = 0; (j < n) && (j < 2); j++) {
        x0 = src[j];
        r0 = a0 * x0 + a1 * x1 + a2 * x2 + b2 * y2 + b1 * y1;
        x2 = x1;
        x1 = x0;
        y2 = y1;
        y1 = r0;

#ifndef MLIB_USE_FTOI_CLAMPING
        if (r0 > MLIB_S16_MAX)
            d = MLIB_S16_MAX;
        else if (r0 < MLIB_S16_MIN)
            d = MLIB_S16_MIN;
        else
            d = (mlib_s16)r0;
        dst[j] = d;
#else /* MLIB_USE_FTOI_CLAMPING */
        dst[j] = ((mlib_s16)r0);
#endif /* MLIB_USE_FTOI_CLAMPING */
    }

    sa0 = _mm_set1_pd(a0);
    sa1 = _mm_set1_pd(a1);
    sa2 = _mm_set1_pd(a2);
    sb1 = _mm_set1_pd(b1);
    sb2 = _mm_set1_pd(b2);

    sx0 = _mm_setr_pd(src[j], src[j + 1]);
    sx1 = _mm_setr_pd(x1, src[j]);
    sx2 = _mm_setr_pd(x2, x1);
    /* lane 1 omits its b1*y term; it is patched in after the fact */
    sy1 = _mm_setr_pd(y1, 0);
    sy2 = _mm_setr_pd(y2, y1);

    for (i = j; i < n - 1; i += 2) {
        stmp1 = _mm_mul_pd(sa0, sx0);
        stmp2 = _mm_mul_pd(sa1, sx1);
        stmp3 = _mm_mul_pd(sa2, sx2);
        stmp4 = _mm_mul_pd(sb1, sy1);
        stmp5 = _mm_mul_pd(sb2, sy2);
        stmp1 = _mm_add_pd(stmp1, stmp2);
        stmp3 = _mm_add_pd(stmp3, stmp4);
        stmp1 = _mm_add_pd(stmp1, stmp3);
        sr0 = _mm_add_pd(stmp1, stmp5);

        _mm_storeu_pd(tr0, sr0);
        tr0[1] += (b1 * tr0[0]);    /* complete the lane-1 recurrence */

        _mm_storeu_pd(tx0, sx0);
        sx2 = sx0;
        sx1 = _mm_setr_pd(tx0[1], src[i + 2]);
        sx0 = _mm_setr_pd(src[i + 2], src[i + 3]);
        sy1 = _mm_setr_pd(tr0[1], 0);
        sy2 = _mm_setr_pd(tr0[0], tr0[1]);

#ifndef MLIB_USE_FTOI_CLAMPING
        if (tr0[0] > MLIB_S16_MAX)
            d = MLIB_S16_MAX;
        else if (tr0[0] < MLIB_S16_MIN)
            d = MLIB_S16_MIN;
        else
            d = (mlib_s16)tr0[0];
        dst[i] = d;
#else /* MLIB_USE_FTOI_CLAMPING */
        dst[i] = (mlib_s16)tr0[0];
#endif /* MLIB_USE_FTOI_CLAMPING */

#ifndef MLIB_USE_FTOI_CLAMPING
        if (tr0[1] > MLIB_S16_MAX)
            d = MLIB_S16_MAX;
        else if (tr0[1] < MLIB_S16_MIN)
            d = MLIB_S16_MIN;
        else
            d = (mlib_s16)tr0[1];
        dst[i + 1] = d;
#else /* MLIB_USE_FTOI_CLAMPING */
        dst[i + 1] = (mlib_s16)tr0[1];
#endif /* MLIB_USE_FTOI_CLAMPING */
    }

    _mm_storeu_pd(tx1, sx1);
    x1 = tx1[0];
    _mm_storeu_pd(tx2, sx2);
    x2 = tx2[0];
    _mm_storeu_pd(ty1, sy1);
    y1 = ty1[0];
    _mm_storeu_pd(ty2, sy2);
    y2 = ty2[0];

    /* scalar tail for an odd sample count */
    for (; (i < n); i++) {
        x0 = src[i];
        r0 = a0 * x0 + a1 * x1 + a2 * x2 + b2 * y2 + b1 * y1;
        x2 = x1;
        x1 = x0;
        y2 = y1;
        y1 = r0;

#ifndef MLIB_USE_FTOI_CLAMPING
        if (r0 > MLIB_S16_MAX)
            d = MLIB_S16_MAX;
        else if (r0 < MLIB_S16_MIN)
            d = MLIB_S16_MIN;
        else
            d = (mlib_s16)r0;
        dst[i] = d;
#else /* MLIB_USE_FTOI_CLAMPING */
        dst[i] = ((mlib_s16)r0);
#endif /* MLIB_USE_FTOI_CLAMPING */
    }

    pflt->x1 = x1;
    pflt->x2 = x2;
    pflt->y1 = y1;
    pflt->y2 = y2;

    return (MLIB_SUCCESS);
}
DBL AVXFMA4Noise(const Vector3d& EPoint, int noise_generator)
{
    DBL x, y, z;
    DBL *mp;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;
    DBL sum;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_Noise]++;

    if (noise_generator == kNoiseGen_Perlin)
    {
        // The 1.59 and 0.985 are to correct for some biasing problems with
        // the random # generator used to create the noise tables. Final
        // range of values is about 5.0e-4 below 0.0 and above 1.0. Mean
        // value is 0.49 (ideally it would be 0.5).
        sum = 0.5 * (1.59 * SolidNoise(EPoint) + 0.985);

        // Clamp final value to 0-1 range
        if (sum < 0.0) sum = 0.0;
        if (sum > 1.0) sum = 1.0;

        return sum;
    }

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (long)z; */
    /* JB fix for the range problem */
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    mp = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL *mp2 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL *mp3 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];
    DBL *mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL *mp5 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL *mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL *mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL *mp8 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    // smoothstep weights s = t*t*(3 - 2*t), via the FMA4 _mm_nmacc_pd
    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));
    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);

    __m128d y_mm = _mm_unpacklo_pd(iy_mm, jy_mm);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p, s_mm;
    __m128d int_sum1 = _mm_setzero_pd();

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_tz);
    INCRSUMP2(mp, mp2, s_mm, ix_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_txty_txsy, mm_sz);
    INCRSUMP2(mp3, mp4, s_mm, ix_mm, y_mm, jz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    INCRSUMP2(mp5, mp6, s_mm, jx_mm, y_mm, iz_mm, int_sum1);

    s_mm = _mm_mul_pd(mm_sxty_sxsy, mm_sz);
    INCRSUMP2(mp7, mp8, s_mm, jx_mm, y_mm, jz_mm, int_sum1);

    int_sum1 = _mm_hadd_pd(int_sum1, int_sum1);

    if (noise_generator == kNoiseGen_RangeCorrected)
    {
        /* details of range here:
           Min, max:    -1.05242, 0.988997
           Mean:        -0.0191481
           Median:      -0.535493
           Std Dev:      0.256828

           We want to change it to as close to [0,1] as possible.
        */
        const __m128d r2 = _mm_set_sd(0.48985582);
        const __m128d r1r2 = _mm_set_sd(1.05242 * 0.48985582);
        int_sum1 = _mm_macc_sd(int_sum1, r2, r1r2);
    }
    else
    {
        int_sum1 = _mm_add_sd(int_sum1, _mm_set_sd(0.5));
    }

    int_sum1 = _mm_min_sd(one, int_sum1);
    int_sum1 = _mm_max_sd(_mm_setzero_pd(), int_sum1);
    _mm_store_sd(&sum, int_sum1);

    return (sum);
}
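/*
 * Where the range-correction constants above come from (arithmetic check):
 * with raw noise in [-1.05242, 0.988997], the affine map
 *     sum' = (sum + 1.05242) * 0.48985582
 * sends -1.05242 -> 0.0 and 0.988997 -> ~1.00000, which is exactly what
 * _mm_macc_sd(sum, r2, r1r2) = sum*r2 + r1*r2 computes with r1 = 1.05242
 * and r2 = 0.48985582; the min/max clamp then pins the residue to [0,1].
 */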
void AVXFMA4DNoise(Vector3d& result, const Vector3d& EPoint)
{
    DBL x, y, z;
    int ix, iy, iz;
    int ixiy_hash, ixjy_hash, jxiy_hash, jxjy_hash;

    // TODO FIXME - global statistics reference
    // Stats[Calls_To_DNoise]++;

    x = EPoint[X];
    y = EPoint[Y];
    z = EPoint[Z];

    /* its equivalent integer lattice point. */
    /* ix = (int)x; iy = (int)y; iz = (int)z;
       x_ix = x - ix; y_iy = y - iy; z_iz = z - iz; */
    /* JB fix for the range problem */
    __m128d xy = _mm_setr_pd(x, y);
    __m128d zn = _mm_set_sd(z);
    __m128d epsy = _mm_set1_pd(1.0 - EPSILON);
    __m128d xy_e = _mm_sub_pd(xy, epsy);
    __m128d zn_e = _mm_sub_sd(zn, epsy);
    __m128i tmp_xy = _mm_cvttpd_epi32(_mm_blendv_pd(xy, xy_e, xy));
    __m128i tmp_zn = _mm_cvttpd_epi32(_mm_blendv_pd(zn, zn_e, zn));

    __m128i noise_min_xy = _mm_setr_epi32(NOISE_MINX, NOISE_MINY, 0, 0);
    __m128i noise_min_zn = _mm_set1_epi32(NOISE_MINZ);

    __m128d xy_ixy = _mm_sub_pd(xy, _mm_cvtepi32_pd(tmp_xy));
    __m128d zn_izn = _mm_sub_sd(zn, _mm_cvtepi32_pd(tmp_zn));

    const __m128i fff = _mm_set1_epi32(0xfff);
    __m128i i_xy = _mm_and_si128(_mm_sub_epi32(tmp_xy, noise_min_xy), fff);
    __m128i i_zn = _mm_and_si128(_mm_sub_epi32(tmp_zn, noise_min_zn), fff);

    ix = _mm_extract_epi32(i_xy, 0);
    iy = _mm_extract_epi32(i_xy, 1);
    iz = _mm_extract_epi32(i_zn, 0);

    ixiy_hash = Hash2d(ix, iy);
    jxiy_hash = Hash2d(ix + 1, iy);
    ixjy_hash = Hash2d(ix, iy + 1);
    jxjy_hash = Hash2d(ix + 1, iy + 1);

    DBL* mp1 = &RTable[Hash1dRTableIndex(ixiy_hash, iz)];
    DBL* mp2 = &RTable[Hash1dRTableIndex(jxiy_hash, iz)];
    DBL* mp3 = &RTable[Hash1dRTableIndex(jxjy_hash, iz)];
    DBL* mp4 = &RTable[Hash1dRTableIndex(ixjy_hash, iz)];
    DBL* mp5 = &RTable[Hash1dRTableIndex(ixjy_hash, iz + 1)];
    DBL* mp6 = &RTable[Hash1dRTableIndex(jxjy_hash, iz + 1)];
    DBL* mp7 = &RTable[Hash1dRTableIndex(jxiy_hash, iz + 1)];
    DBL* mp8 = &RTable[Hash1dRTableIndex(ixiy_hash, iz + 1)];

    const __m128d three = _mm_set1_pd(3.0);
    const __m128d two = _mm_set1_pd(2.0);
    const __m128d one = _mm_set1_pd(1.0);

    __m128d ix_mm = _mm_unpacklo_pd(xy_ixy, xy_ixy);
    __m128d iy_mm = _mm_unpackhi_pd(xy_ixy, xy_ixy);
    __m128d iz_mm = _mm_unpacklo_pd(zn_izn, zn_izn);

    __m128d jx_mm = _mm_sub_pd(ix_mm, one);
    __m128d jy_mm = _mm_sub_pd(iy_mm, one);
    __m128d jz_mm = _mm_sub_pd(iz_mm, one);

    __m128d mm_sz = _mm_mul_pd(_mm_mul_pd(iz_mm, iz_mm), _mm_nmacc_pd(two, iz_mm, three));
    __m128d mm_tz = _mm_sub_pd(one, mm_sz);
    __m128d mm_sxy = _mm_mul_pd(_mm_mul_pd(xy_ixy, xy_ixy), _mm_nmacc_pd(two, xy_ixy, three));
    __m128d mm_txy = _mm_sub_pd(one, mm_sxy);
    __m128d mm_tysy = _mm_unpackhi_pd(mm_txy, mm_sxy);
    __m128d mm_txty_txsy = _mm_mul_pd(_mm_unpacklo_pd(mm_txy, mm_txy), mm_tysy);
    __m128d mm_sxty_sxsy = _mm_mul_pd(_mm_unpacklo_pd(mm_sxy, mm_sxy), mm_tysy);
    __m128d mm_txty_txsy_tz = _mm_mul_pd(mm_txty_txsy, mm_tz);
    __m128d mm_txty_txsy_sz = _mm_mul_pd(mm_txty_txsy, mm_sz);
    __m128d mm_sxty_sxsy_tz = _mm_mul_pd(mm_sxty_sxsy, mm_tz);
    __m128d mm_sxty_sxsy_sz = _mm_mul_pd(mm_sxty_sxsy, mm_sz);

    __m128d mp_t1, mp_t2, mp1_mm, mp2_mm, mp4_mm, mp6_mm, sum_p;
    __m128d sum_X_Y = _mm_setzero_pd();
    __m128d sum__Z = _mm_setzero_pd();

    // X and Y components of the gradient sum, two lattice corners at a time
    __m128d mm_s1 = _mm_unpacklo_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp1, mp1 + 8, mm_s1, ix_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s2 = _mm_unpacklo_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp2, mp2 + 8, mm_s2, jx_mm, iy_mm, iz_mm, sum_X_Y);

    __m128d mm_s3 = _mm_unpackhi_pd(mm_sxty_sxsy_tz, mm_sxty_sxsy_tz);
    INCRSUMP2(mp3, mp3 + 8, mm_s3, jx_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s4 = _mm_unpackhi_pd(mm_txty_txsy_tz, mm_txty_txsy_tz);
    INCRSUMP2(mp4, mp4 + 8, mm_s4, ix_mm, jy_mm, iz_mm, sum_X_Y);

    __m128d mm_s5 = _mm_unpackhi_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp5, mp5 + 8, mm_s5, ix_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s6 = _mm_unpackhi_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp6, mp6 + 8, mm_s6, jx_mm, jy_mm, jz_mm, sum_X_Y);

    __m128d mm_s7 = _mm_unpacklo_pd(mm_sxty_sxsy_sz, mm_sxty_sxsy_sz);
    INCRSUMP2(mp7, mp7 + 8, mm_s7, jx_mm, iy_mm, jz_mm, sum_X_Y);

    __m128d mm_s8 = _mm_unpacklo_pd(mm_txty_txsy_sz, mm_txty_txsy_sz);
    INCRSUMP2(mp8, mp8 + 8, mm_s8, ix_mm, iy_mm, jz_mm, sum_X_Y);

    // Z component
    __m128d iy_jy = _mm_unpacklo_pd(iy_mm, jy_mm);
    INCRSUMP2(mp1 + 16, mp4 + 16, mm_txty_txsy_tz, ix_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp8 + 16, mp5 + 16, mm_txty_txsy_sz, ix_mm, iy_jy, jz_mm, sum__Z);
    INCRSUMP2(mp2 + 16, mp3 + 16, mm_sxty_sxsy_tz, jx_mm, iy_jy, iz_mm, sum__Z);
    INCRSUMP2(mp7 + 16, mp6 + 16, mm_sxty_sxsy_sz, jx_mm, iy_jy, jz_mm, sum__Z);

    sum__Z = _mm_hadd_pd(sum__Z, sum__Z);

    _mm_storeu_pd(*result, sum_X_Y);
    _mm_store_sd(&result[Z], sum__Z);
}
static int gauss_jordan( GLU_complex M_1[ NCNC ] ,
                         const GLU_complex M[ NCNC ] )
{
  __m128d a[ NCNC ] GLUalign ; // temporary space to overwrite matrix
  register __m128d best , attempt , m1 , fac ;
  size_t i , j , piv ;

  // equate the necessary parts into double complex precision
  for( i = 0 ; i < NCNC ; i++ ) {
    a[ i ] = _mm_setr_pd( creal( M[i] ) , cimag( M[i] ) ) ;
    M_1[ i ] = ( i%(NC+1) ) ? 0.0 : 1.0 ; // M_1 starts as the identity
  }

  // set these pointers, pB will be the inverse
  __m128d *pB = (__m128d*)M_1 , *pA = (__m128d*)a ;

  // loop over diagonal of the square matrix M
  for( i = 0 ; i < NC-1 ; i++ ) {

    // column pivot by selecting the largest in magnitude value,
    // searching column i of the working matrix pA (not the inverse)
    piv = i ;
    best = absfac( *( pA + i*(NC+1) ) ) ;
    for( j = i+1 ; j < NC ; j++ ) {
      attempt = absfac( *( pA + i + j*NC ) ) ;
      if( _mm_ucomilt_sd( best , attempt ) ) {
        piv = j ;
        best = attempt ;
      }
    }

    // if we must pivot then we do
    if( piv != i ) {
      swap_rows( pA , pB , piv , i ) ;
    }

    // perform gaussian elimination to obtain the upper triangular
    fac = _mm_div_pd( SSE2_CONJ( *( pA + i*(NC+1) ) ) , best ) ;
    for( j = NC-1 ; j > i ; j-- ) { // go up in other columns
      eliminate_column( pA , pB , fac , i , j ) ;
    }
  }

  // a is upper triangular, do the same for the upper half
  // no pivoting to be done here
  for( i = NC-1 ; i > 0 ; i-- ) {
    fac = SSE2_inverse( *( pA + i*(NC+1) ) ) ;
    for( j = 0 ; j < i ; j++ ) {
      eliminate_column( pA , pB , fac , i , j ) ;
    }
  }

  // multiply each row by its M_1 diagonal
  for( j = 0 ; j < NC ; j++ ) {
    m1 = SSE2_inverse( *pA ) ;
    for( i = 0 ; i < NC ; i++ ) {
      *pB = SSE2_MUL( *pB , m1 ) ;
      pB++ ;
    }
    pA += NC+1 ;
  }

  return GLU_SUCCESS ;
}
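// absfac() is not shown in this excerpt. A hypothetical definition consistent
// with its two uses above -- pivot-magnitude comparison, and the fact that
// conj(a_ii) divided by absfac(a_ii) must equal 1/a_ii -- is the squared
// modulus |re|^2 + |im|^2 broadcast to both lanes:
static inline __m128d absfac_sketch( const __m128d a )
{
  const __m128d sq = _mm_mul_pd( a , a ) ;
  return _mm_add_pd( sq , _mm_shuffle_pd( sq , sq , 1 ) ) ;
}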