void
gimp_composite_multiply_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx)
{
  const guchar *A = ctx->A;
  const guchar *B = ctx->B;
  guchar *D = ctx->D;
  guint length = ctx->n_pixels;
  vector unsigned char a, b, d, alpha_a, alpha_b, alpha;
  vector unsigned short al, ah;

  while (length >= 4)
    {
      a = LoadUnaligned (A);
      b = LoadUnaligned (B);

      al = vec_mule (a, b);
      al = vec_add (al, ox0080);
      ah = vec_mulo (a, b);
      ah = vec_add (ah, ox0080);
      al = vec_add (al, vec_sr (al, ox0008));
      ah = vec_add (ah, vec_sr (ah, ox0008));
      d = vec_perm ((vector unsigned char) al, (vector unsigned char) ah, combine_high_bytes);

      alpha_a = vec_and (a, alphamask);
      alpha_b = vec_and (b, alphamask);
      alpha = vec_min (alpha_a, alpha_b);

      d = vec_andc (d, alphamask);
      d = vec_or (d, alpha);

      StoreUnaligned (d, D);

      A += 16;
      B += 16;
      D += 16;
      length -= 4;
    }

  /* process last pixels */
  length = length * 4;
  a = LoadUnalignedLess (A, length);
  b = LoadUnalignedLess (B, length);

  al = vec_mule (a, b);
  al = vec_add (al, ox0080);
  ah = vec_mulo (a, b);
  ah = vec_add (ah, ox0080);
  al = vec_add (al, vec_sr (al, ox0008));
  ah = vec_add (ah, vec_sr (ah, ox0008));
  d = vec_perm ((vector unsigned char) al, (vector unsigned char) ah, combine_high_bytes);

  alpha_a = vec_and (a, alphamask);
  alpha_b = vec_and (b, alphamask);
  alpha = vec_min (alpha_a, alpha_b);

  d = vec_andc (d, alphamask);
  d = vec_or (d, alpha);

  StoreUnalignedLess (d, D, length);
}
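/* A scalar reference sketch (mine, not part of GIMP): the per-channel arithmetic the
   AltiVec loop above performs is t = a*b + 128; result = (t + (t >> 8)) >> 8, i.e. a
   rounded divide by 255, with the destination alpha taken as min(alpha_a, alpha_b).
   The helper name and the packed-RGBA8 layout assumption are illustrative only. */
static void
multiply_rgba8_scalar_ref (const unsigned char *A, const unsigned char *B,
                           unsigned char *D, unsigned int n_pixels)
{
  unsigned int i, c;

  for (i = 0; i < n_pixels; i++)
    {
      for (c = 0; c < 3; c++)
        {
          unsigned int t = A[4 * i + c] * B[4 * i + c] + 128;
          D[4 * i + c] = (unsigned char) ((t + (t >> 8)) >> 8);
        }
      D[4 * i + 3] = A[4 * i + 3] < B[4 * i + 3] ? A[4 * i + 3] : B[4 * i + 3];
    }
}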
vector signed char
test1_or (vector bool char x, vector signed char y)
{
  vector signed char *foo;
  *foo += vec_or (x, y);
  return *foo;
}
static inline vec_uint4 GENBX(vec_uint4 a, vec_uint4 b, vec_uint4 c)
{
    return vec_and(vec_or(vec_cmpgt(a, b),
                          vec_and(vec_cmpeq(a, b), c)),
                   vec_splat_u32(1));
}
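/* Scalar sketch (mine) of what GENBX computes per 32-bit lane: 1 when a > b, the low
   bit of c when a == b, and 0 otherwise -- the usual carry/borrow-propagation helper
   for chaining word-wise comparisons into a wider compare. */
static inline unsigned int genbx_scalar(unsigned int a, unsigned int b, unsigned int c)
{
    return (a > b) ? 1u : ((a == b) ? (c & 1u) : 0u);
}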
vector unsigned char
test6_or (vector unsigned char x, vector unsigned char y)
{
  vector unsigned char *foo;
  *foo += vec_or (x, y);
  return *foo;
}
void
foo (vector bool long long *vblr, vector double *vdr, vector unsigned long long *vullz,
     vector double *vdz, vector bool char *vbcz, vector signed char *vscz,
     vector unsigned char *vucz, vector bool int *vbiz, vector int *viz,
     vector unsigned int *vuiz, vector signed long long int *vslliz,
     vector bool short int *vbsiz, vector signed short int *vssiz,
     vector unsigned short int *vusiz, vector float *vfz)
{
  *vblr++ = vec_andc (vbla, vblb);
  *vdr++ = vec_double (vslla);
  *vdr++ = vec_double (vulla);
  *vblr++ = vec_mergeh (vbla, vblb);
  *vblr++ = vec_mergel (vbla, vblb);
  *vblr++ = vec_nor (vbla, vblb);
  *vblr++ = vec_or (vbla, vblb);
  *vblr++ = vec_sel (vbla, vblb, vblc);
  *vblr++ = vec_sel (vbla, vblb, vullc);
  *vblr++ = vec_xor (vbla, vblb);
  *vullz++ = vec_sel (vulla, vullb, vbllc);
  *vullz++ = vec_sel (vulla, vullb, vullc);
  *vdz++ = vec_sel (vda, vdb, vullc);
  *vbcz++ = vec_sel (vbca, vbcb, vbcc);
  *vbcz++ = vec_sel (vbca, vbcb, vucc);
  *vbcz++ = vec_xor (vbca, vbcb);
  *vscz++ = vec_sel (vsca, vscb, vbcc);
  *vscz++ = vec_sel (vsca, vscb, vucc);
  *vucz++ = vec_sel (vuca, vucb, vbcc);
  *vucz++ = vec_sel (vuca, vucb, vucc);
  *vbiz++ = vec_sel (vbia, vbib, vbic);
  *vbiz++ = vec_sel (vbia, vbib, vuic);
  *vbiz++ = vec_xor (vbia, vbib);
  *viz++ = vec_sel (vsia, vsib, vbic);
  *viz++ = vec_sel (vsia, vsib, vuic);
  *vuiz++ = vec_sel (vuia, vuib, vbic);
  *vuiz++ = vec_sel (vuia, vuib, vuic);
  *vslliz++ = vec_sel (vslla, vsllb, vbllc);
  *vslliz++ = vec_sel (vslla, vsllb, vullc);
  *vssiz++ = vec_sel (vssia, vssib, vbsic);
  *vssiz++ = vec_sel (vssia, vssib, vusic);
  *vusiz++ = vec_sel (vusia, vusib, vbsic);
  *vusiz++ = vec_sel (vusia, vusib, vusic);
  *vbsiz++ = vec_sel (vbsia, vbsib, vbsic);
  *vbsiz++ = vec_sel (vbsia, vbsib, vusic);
  *vbsiz++ = vec_xor (vbsia, vbsib);
  *vdz++ = vec_sel (vda, vdb, vbllc);
  *vfz++ = vec_sel (vfa, vfb, vbic);
  *vfz++ = vec_sel (vfa, vfb, vuic);
  *vfz++ = vec_xor (vfa, vfb);
}
// out: o = |x-y| < a
static inline vec_u8_t diff_lt_altivec( register vec_u8_t x, register vec_u8_t y, register vec_u8_t a )
{
    register vec_u8_t diff = vec_subs(x, y);
    register vec_u8_t diffneg = vec_subs(y, x);
    register vec_u8_t o = vec_or(diff, diffneg); /* |x-y| */
    o = (vec_u8_t)vec_cmplt(o, a);
    return o;
}
// out: o = |x-y| < a
static inline vector unsigned char diff_lt_altivec (register vector unsigned char x,
                                                    register vector unsigned char y,
                                                    register vector unsigned char a)
{
    register vector unsigned char diff = vec_subs(x, y);
    register vector unsigned char diffneg = vec_subs(y, x);
    register vector unsigned char o = vec_or(diff, diffneg); /* |x-y| */
    o = (vector unsigned char)vec_cmplt(o, a);
    return o;
}
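/* Scalar sketch (mine) of the trick used by both diff_lt_altivec variants above:
   for unsigned bytes, saturating x-y ORed with saturating y-x equals |x-y|, and the
   comparison then yields a 0x00/0xFF byte mask just like vec_cmplt. */
static inline unsigned char diff_lt_scalar(unsigned char x, unsigned char y, unsigned char a)
{
    unsigned char d = (unsigned char)(x > y ? x - y : y - x);  /* |x-y| */
    return (unsigned char)(d < a ? 0xFF : 0x00);
}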
__SIMDd _SIMD_or_pd(__SIMDd a, __SIMDd b)
{
#ifdef USE_SSE
  return _mm_or_pd(a,b);
#elif defined USE_AVX
  return _mm256_or_pd(a,b);
#elif defined USE_IBM
  return vec_or(a,b);
#endif
}
void TurnJavaModeOff(void)
{
  /* Non-Java (NJ) mode bit of the VSCR, splatted to every element */
  vector unsigned int javaOffMask = { 0x00010000, 0x00010000, 0x00010000, 0x00010000 };
  vector unsigned int java;

  gOldJavaMode = (vector unsigned int) vec_mfvscr();
  java = vec_or(gOldJavaMode, javaOffMask);
  vec_mtvscr(java);
}
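/* Sketch (mine): the matching restore step, assuming gOldJavaMode still holds the
   VSCR value saved by TurnJavaModeOff above. */
void RestoreJavaMode(void)
{
  vec_mtvscr(gOldJavaMode);
}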
template <bool align> v128_u8 LbpEstimate(const uint8_t * src, ptrdiff_t stride)
{
    v128_u8 threshold = Load<false>(src);
    v128_u8 lbp = K8_00;
    lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<align>(src - 1 - stride), threshold), K8_01));
    lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<false>(src - stride), threshold), K8_02));
    lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<false>(src + 1 - stride), threshold), K8_04));
    lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<false>(src + 1), threshold), K8_08));
    lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<false>(src + 1 + stride), threshold), K8_10));
    lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<false>(src + stride), threshold), K8_20));
    lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<align>(src - 1 + stride), threshold), K8_40));
    lbp = vec_or(lbp, vec_and(GreaterOrEqual(Load<align>(src - 1), threshold), K8_80));
    return lbp;
}
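/* Scalar sketch (mine) of the local binary pattern built above: each of the eight
   neighbours sets one bit when it is >= the centre pixel, using the same bit
   assignment as the K8_01..K8_80 constants. */
static unsigned char LbpEstimateScalar(const unsigned char * src, long stride)
{
    unsigned char centre = src[0], lbp = 0;
    lbp |= (src[-1 - stride] >= centre) ? 0x01 : 0;
    lbp |= (src[    -stride] >= centre) ? 0x02 : 0;
    lbp |= (src[ 1 - stride] >= centre) ? 0x04 : 0;
    lbp |= (src[ 1         ] >= centre) ? 0x08 : 0;
    lbp |= (src[ 1 + stride] >= centre) ? 0x10 : 0;
    lbp |= (src[     stride] >= centre) ? 0x20 : 0;
    lbp |= (src[-1 + stride] >= centre) ? 0x40 : 0;
    lbp |= (src[-1         ] >= centre) ? 0x80 : 0;
    return lbp;
}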
void
foo (vector bool long long *vblr, vector double *vdr)
{
  *vblr++ = vec_andc (vbla, vblb);
  *vdr++ = vec_double (vsla);
  *vdr++ = vec_double (vula);
  *vblr++ = vec_mergeh (vbla, vblb);
  *vblr++ = vec_mergel (vbla, vblb);
  *vblr++ = vec_nor (vbla, vblb);
  *vblr++ = vec_or (vbla, vblb);
  *vblr++ = vec_sel (vbla, vblb, vblc);
  *vblr++ = vec_sel (vbla, vblb, vulc);
  *vblr++ = vec_xor (vbla, vblb);
}
int main ()
{
  vector float fa = {1.0, 2.0, 3.0, -4.0};
  vector float fb = {-2.0, -3.0, -4.0, -5.0};
  vector float fc = vec_cpsgn (fa, fb);
  vector long long la = {5L, 14L};
  vector long long lb = {3L, 86L};
  vector long long lc = vec_and (la, lb);
  vector bool long long ld = {0, -1};
  vector long long le = vec_and (la, ld);
  vector long long lf = vec_and (ld, lb);
  vector unsigned long long ua = {5L, 14L};
  vector unsigned long long ub = {3L, 86L};
  vector unsigned long long uc = vec_and (ua, ub);
  vector bool long long ud = {0, -1};
  vector unsigned long long ue = vec_and (ua, ud);
  vector unsigned long long uf = vec_and (ud, ub);
  vector long long lg = vec_andc (la, lb);
  vector long long lh = vec_andc (la, ld);
  vector long long li = vec_andc (ld, lb);
  vector unsigned long long ug = vec_andc (ua, ub);
  vector unsigned long long uh = vec_andc (ua, ud);
  vector unsigned long long ui = vec_andc (ud, ub);
  vector double da = {1.0, -4.0};
  vector double db = {-2.0, 5.0};
  vector double dc = vec_cpsgn (da, db);
  vector long long lj = vec_mergeh (la, lb);
  vector long long lk = vec_mergeh (la, ld);
  vector long long ll = vec_mergeh (ld, la);
  vector unsigned long long uj = vec_mergeh (ua, ub);
  vector unsigned long long uk = vec_mergeh (ua, ud);
  vector unsigned long long ul = vec_mergeh (ud, ua);
  vector long long lm = vec_mergel (la, lb);
  vector long long ln = vec_mergel (la, ld);
  vector long long lo = vec_mergel (ld, la);
  vector unsigned long long um = vec_mergel (ua, ub);
  vector unsigned long long un = vec_mergel (ua, ud);
  vector unsigned long long uo = vec_mergel (ud, ua);
  vector long long lp = vec_nor (la, lb);
  vector long long lq = vec_nor (la, ld);
  vector long long lr = vec_nor (ld, la);
  vector unsigned long long up = vec_nor (ua, ub);
  vector unsigned long long uq = vec_nor (ua, ud);
  vector unsigned long long ur = vec_nor (ud, ua);
  vector long long ls = vec_or (la, lb);
  vector long long lt = vec_or (la, ld);
  vector long long lu = vec_or (ld, la);
  vector unsigned long long us = vec_or (ua, ub);
  vector unsigned long long ut = vec_or (ua, ud);
  vector unsigned long long uu = vec_or (ud, ua);
  vector unsigned char ca = {0,4,8,1,5,9,2,6,10,3,7,11,15,12,14,13};
  vector long long lv = vec_perm (la, lb, ca);
  vector unsigned long long uv = vec_perm (ua, ub, ca);
  vector long long lw = vec_sel (la, lb, lc);
  vector long long lx = vec_sel (la, lb, uc);
  vector long long ly = vec_sel (la, lb, ld);
  vector unsigned long long uw = vec_sel (ua, ub, lc);
  vector unsigned long long ux = vec_sel (ua, ub, uc);
  vector unsigned long long uy = vec_sel (ua, ub, ld);
  vector long long lz = vec_xor (la, lb);
  vector long long l0 = vec_xor (la, ld);
  vector long long l1 = vec_xor (ld, la);
  vector unsigned long long uz = vec_xor (ua, ub);
  vector unsigned long long u0 = vec_xor (ua, ud);
  vector unsigned long long u1 = vec_xor (ud, ua);
  int ia = vec_all_eq (ua, ub);
  int ib = vec_all_ge (ua, ub);
  int ic = vec_all_gt (ua, ub);
  int id = vec_all_le (ua, ub);
  int ie = vec_all_lt (ua, ub);
  int ig = vec_all_ne (ua, ub);
  int ih = vec_any_eq (ua, ub);
  int ii = vec_any_ge (ua, ub);
  int ij = vec_any_gt (ua, ub);
  int ik = vec_any_le (ua, ub);
  int il = vec_any_lt (ua, ub);
  int im = vec_any_ne (ua, ub);
  vector int sia = {9, 16, 25, 36};
  vector int sib = {-8, -27, -64, -125};
  vector int sic = vec_mergee (sia, sib);
  vector int sid = vec_mergeo (sia, sib);
  vector unsigned int uia = {9, 16, 25, 36};
  vector unsigned int uib = {8, 27, 64, 125};
  vector unsigned int uic = vec_mergee (uia, uib);
  vector unsigned int uid = vec_mergeo (uia, uib);
  vector bool int bia = {0, -1, -1, 0};
  vector bool int bib = {-1, -1, 0, -1};
  vector bool int bic = vec_mergee (bia, bib);
  vector bool int bid = vec_mergeo (bia, bib);
  vector unsigned int uie = vec_packsu (ua, ub);
  vector long long l2 = vec_cntlz (la);
  vector unsigned long long u2 = vec_cntlz (ua);
  vector int sie = vec_cntlz (sia);
  vector unsigned int uif = vec_cntlz (uia);
  vector short ssa = {20, -40, -60, 80, 100, -120, -140, 160};
  vector short ssb = vec_cntlz (ssa);
  vector unsigned short usa = {81, 72, 63, 54, 45, 36, 27, 18};
  vector unsigned short usb = vec_cntlz (usa);
  vector signed char sca = {-4, 3, -9, 15, -31, 31, 0, 0, 1, 117, -36, 99, 98, 97, 96, 95};
  vector signed char scb = vec_cntlz (sca);
  vector unsigned char cb = vec_cntlz (ca);
  vector double dd = vec_xl (0, &y);
  vec_xst (dd, 0, &z);
  vector double de = vec_round (dd);
  vector double df = vec_splat (de, 0);
  vector double dg = vec_splat (de, 1);
  vector long long l3 = vec_splat (l2, 0);
  vector long long l4 = vec_splat (l2, 1);
  vector unsigned long long u3 = vec_splat (u2, 0);
  vector unsigned long long u4 = vec_splat (u2, 1);
  vector bool long long l5 = vec_splat (ld, 0);
  vector bool long long l6 = vec_splat (ld, 1);
  vector long long l7 = vec_div (l3, l4);
  vector unsigned long long u5 = vec_div (u3, u4);
  vector long long l8 = vec_mul (l3, l4);
  vector unsigned long long u6 = vec_mul (u3, u4);
  vector double dh = vec_ctf (la, -2);
  vector double di = vec_ctf (ua, 2);
  vector long long l9 = vec_cts (dh, -2);
  vector unsigned long long u7 = vec_ctu (di, 2);
  return 0;
}
void test1() { // CHECK-LABEL: define void @test1 // CHECK-LE-LABEL: define void @test1 res_vf = vec_abs(vf); // CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_add(vd, vd); // CHECK: fadd <2 x double> // CHECK-LE: fadd <2 x double> res_vd = vec_and(vbll, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vbll); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vbll, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_andc(vd, vbll); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> // CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_ceil(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) res_vf = vec_ceil(vf); // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) res_vbll = vec_cmpeq(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpeq(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpge(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpge(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) 
res_vbll = vec_cmpgt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpgt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmple(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmple(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmplt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) // CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmplt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) // CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) /* vec_cpsgn */ res_vf = vec_cpsgn(vf, vf); // CHECK: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) // CHECK-LE: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}}) res_vd = vec_cpsgn(vd, vd); // CHECK: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) // CHECK-LE: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}}) /* vec_div */ res_vsll = vec_div(vsll, vsll); // CHECK: sdiv <2 x i64> // CHECK-LE: sdiv <2 x i64> res_vull = vec_div(vull, vull); // CHECK: udiv <2 x i64> // CHECK-LE: udiv <2 x i64> res_vf = vec_div(vf, vf); // CHECK: fdiv <4 x float> // CHECK-LE: fdiv <4 x float> res_vd = vec_div(vd, vd); // CHECK: fdiv <2 x double> // CHECK-LE: fdiv <2 x double> /* vec_max */ res_vf = vec_max(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp res_vd = vec_max(vd, vd); // CHECK: @llvm.ppc.vsx.xvmaxdp // CHECK-LE: @llvm.ppc.vsx.xvmaxdp res_vf = vec_vmaxfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp // CHECK-LE: @llvm.ppc.vsx.xvmaxsp /* vec_min */ res_vf = vec_min(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_vd = vec_min(vd, vd); // CHECK: @llvm.ppc.vsx.xvmindp // CHECK-LE: @llvm.ppc.vsx.xvmindp res_vf = vec_vminfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp // CHECK-LE: @llvm.ppc.vsx.xvminsp res_d = __builtin_vsx_xsmaxdp(d, d); // CHECK: @llvm.ppc.vsx.xsmaxdp // CHECK-LE: @llvm.ppc.vsx.xsmaxdp res_d = __builtin_vsx_xsmindp(d, d); // CHECK: @llvm.ppc.vsx.xsmindp // CHECK-LE: @llvm.ppc.vsx.xsmindp /* vec_perm */ res_vsll = vec_perm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_perm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbll = vec_perm(vbll, vbll, vuc); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> 
{{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vf = vec_round(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> res_vd = vec_round(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> res_vd = vec_perm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_splat(vd, 1); // CHECK: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vbll = vec_splat(vbll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsll = vec_splat(vsll, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vull = vec_splat(vull, 1); // CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> // CHECK-LE: xor <16 x i8> // CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32> // CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8> res_vsi = vec_pack(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vui = vec_pack(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vbi = vec_pack(vbll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_vperm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_vperm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vd = vec_vperm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_vsx_ld */ res_vsi = vec_vsx_ld(0, &vsi); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vui = vec_vsx_ld(0, &vui); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vf = vec_vsx_ld (0, &vf); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsll = vec_vsx_ld(0, &vsll); 
// CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x // CHECK-LE: @llvm.ppc.vsx.lxvd2x res_vss = vec_vsx_ld(0, &vss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vss = vec_vsx_ld(0, &ss); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &vus); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vus = vec_vsx_ld(0, &us); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vbc = vec_vsx_ld(0, &vbc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &vsc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &vuc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vsc = vec_vsx_ld(0, &sc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x res_vuc = vec_vsx_ld(0, &uc); // CHECK: @llvm.ppc.vsx.lxvw4x // CHECK-LE: @llvm.ppc.vsx.lxvw4x /* vec_vsx_st */ vec_vsx_st(vsi, 0, &res_vsi); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsi, 0, &res_si); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_vui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_ui); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vf, 0, &res_vf); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsll, 0, &res_vsll); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vull, 0, &res_vull); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vd, 0, &res_vd); // CHECK: @llvm.ppc.vsx.stxvd2x // CHECK-LE: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vss, 0, &res_vss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vss, 0, &res_ss); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_vus); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vus, 0, &res_us); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_vsc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_vuc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vuc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_vbc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_sc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vbc, 0, &res_uc); // CHECK: @llvm.ppc.vsx.stxvw4x // CHECK-LE: @llvm.ppc.vsx.stxvw4x /* vec_and */ res_vsll = vec_and(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_and(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vbll, vull); // CHECK: and 
<2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_and(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_and(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_vand */ res_vsll = vec_vand(vsll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vbll, vsll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_vand(vsll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vbll, vull); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_vand(vull, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_vand(vbll, vbll); // CHECK: and <2 x i64> // CHECK-LE: and <2 x i64> /* vec_andc */ res_vsll = vec_andc(vsll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vbll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vsll = vec_andc(vsll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vbll, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vull = vec_andc(vull, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vbll = vec_andc(vbll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> // CHECK-LE: xor <2 x i64> // CHECK-LE: and <2 x i64> res_vf = vec_floor(vf); // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_floor(vd); // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_madd(vf, vf, vf); // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) res_vd = vec_madd(vd, vd, vd); // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) /* vec_mergeh */ res_vsll = vec_mergeh(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergeh(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergeh(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_mergel */ res_vsll = vec_mergel(vsll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = vec_mergel(vsll, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vsll = 
vec_mergel(vbll, vsll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vull, vbll); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm res_vull = vec_mergel(vbll, vull); // CHECK: @llvm.ppc.altivec.vperm // CHECK-LE: @llvm.ppc.altivec.vperm /* vec_msub */ res_vf = vec_msub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> res_vd = vec_msub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> res_vsll = vec_mul(vsll, vsll); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vull = vec_mul(vull, vull); // CHECK: mul <2 x i64> // CHECK-LE: mul <2 x i64> res_vf = vec_mul(vf, vf); // CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_mul(vd, vd); // CHECK: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_nearbyint(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_nearbyint(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_nmadd(vf, vf, vf); // CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] res_vd = vec_nmadd(vd, vd, vd); // CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] res_vf = vec_nmsub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float 
-0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} res_vd = vec_nmsub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] // CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] /* vec_nor */ res_vsll = vec_nor(vsll, vsll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vull, vull); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_nor(vbll, vbll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> // CHECK-LE: or <2 x i64> // CHECK-LE: xor <2 x i64> res_vd = vec_nor(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> /* vec_or */ res_vsll = vec_or(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_or(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_or(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_or(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vd = vec_or(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_or(vbll, vd); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]] // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vd = vec_or(vd, vbll); // CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[T2]] to <2 x double> // CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK-LE: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double> res_vf = vec_re(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float> // CHECK-LE: call <4 x float> 
@llvm.ppc.vsx.xvresp(<4 x float> res_vd = vec_re(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double> res_vf = vec_rint(vf); // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_rint(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_rsqrte(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) res_vd = vec_rsqrte(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vf = vec_sel(vd, vd, vbll); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_sel(vd, vd, vull); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> // CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK-LE: and <2 x i64> %{{[0-9]+}}, // CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: or <2 x i64> // CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> res_vf = vec_sqrt(vf); // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_sqrt(vd); // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) res_vd = vec_sub(vd, vd); // CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_trunc(vf); // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) // CHECK-LE: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_trunc(vd); // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) // CHECK-LE: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) /* vec_vor */ res_vsll = vec_vor(vsll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vbll, vsll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vsll = vec_vor(vsll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vbll, vull); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vull = vec_vor(vull, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> res_vbll = vec_vor(vbll, vbll); // CHECK: or <2 x i64> // CHECK-LE: or <2 x i64> /* vec_xor */ res_vsll = vec_xor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vbll, vsll); // CHECK: xor <2 x i64> // 
CHECK-LE: xor <2 x i64> res_vsll = vec_xor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_xor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_xor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vd, vbll); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() // CHECK-LE: call void @dummy() res_vd = vec_xor(vbll, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> // CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double> /* vec_vxor */ res_vsll = vec_vxor(vsll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vbll, vsll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_vxor(vsll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vbll, vull); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vull = vec_vxor(vull, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vbll = vec_vxor(vbll, vbll); // CHECK: xor <2 x i64> // CHECK-LE: xor <2 x i64> res_vsll = vec_cts(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_cts(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptosi <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 0); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vsll = vec_ctu(vd, 31); // CHECK: fmul <2 x double> // CHECK: fptoui <2 x double> %{{.*}} to <2 x i64> // CHECK-LE: fmul <2 x double> // CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64> res_vd = vec_ctf(vsll, 0); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vsll, 31); // CHECK: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 0); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x double> res_vd = vec_ctf(vull, 31); // CHECK: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK: fmul <2 x double> // CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double> // CHECK-LE: fmul <2 x 
double> }
void iquant_intra_m1_altivec(IQUANT_INTRA_PDECL) { int i; vector signed short vsrc; uint16_t *qmat; vector unsigned short vqmat; vector unsigned short vmquant; vector bool short eqzero, ltzero; vector signed short val, t0; vector signed short zero, one; vector unsigned int four; vector signed short min, max; int offset, offset2; int16_t dst0; union { vector unsigned short vu16; unsigned short mquant; vector signed int vs32; struct { signed int pad[3]; signed int sum; } s; } vu; #ifdef ALTIVEC_DST DataStreamControl dsc; #endif #ifdef ALTIVEC_VERIFY /* {{{ */ if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat)) mjpeg_error_exit1("iquant_intra_m1: wsp->intra_q_mat %% 16 != 0, (%d)", wsp->intra_q_mat); if (NOT_VECTOR_ALIGNED(src)) mjpeg_error_exit1("iquant_intra_m1: src %% 16 != 0, (%d)", src); if (NOT_VECTOR_ALIGNED(dst)) mjpeg_error_exit1("iquant_intra_m1: dst %% 16 != 0, (%d)", dst); for (i = 0; i < 64; i++) if (src[i] < -256 || src[i] > 255) mjpeg_error_exit1("iquant_intra_m2: -256 > src[%i] > 255, (%d)", i, src[i]); #endif /* }}} */ AMBER_START; dst0 = src[0] << (3 - dc_prec); qmat = (uint16_t*)wsp->intra_q_mat; #ifdef ALTIVEC_DST dsc.control = DATA_STREAM_CONTROL(64/8,1,0); vec_dst(src, dsc.control, 0); vec_dst(qmat, dsc.control, 1); #endif /* vmquant = (vector unsigned short)(mquant); */ vu.mquant = (unsigned short)mquant; vmquant = vec_splat(vu.vu16, 0); zero = vec_splat_s16(0); one = vec_splat_s16(1); four = vec_splat_u32(4); /* max = (2047); min = (-2048); {{{ */ vu8(max) = vec_splat_u8(0x7); t0 = vec_splat_s16(-1); /* 0xffff */ vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */ min = vec_sub(t0, max); /* }}} */ offset = 0; #if 1 vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); i = (64/8) - 1; do { /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); offset2 = offset; offset += 8*sizeof(int16_t); vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset2, dst); } while (--i); /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? 
-2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset, dst); #else /* {{{ */ i = (64/8); do { vsrc = vec_ld(offset, (signed short*)src); vqmat = vec_ld(offset, (unsigned short*)qmat); /* intra_q[i] * mquant */ vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant)); /* save sign */ ltzero = vec_cmplt(vsrc, zero); eqzero = vec_cmpeq(vsrc, zero); /* val = abs(src) */ t0 = vec_sub(zero, vsrc); val = vec_max(t0, vsrc); /* val = (src * quant) >> 4 */ vs32(t0) = vec_mule(val, vs16(vqmat)); vs32(val) = vec_mulo(val, vs16(vqmat)); vs32(t0) = vec_sra(vs32(t0), four); vs16(t0) = vec_pack(vs32(t0), vs32(t0)); vs32(val) = vec_sra(vs32(val), four); vs16(val) = vec_pack(vs32(val), vs32(val)); val = vec_mergeh(vs16(t0), vs16(val)); /* val = val - 1&~(val|val==0) */ t0 = vec_or(val, eqzero); t0 = vec_andc(one, t0); val = vec_sub(val, t0); /* restore sign */ t0 = vec_sub(zero, val); val = vec_sel(val, t0, ltzero); /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */ val = vec_min(val, max); val = vec_max(val, min); vec_st(val, offset, dst); offset += 8*sizeof(int16_t); } while (--i); /* }}} */ #endif dst[0] = dst0; AMBER_STOP; }
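/* Scalar transcription (mine) of the per-coefficient work done by the AltiVec loops
   above: scale the absolute value by intra_q[i] * mquant, divide by 16, force
   non-zero even results odd (mismatch control), restore the sign and clamp to the
   12-bit range [-2048, 2047].  Names and scalar types are illustrative only. */
static short iquant_intra_m1_coeff(short src, unsigned short qmat, unsigned short mquant)
{
  int aval = (src < 0) ? -src : src;
  int val  = (aval * qmat * mquant) >> 4;

  if ((val & 1) == 0 && src != 0)
    val -= 1;                     /* oddification / mismatch control */

  if (src < 0)
    val = -val;

  if (val > 2047)  val = 2047;
  if (val < -2048) val = -2048;

  return (short) val;
}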
int main(int argc, char **argv) { time_t startTime = time(NULL); // setup, assign particles initla positions and masses // this is done in scalar fashion, NOT SIMD // insignificant to performance since it's only done once struct timeval start; gettimeofday(&start,NULL); //seed random generator srand( time(NULL) ); printf("\n\n\n~~~~~~~~Printing out particles and their randomly assigned positions: \n\n"); int pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { int grideSize = GRID_SIZE; // printf("\n grideSize/2: %d", grideSize/2); float xPos = (float)( rand() % grideSize - grideSize/2); float yPos = (float)( rand() % grideSize - grideSize/2); float zPos = (float)( rand() % grideSize - grideSize/2); particle_Array[pC].position[0] = xPos; particle_Array[pC].position[1] = yPos; particle_Array[pC].position[2] = zPos; particle_Array[pC].velocity[3] = PARTICLES_DEFAULTMASS; //particle_Array[pC].position = vec_splat(particle_Array[pC].position, 1); //particle_Array[pC].position = vec_splats((float)GRAVITATIONALCONSTANT); --> use splats, seems faster printf("Particle %d: ", pC ); printf("x= %f, y=%f, z=%f", particle_Array[pC].position[0], particle_Array[pC].position[1], particle_Array[pC].position[2]); printf("\n"); } ///main loop // temp particle Datas used for calculations, not pointers, purposefully passed by value particle_Data pDi; particle_Data pDj; //temp vectors used for calculations in loop __vector float tempAcceleration = {0,0,0,0}; __vector float tempVelocity = {0,0,0,0}; __vector float tempDistance = {0,0,0,0}; //--> use 4th element to store radius __vector float tempDistanceRL1 = {0,0,0,0}; __vector float tempDistanceRL2 = {0,0,0,0}; __vector float tempNumerator = {0,0,0,0}; __vector float tempMassSplat = {0,0,0,0}; __vector float tempGConstant = {GRAVITATIONALCONSTANT,GRAVITATIONALCONSTANT,GRAVITATIONALCONSTANT,GRAVITATIONALCONSTANT }; __vector float tempDELATTIME = {DELTA_TIME, DELTA_TIME, DELTA_TIME, DELTA_TIME}; __vector float tempEPS= {EPS, EPS, EPS, EPS}; __vector float zeroVector = {0,0,0,0}; __vector unsigned int oneVector = {1,1,1,1}; __vector unsigned int axisBitShiftMask = {0,1,2,0}; __vector unsigned char yzxwMask = { 4,5,6,7, 8,9,10,11, 0,1,2,3, 12,13,14,15}; __vector unsigned char zxywMask = { 8,9,10,11, 0,1,2,3, 4,5,6,7, 12,13,14,15}; __vector unsigned short resetOctantCount = {0,0,0,0,0,0,0}; __vector unsigned short increment = {1,1,1,1,1,1,1,1}; __vector float tempUnitVector = {0,0,0,0}; __vector float distanceVector = {0,0,0,0}; //stupid C99, need to declare indicies before for loops int i = 0; int j = 0; int it_counter = 0; printf("\n^^^^^^^ Now starting main loop\n\n\n"); for(it_counter = 0; it_counter < ITERATION_COUNT; ++it_counter) { octantCount = resetOctantCount; // printf("\nIteration: %d\n",it_counter ); // this first loop is to calculate the forces/accelerations // NOTE ---> NO FORCES ARE APPLIED IN THIS LOOP, NO POSITIONS WILL BE CHANGED. // The calculated accelerations will be used to increment the particles velocity vector, NOT POSITION for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { //cache the particle data struct to the temp declared outside the loops pDi = particle_Array[i]; for(j = 0; j<PARTICLES_MAXCOUNT; ++j) { //for every particle i, calculate for all j's // get resultant total velocity, don't apply it in these loops, // apply velocities for all bodies at the same time, in seperate loop at the end. 
//cache the particle data struct to the temp declared outside the loops pDj = particle_Array[j]; // Formula being used --> a = (G * m )/(r^2) tempDistance = vec_sub(pDj.position,pDi.position); //actual distance vector between objects i and j // save value for unit vector calculation later distanceVector = tempDistance; /* //Print distances between particles printf("Particle %d: ", i ); printf("x= %f, y=%f, z=%f", tempDistance[0], tempDistance[1], tempDistance[2]); printf("\n"); */ //use the distance vector right now for numerator, before we overwrite is later in the code // use mass of subject mass tempMassSplat = vec_splats((float)pDi.velocity[3]); //mass is stored in the last element (3) of velocity vector tempNumerator = vec_madd(tempMassSplat, tempGConstant, zeroVector); /* //Print numerator printf("Numerator %d: ", i ); printf("x= %f, y=%f, z=%f", tempNumerator[0], tempNumerator[1], tempNumerator[2]); printf("\n"); */ //Assembly for vector rotate //__asm__("addi 4,4,1;"); // denominator part // sqaure each component, x,y,z beforehand tempDistance = vec_madd(tempDistance, tempDistance, zeroVector); //using perm instead of rotate, bleurg tempDistanceRL1 = vec_perm(tempDistance, zeroVector, yzxwMask); // imitates lxfloat left rotate tempDistanceRL2 = vec_perm(tempDistance, zeroVector, zxywMask); // imitates 2xfloat left rotate //add both tempDistanceRL1 = vec_add(tempDistanceRL1, tempDistanceRL2); //add to original to get total ---> x+y+z tempDistance = vec_add(tempDistance, tempDistanceRL1); //tempDistance is now total distance squared // add EPS to avoid singularity tempDistance = vec_add(tempDistance, tempEPS); //this is now the denominator value //save inverse magnitude for unit vector later tempUnitVector = vec_rsqrte(tempDistance); // invert vector to avoid division later tempDistance = vec_re(tempDistance); // this is final denominator (already inverted), only need to multiply // tempDistance is now eqivalent to 1/r^2 /* //Print denominator printf("Denominator %d: ", i ); printf("x= %f, y=%f, z=%f", tempDistance[0], tempDistance[1], tempDistance[2]); printf("\n"); */ //total acceleration applied to particle i, by particle j tempAcceleration = vec_madd(tempDistance, tempNumerator, zeroVector); // create unit vector tempUnitVector = vec_madd(distanceVector, tempUnitVector, zeroVector); // apply unit vector to acceleration tempAcceleration = vec_madd(tempUnitVector, tempAcceleration, zeroVector); //increment velocity value of particle with a*dt // need to explicitly call the array, since pDi is only a temp pass by value, doesn't change the particle particle_Array[i].velocity = vec_madd(tempAcceleration, tempDELATTIME, particle_Array[i].velocity); /* //Print velocity printf("Velocity %d: ", i ); printf("x= %f, y=%f, z=%f", pDi.velocity[0], pDi.velocity[1], pDi.velocity[2]); printf("\n"); */ /* printf("Particle %d: ", i ); printf("x= %f, y=%f, z=%f", pDi.velocity[0], pDi.velocity[1], pDi.velocity[2]); printf("\n"); */ //end of this loop } //printf("\n"); } //now that all the accelerations for all particles are calculated, //apply them and update velocity for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { //incrementing position with v*dt // vec_madd is awesome, it all gets done in one line! 
emulated the += operator, kinda, but more flexible particle_Array[i].position = vec_madd(particle_Array[i].velocity, tempDELATTIME, particle_Array[i].position); /* printf("Particle %d positions: ", i ); printf("x= %f, y=%f, z=%f", particle_Array[i].position[0], particle_Array[i].position[1], particle_Array[i].position[2]); printf("\n"); */ ///// ALL CODE BELOW THIS SHOULD ONLY BE RUN ON PPU \\\\\\\\\\\\\\\\\\ /////////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes // compare with zero vector to get on which side of each axis the particle is // 0 is negative, 1 is positive side of the axis __vector bool int axisDirection = vec_cmpgt(particle_Array[i].position, zeroVector); // need to manually set, can't cast due to size difference error __vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0], (unsigned int)axisDirection[1], (unsigned int)axisDirection[2], 0}; // need to do this to revert 1s into NON 2s complement form --> vec_cmgt doc LIES shiftedAxis = vec_andc(oneVector, shiftedAxis); /* printf("Particle %d axis sign: ", i ); printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]); printf("\n"); */ // shift 3 axies simultaneously (actually only 2, 1 stays in origina positon //, with intent to OR them later shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector __vector unsigned int axis_Y = vec_splats(shiftedAxis[1]); __vector unsigned int axis_Z = vec_splats(shiftedAxis[2]); // merge shhifted x y z values by OR-ing // this gives the octant id, range from 0-7 (000 to 111 in binary) shiftedAxis = vec_or(shiftedAxis, axis_Y); shiftedAxis = vec_or(shiftedAxis, axis_Z); // insert octant value into last slot of position vector of particle particle_Array[i].position[3] = (float)shiftedAxis[0]; //printf("Oct ID: %d \n", shiftedAxis[0]); /////// Update octant vector by incrementing octant that the particle is in // The only possible non SIMD line in the entire program, //irreleant since quadrant counting should occur on PPU anyways octantCount[shiftedAxis[0]] ++ ; } //end of main loop /* printf("End of iteration %d ---> ",it_counter ); printf("Particle disttribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); */ } /* printf("\n"); for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { printf("Particle %d final position: ", i ); printf("x= %f, y=%f, z=%f", particle_Array[i].position[0], particle_Array[i].position[1], particle_Array[i].position[2]); printf("\n"); printf("End of iteration %d ---> ",it_counter ); printf("Particle disttribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); } */ printf("Particle disttribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); struct timeval end; gettimeofday(&end,NULL); float deltaTime = ((end.tv_sec - start.tv_sec)*1000.0f + (end.tv_usec -start.tv_usec)/1000.0f); printf("Execution time: %f\n",deltaTime); return 0; }
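/* Scalar sketch (mine) of the octant classification done above with vec_cmpgt,
   vec_andc, vec_sl and vec_or: each axis contributes one bit, set when the
   coordinate is <= 0 (the vec_andc step inverts the compare mask), giving an
   octant id in the range 0..7. */
static int octant_id(float x, float y, float z)
{
    return (x > 0.0f ? 0 : 1)
         | (y > 0.0f ? 0 : 2)
         | (z > 0.0f ? 0 : 4);
}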
void lp_rast_triangle_32_3_16(struct lp_rasterizer_task *task, const union lp_rast_cmd_arg arg) { const struct lp_rast_triangle *tri = arg.triangle.tri; const struct lp_rast_plane *plane = GET_PLANES(tri); int x = (arg.triangle.plane_mask & 0xff) + task->x; int y = (arg.triangle.plane_mask >> 8) + task->y; unsigned i, j; struct { unsigned mask:16; unsigned i:8; unsigned j:8; } out[16]; unsigned nr = 0; __m128i p0 = lp_plane_to_m128i(&plane[0]); /* c, dcdx, dcdy, eo */ __m128i p1 = lp_plane_to_m128i(&plane[1]); /* c, dcdx, dcdy, eo */ __m128i p2 = lp_plane_to_m128i(&plane[2]); /* c, dcdx, dcdy, eo */ __m128i zero = vec_splats((unsigned char) 0); __m128i c; __m128i dcdx; __m128i dcdy; __m128i rej4; __m128i dcdx2; __m128i dcdx3; __m128i span_0; /* 0,dcdx,2dcdx,3dcdx for plane 0 */ __m128i span_1; /* 0,dcdx,2dcdx,3dcdx for plane 1 */ __m128i span_2; /* 0,dcdx,2dcdx,3dcdx for plane 2 */ __m128i unused; __m128i vshuf_mask0; __m128i vshuf_mask1; __m128i vshuf_mask2; #ifdef PIPE_ARCH_LITTLE_ENDIAN vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x03020100); vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x07060504); vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x0B0A0908); #else vshuf_mask0 = (__m128i) vec_splats((unsigned int) 0x0C0D0E0F); vshuf_mask1 = (__m128i) vec_splats((unsigned int) 0x08090A0B); vshuf_mask2 = (__m128i) vec_splats((unsigned int) 0x04050607); #endif transpose4_epi32(&p0, &p1, &p2, &zero, &c, &dcdx, &dcdy, &rej4); /* Adjust dcdx; */ dcdx = vec_sub_epi32(zero, dcdx); c = vec_add_epi32(c, vec_mullo_epi32(dcdx, (__m128i) vec_splats(x))); c = vec_add_epi32(c, vec_mullo_epi32(dcdy, (__m128i) vec_splats(y))); rej4 = vec_slli_epi32(rej4, 2); /* * Adjust so we can just check the sign bit (< 0 comparison), * instead of having to do a less efficient <= 0 comparison */ c = vec_sub_epi32(c, (__m128i) vec_splats((unsigned int) 1)); rej4 = vec_add_epi32(rej4, (__m128i) vec_splats((unsigned int) 1)); dcdx2 = vec_add_epi32(dcdx, dcdx); dcdx3 = vec_add_epi32(dcdx2, dcdx); transpose4_epi32(&zero, &dcdx, &dcdx2, &dcdx3, &span_0, &span_1, &span_2, &unused); for (i = 0; i < 4; i++) { __m128i cx = c; for (j = 0; j < 4; j++) { __m128i c4rej = vec_add_epi32(cx, rej4); __m128i rej_masks = vec_srai_epi32(c4rej, 31); /* if (is_zero(rej_masks)) */ if (vec_movemask_epi8(rej_masks) == 0) { __m128i c0_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask0), span_0); __m128i c1_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask1), span_1); __m128i c2_0 = vec_add_epi32(vec_perm(cx, cx, vshuf_mask2), span_2); __m128i c_0 = vec_or(vec_or(c0_0, c1_0), c2_0); __m128i c0_1 = vec_add_epi32(c0_0, vec_perm(dcdy, dcdy, vshuf_mask0)); __m128i c1_1 = vec_add_epi32(c1_0, vec_perm(dcdy, dcdy, vshuf_mask1)); __m128i c2_1 = vec_add_epi32(c2_0, vec_perm(dcdy, dcdy, vshuf_mask2)); __m128i c_1 = vec_or(vec_or(c0_1, c1_1), c2_1); __m128i c_01 = vec_packs_epi32(c_0, c_1); __m128i c0_2 = vec_add_epi32(c0_1, vec_perm(dcdy, dcdy, vshuf_mask0)); __m128i c1_2 = vec_add_epi32(c1_1, vec_perm(dcdy, dcdy, vshuf_mask1)); __m128i c2_2 = vec_add_epi32(c2_1, vec_perm(dcdy, dcdy, vshuf_mask2)); __m128i c_2 = vec_or(vec_or(c0_2, c1_2), c2_2); __m128i c0_3 = vec_add_epi32(c0_2, vec_perm(dcdy, dcdy, vshuf_mask0)); __m128i c1_3 = vec_add_epi32(c1_2, vec_perm(dcdy, dcdy, vshuf_mask1)); __m128i c2_3 = vec_add_epi32(c2_2, vec_perm(dcdy, dcdy, vshuf_mask2)); __m128i c_3 = vec_or(vec_or(c0_3, c1_3), c2_3); __m128i c_23 = vec_packs_epi32(c_2, c_3); __m128i c_0123 = vec_packs_epi16(c_01, c_23); unsigned mask = vec_movemask_epi8(c_0123); out[nr].i = i; 
out[nr].j = j; out[nr].mask = mask; if (mask != 0xffff) nr++; } cx = vec_add_epi32(cx, vec_slli_epi32(dcdx, 2)); } c = vec_add_epi32(c, vec_slli_epi32(dcdy, 2)); } for (i = 0; i < nr; i++) lp_rast_shade_quads_mask(task, &tri->inputs, x + 4 * out[i].j, y + 4 * out[i].i, 0xffff & ~out[i].mask); }
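/* The "adjust so we can just check the sign bit" step in the rasterizer above,
 * restated as a scalar sketch (not part of the rasterizer itself): for 32-bit
 * values away from INT_MIN, the test c <= 0 is equivalent to (c - 1) < 0,
 * i.e. to the sign bit of c - 1, which vec_srai_epi32(..., 31) extracts per lane. */
static inline unsigned le_zero_as_sign_bit(int c)
{
    return (unsigned)(c - 1) >> 31;   /* 1 when c <= 0, 0 when c > 0 */
}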
void gimp_composite_dodge_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx) { const guchar *A = ctx->A; const guchar *B = ctx->B; guchar *D = ctx->D; guint length = ctx->n_pixels; vector unsigned char a,b,d; vector unsigned char alpha_a,alpha_b,alpha; vector signed short ox0001=vec_splat_s16(1); union { vector signed short v; vector unsigned short vu; gushort u16[8]; } ah,al,bh,bl; while (length >= 4) { a=LoadUnaligned(A); b=LoadUnaligned(B); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); ah.v=vec_unpackh((vector signed char)a); ah.v=vec_sl(ah.v,ox0008); al.v=vec_unpackl((vector signed char)a); al.v=vec_sl(al.v,ox0008); b=vec_nor(b,b); bh.v=vec_unpackh((vector signed char)b); bh.v=vec_and(bh.v,ox00ff); bh.v=vec_add(bh.v,ox0001); bl.v=vec_unpackl((vector signed char)b); bl.v=vec_and(bl.v,ox00ff); bl.v=vec_add(bl.v,ox0001); ah.u16[0]=ah.u16[0]/bh.u16[0]; ah.u16[1]=ah.u16[1]/bh.u16[1]; ah.u16[2]=ah.u16[2]/bh.u16[2]; ah.u16[4]=ah.u16[4]/bh.u16[4]; ah.u16[5]=ah.u16[5]/bh.u16[5]; ah.u16[6]=ah.u16[6]/bh.u16[6]; al.u16[0]=al.u16[0]/bl.u16[0]; al.u16[1]=al.u16[1]/bl.u16[1]; al.u16[2]=al.u16[2]/bl.u16[2]; al.u16[4]=al.u16[4]/bl.u16[4]; al.u16[5]=al.u16[5]/bl.u16[5]; al.u16[6]=al.u16[6]/bl.u16[6]; d=vec_packs(ah.vu,al.vu); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnaligned(d, D); A+=16; B+=16; D+=16; length-=4; } length = length*4; a=LoadUnalignedLess(A, length); b=LoadUnalignedLess(B, length); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); ah.v=vec_unpackh((vector signed char)a); ah.v=vec_sl(ah.v,ox0008); al.v=vec_unpackl((vector signed char)a); al.v=vec_sl(al.v,ox0008); b=vec_nor(b,b); bh.v=vec_unpackh((vector signed char)b); bh.v=vec_and(bh.v,ox00ff); bh.v=vec_add(bh.v,ox0001); bl.v=vec_unpackl((vector signed char)b); bl.v=vec_and(bl.v,ox00ff); bl.v=vec_add(bl.v,ox0001); ah.u16[0]=ah.u16[0]/bh.u16[0]; ah.u16[1]=ah.u16[1]/bh.u16[1]; ah.u16[2]=ah.u16[2]/bh.u16[2]; ah.u16[4]=ah.u16[4]/bh.u16[4]; ah.u16[5]=ah.u16[5]/bh.u16[5]; ah.u16[6]=ah.u16[6]/bh.u16[6]; al.u16[0]=al.u16[0]/bl.u16[0]; al.u16[1]=al.u16[1]/bl.u16[1]; al.u16[2]=al.u16[2]/bl.u16[2]; al.u16[4]=al.u16[4]/bl.u16[4]; al.u16[5]=al.u16[5]/bl.u16[5]; al.u16[6]=al.u16[6]/bl.u16[6]; d=vec_packs(ah.vu,al.vu); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnalignedLess(d, D, length); }
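/* Scalar reference for the lane arithmetic in the dodge loop above (a sketch,
 * not a GIMP API): per colour channel it computes D = clamp(A*256 / (256 - B), 0, 255),
 * with the result alpha taken as min(alpha_A, alpha_B).  The vector code builds
 * the divisor from vec_nor (bitwise complement, i.e. 255 - B) plus 1. */
static inline unsigned char dodge_channel(unsigned char a, unsigned char b)
{
    unsigned int d = ((unsigned int)a << 8) / (256u - b);   /* a << 8 divided by (~b & 0xff) + 1 */
    return (d > 255u) ? 255u : (unsigned char)d;            /* vec_packs saturates to [0, 255] */
}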
void test1() { // CHECK-LABEL: define void @test1 res_vd = vec_add(vd, vd); // CHECK: fadd <2 x double> res_vd = vec_and(vbll, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vbll); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> res_vd = vec_and(vd, vd); // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vbll, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vbll); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_andc(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64> // CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_ceil(vd); // CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}}) res_vf = vec_ceil(vf); // CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}}) res_vbll = vec_cmpeq(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpeq(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpge(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpge(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmpgt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmpgt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmple(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmple(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) res_vbll = vec_cmplt(vd, vd); // CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}}) res_vbi = vec_cmplt(vf, vf); // CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}}) /* vec_div */ res_vf = vec_div(vf, vf); // CHECK: @llvm.ppc.vsx.xvdivsp res_vd = vec_div(vd, vd); // CHECK: @llvm.ppc.vsx.xvdivdp /* vec_max */ res_vf = vec_max(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp res_vd = vec_max(vd, vd); // CHECK: @llvm.ppc.vsx.xvmaxdp res_vf = vec_vmaxfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvmaxsp /* vec_min */ res_vf = vec_min(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp res_vd = vec_min(vd, vd); // CHECK: @llvm.ppc.vsx.xvmindp res_vf = vec_vminfp(vf, vf); // CHECK: @llvm.ppc.vsx.xvminsp res_d = __builtin_vsx_xsmaxdp(d, d); // CHECK: @llvm.ppc.vsx.xsmaxdp res_d = __builtin_vsx_xsmindp(d, d); // CHECK: @llvm.ppc.vsx.xsmindp /* vec_perm */ res_vsll = vec_perm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vull = vec_perm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vd = vec_perm(vd, vd, vuc); 
// CHECK: @llvm.ppc.altivec.vperm res_vsll = vec_vperm(vsll, vsll, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vull = vec_vperm(vull, vull, vuc); // CHECK: @llvm.ppc.altivec.vperm res_vd = vec_vperm(vd, vd, vuc); // CHECK: @llvm.ppc.altivec.vperm /* vec_vsx_ld */ res_vsi = vec_vsx_ld(0, &vsi); // CHECK: @llvm.ppc.vsx.lxvw4x res_vui = vec_vsx_ld(0, &vui); // CHECK: @llvm.ppc.vsx.lxvw4x res_vf = vec_vsx_ld (0, &vf); // CHECK: @llvm.ppc.vsx.lxvw4x res_vsll = vec_vsx_ld(0, &vsll); // CHECK: @llvm.ppc.vsx.lxvd2x res_vull = vec_vsx_ld(0, &vull); // CHECK: @llvm.ppc.vsx.lxvd2x res_vd = vec_vsx_ld(0, &vd); // CHECK: @llvm.ppc.vsx.lxvd2x /* vec_vsx_st */ vec_vsx_st(vsi, 0, &res_vsi); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vui, 0, &res_vui); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vf, 0, &res_vf); // CHECK: @llvm.ppc.vsx.stxvw4x vec_vsx_st(vsll, 0, &res_vsll); // CHECK: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vull, 0, &res_vull); // CHECK: @llvm.ppc.vsx.stxvd2x vec_vsx_st(vd, 0, &res_vd); // CHECK: @llvm.ppc.vsx.stxvd2x /* vec_and */ res_vsll = vec_and(vsll, vsll); // CHECK: and <2 x i64> res_vsll = vec_and(vbll, vsll); // CHECK: and <2 x i64> res_vsll = vec_and(vsll, vbll); // CHECK: and <2 x i64> res_vull = vec_and(vull, vull); // CHECK: and <2 x i64> res_vull = vec_and(vbll, vull); // CHECK: and <2 x i64> res_vull = vec_and(vull, vbll); // CHECK: and <2 x i64> res_vbll = vec_and(vbll, vbll); // CHECK: and <2 x i64> /* vec_vand */ res_vsll = vec_vand(vsll, vsll); // CHECK: and <2 x i64> res_vsll = vec_vand(vbll, vsll); // CHECK: and <2 x i64> res_vsll = vec_vand(vsll, vbll); // CHECK: and <2 x i64> res_vull = vec_vand(vull, vull); // CHECK: and <2 x i64> res_vull = vec_vand(vbll, vull); // CHECK: and <2 x i64> res_vull = vec_vand(vull, vbll); // CHECK: and <2 x i64> res_vbll = vec_vand(vbll, vbll); // CHECK: and <2 x i64> /* vec_andc */ res_vsll = vec_andc(vsll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vsll = vec_andc(vbll, vsll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vsll = vec_andc(vsll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vull, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vbll, vull); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vull = vec_andc(vull, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vbll = vec_andc(vbll, vbll); // CHECK: xor <2 x i64> // CHECK: and <2 x i64> res_vf = vec_floor(vf); // CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_floor(vd); // CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_madd(vf, vf, vf); // CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) res_vd = vec_madd(vd, vd, vd); // CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) res_vf = vec_msub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> res_vd = vec_msub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> res_vf = vec_mul(vf, vf); // CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}} res_vd = vec_mul(vd, vd); // CHECK: fmul <2 x double> 
%{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_nearbyint(vf); // CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_nearbyint(vd); // CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_nmadd(vf, vf, vf); // CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}) // CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]] res_vd = vec_nmadd(vd, vd, vd); // CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}) // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] res_vf = vec_nmsub(vf, vf, vf); // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> // CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}} res_vd = vec_nmsub(vd, vd, vd); // CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}} // CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> // CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]] /* vec_nor */ res_vsll = vec_nor(vsll, vsll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vull = vec_nor(vull, vull); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vull = vec_nor(vbll, vbll); // CHECK: or <2 x i64> // CHECK: xor <2 x i64> res_vd = vec_nor(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1> /* vec_or */ res_vsll = vec_or(vsll, vsll); // CHECK: or <2 x i64> res_vsll = vec_or(vbll, vsll); // CHECK: or <2 x i64> res_vsll = vec_or(vsll, vbll); // CHECK: or <2 x i64> res_vull = vec_or(vull, vull); // CHECK: or <2 x i64> res_vull = vec_or(vbll, vull); // CHECK: or <2 x i64> res_vull = vec_or(vull, vbll); // CHECK: or <2 x i64> res_vbll = vec_or(vbll, vbll); // CHECK: or <2 x i64> res_vd = vec_or(vd, vd); // CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64> // CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_rint(vf); // CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_rint(vd); // CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}}) res_vf = vec_rsqrte(vf); // CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}}) res_vd = vec_rsqrte(vd); // CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}}) dummy(); // CHECK: call void @dummy() res_vf = vec_sel(vd, vd, vbll); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_sel(vd, vd, vull); // CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1> // CHECK: and <2 x i64> %{{[0-9]+}}, // CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: or <2 x i64> // CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double> res_vf = vec_sqrt(vf); // CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}}) res_vd = 
vec_sqrt(vd); // CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}}) res_vd = vec_sub(vd, vd); // CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}} res_vf = vec_trunc(vf); // CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}}) res_vd = vec_trunc(vd); // CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}}) /* vec_vor */ res_vsll = vec_vor(vsll, vsll); // CHECK: or <2 x i64> res_vsll = vec_vor(vbll, vsll); // CHECK: or <2 x i64> res_vsll = vec_vor(vsll, vbll); // CHECK: or <2 x i64> res_vull = vec_vor(vull, vull); // CHECK: or <2 x i64> res_vull = vec_vor(vbll, vull); // CHECK: or <2 x i64> res_vull = vec_vor(vull, vbll); // CHECK: or <2 x i64> res_vbll = vec_vor(vbll, vbll); // CHECK: or <2 x i64> /* vec_xor */ res_vsll = vec_xor(vsll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_xor(vbll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_xor(vsll, vbll); // CHECK: xor <2 x i64> res_vull = vec_xor(vull, vull); // CHECK: xor <2 x i64> res_vull = vec_xor(vbll, vull); // CHECK: xor <2 x i64> res_vull = vec_xor(vull, vbll); // CHECK: xor <2 x i64> res_vbll = vec_xor(vbll, vbll); // CHECK: xor <2 x i64> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vd, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vd, vbll); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> dummy(); // CHECK: call void @dummy() res_vd = vec_xor(vbll, vd); // CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}} // CHECK: bitcast <2 x i64> [[X1]] to <2 x double> /* vec_vxor */ res_vsll = vec_vxor(vsll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_vxor(vbll, vsll); // CHECK: xor <2 x i64> res_vsll = vec_vxor(vsll, vbll); // CHECK: xor <2 x i64> res_vull = vec_vxor(vull, vull); // CHECK: xor <2 x i64> res_vull = vec_vxor(vbll, vull); // CHECK: xor <2 x i64> res_vull = vec_vxor(vull, vbll); // CHECK: xor <2 x i64> res_vbll = vec_vxor(vbll, vbll); // CHECK: xor <2 x i64> }
void gimp_composite_grain_extract_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx) { const guchar *A = ctx->A; const guchar *B = ctx->B; guchar *D = ctx->D; guint length = ctx->n_pixels; vector unsigned char a,b,d,alpha_a,alpha_b,alpha; vector signed short ah,al,bh,bl; while (length >= 4) { a=LoadUnaligned(A); b=LoadUnaligned(B); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); ah=vec_unpackh((vector signed char)a); ah=vec_and(ah,ox00ff); al=vec_unpackl((vector signed char)a); al=vec_and(al,ox00ff); bh=vec_unpackh((vector signed char)b); bh=vec_and(bh,ox00ff); bl=vec_unpackl((vector signed char)b); bl=vec_and(bl,ox00ff); ah=vec_sub(ah,bh); al=vec_sub(al,bl); ah=vec_sub(ah,oxff80); al=vec_sub(al,oxff80); d=vec_packsu(ah,al); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnaligned(d, D); A+=16; B+=16; D+=16; length-=4; } /* process last pixels */ length = length*4; a=LoadUnalignedLess(A, length); b=LoadUnalignedLess(B, length); alpha_a=vec_and(a, alphamask); alpha_b=vec_and(b, alphamask); alpha=vec_min(alpha_a, alpha_b); ah=vec_unpackh((vector signed char)a); ah=vec_and(ah,ox00ff); al=vec_unpackl((vector signed char)a); al=vec_and(al,ox00ff); bh=vec_unpackh((vector signed char)b); bh=vec_and(bh,ox00ff); bl=vec_unpackl((vector signed char)b); bl=vec_and(bl,ox00ff); ah=vec_sub(ah,bh); al=vec_sub(al,bl); ah=vec_sub(ah,oxff80); al=vec_sub(al,oxff80); d=vec_packsu(ah,al); d=vec_andc(d, alphamask); d=vec_or(d, alpha); StoreUnalignedLess(d, D, length); }
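/* Scalar reference for the grain-extract arithmetic above (a sketch, not a
 * GIMP API): per channel it is D = clamp(A - B + 128, 0, 255); subtracting
 * oxff80 (-128 as a signed 16-bit constant) is how the vector code adds the
 * 128 bias, and vec_packsu provides the clamp.  Alpha is again min(alpha_A, alpha_B). */
static inline unsigned char grain_extract_channel(unsigned char a, unsigned char b)
{
    int d = (int)a - (int)b + 128;   /* (a - b) - 0xff80 in the 16-bit lanes */
    if (d < 0)   d = 0;              /* vec_packsu saturates to [0, 255] */
    if (d > 255) d = 255;
    return (unsigned char)d;
}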
void x264_zigzag_interleave_8x8_cavlc_altivec( int16_t *dst, int16_t *src, uint8_t *nnz ) { vec_s16_t tmpv[8]; vec_s16_t merge[2]; vec_s16_t permv[2]; vec_s16_t orv[4]; vec_s16_t src0v = vec_ld( 0*16, src ); vec_s16_t src1v = vec_ld( 1*16, src ); vec_s16_t src2v = vec_ld( 2*16, src ); vec_s16_t src3v = vec_ld( 3*16, src ); vec_s16_t src4v = vec_ld( 4*16, src ); vec_s16_t src5v = vec_ld( 5*16, src ); vec_s16_t src6v = vec_ld( 6*16, src ); vec_s16_t src7v = vec_ld( 7*16, src ); vec_u8_t pack; vec_u8_t nnzv = vec_vsx_ld( 0, nnz ); vec_u8_t shift = vec_splat_u8( 7 ); LOAD_ZERO; const vec_u8_t mask[3] = { { 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17 }, { 0x08, 0x09, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F, 0x18, 0x19, 0x1A, 0x1B, 0x1C, 0x1D, 0x1E, 0x1F }, { 0x10, 0x11, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07, 0x12, 0x13, 0x0A, 0x0B, 0x0C, 0x0D, 0x0E, 0x0F } }; tmpv[0] = vec_mergeh( src0v, src1v ); tmpv[1] = vec_mergel( src0v, src1v ); tmpv[2] = vec_mergeh( src2v, src3v ); tmpv[3] = vec_mergel( src2v, src3v ); tmpv[4] = vec_mergeh( src4v, src5v ); tmpv[5] = vec_mergel( src4v, src5v ); tmpv[6] = vec_mergeh( src6v, src7v ); tmpv[7] = vec_mergel( src6v, src7v ); merge[0] = vec_mergeh( tmpv[0], tmpv[1] ); merge[1] = vec_mergeh( tmpv[2], tmpv[3] ); permv[0] = vec_perm( merge[0], merge[1], mask[0] ); permv[1] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 0*16, dst ); merge[0] = vec_mergeh( tmpv[4], tmpv[5] ); merge[1] = vec_mergeh( tmpv[6], tmpv[7] ); permv[0] = vec_perm( merge[0], merge[1], mask[0] ); permv[2] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 1*16, dst ); vec_st( permv[1], 2*16, dst ); vec_st( permv[2], 3*16, dst ); merge[0] = vec_mergel( tmpv[0], tmpv[1] ); merge[1] = vec_mergel( tmpv[2], tmpv[3] ); permv[0] = vec_perm( merge[0], merge[1], mask[0] ); permv[1] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 4*16, dst ); merge[0] = vec_mergel( tmpv[4], tmpv[5] ); merge[1] = vec_mergel( tmpv[6], tmpv[7] ); permv[0] = vec_perm( merge[0], merge[1], mask[0] ); permv[2] = vec_perm( merge[0], merge[1], mask[1] ); vec_st( permv[0], 5*16, dst ); vec_st( permv[1], 6*16, dst ); vec_st( permv[2], 7*16, dst ); orv[0] = vec_or( src0v, src1v ); orv[1] = vec_or( src2v, src3v ); orv[2] = vec_or( src4v, src5v ); orv[3] = vec_or( src6v, src7v ); permv[0] = vec_or( orv[0], orv[1] ); permv[1] = vec_or( orv[2], orv[3] ); permv[0] = vec_or( permv[0], permv[1] ); permv[1] = vec_perm( permv[0], permv[0], mask[1] ); permv[0] = vec_or( permv[0], permv[1] ); pack = (vec_u8_t)vec_packs( permv[0], permv[0] ); pack = (vec_u8_t)vec_cmpeq( pack, zerov ); pack = vec_nor( pack, zerov ); pack = vec_sr( pack, shift ); nnzv = vec_perm( nnzv, pack, mask[2] ); vec_st( nnzv, 0, nnz ); }
static int forward_engine(int do_full, const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_OMX *ox, float *opt_sc) { vector float mpv, dpv, ipv; /* previous row values */ vector float sv; /* temp storage of 1 curr row value in progress */ vector float dcv; /* delayed storage of D(i,q+1) */ vector float xEv; /* E state: keeps max for Mk->E as we go */ vector float xBv; /* B state: splatted vector of B[i-1] for B->Mk calculations */ vector float zerov; /* splatted 0.0's in a vector */ float xN, xE, xB, xC, xJ; /* special states' scores */ int i; /* counter over sequence positions 1..L */ int q; /* counter over quads 0..nq-1 */ int j; /* counter over DD iterations (4 is full serialization) */ int Q = p7O_NQF(om->M); /* segment length: # of vectors */ vector float *dpc = ox->dpf[0]; /* current row, for use in {MDI}MO(dpp,q) access macro */ vector float *dpp; /* previous row, for use in {MDI}MO(dpp,q) access macro */ vector float *rp; /* will point at om->rfv[x] for residue x[i] */ vector float *tp; /* will point into (and step thru) om->tfv */ /* Initialization. */ ox->M = om->M; ox->L = L; ox->has_own_scales = TRUE; /* all forward matrices control their own scalefactors */ zerov = (vector float) vec_splat_u32(0); for (q = 0; q < Q; q++) MMO(dpc,q) = IMO(dpc,q) = DMO(dpc,q) = zerov; xE = ox->xmx[p7X_E] = 0.; xN = ox->xmx[p7X_N] = 1.; xJ = ox->xmx[p7X_J] = 0.; xB = ox->xmx[p7X_B] = om->xf[p7O_N][p7O_MOVE]; xC = ox->xmx[p7X_C] = 0.; ox->xmx[p7X_SCALE] = 1.0; ox->totscale = 0.0; #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, 0, 9, 5, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=0, width=8, precision=5*/ #endif for (i = 1; i <= L; i++) { dpp = dpc; dpc = ox->dpf[do_full * i]; /* avoid conditional, use do_full as kronecker delta */ rp = om->rfv[dsq[i]]; tp = om->tfv; dcv = (vector float) vec_splat_u32(0); xEv = (vector float) vec_splat_u32(0); xBv = esl_vmx_set_float(xB); /* Right shifts by 4 bytes. 4,8,12,x becomes x,4,8,12. Shift zeros on. */ mpv = vec_sld(zerov, MMO(dpp,Q-1), 12); dpv = vec_sld(zerov, DMO(dpp,Q-1), 12); ipv = vec_sld(zerov, IMO(dpp,Q-1), 12); for (q = 0; q < Q; q++) { /* Calculate new MMO(i,q); don't store it yet, hold it in sv. */ sv = (vector float) vec_splat_u32(0); sv = vec_madd(xBv, *tp, sv); tp++; sv = vec_madd(mpv, *tp, sv); tp++; sv = vec_madd(ipv, *tp, sv); tp++; sv = vec_madd(dpv, *tp, sv); tp++; sv = vec_madd(sv, *rp, zerov); rp++; xEv = vec_add(xEv, sv); /* Load {MDI}(i-1,q) into mpv, dpv, ipv; * {MDI}MX(q) is then the current, not the prev row */ mpv = MMO(dpp,q); dpv = DMO(dpp,q); ipv = IMO(dpp,q); /* Do the delayed stores of {MD}(i,q) now that memory is usable */ MMO(dpc,q) = sv; DMO(dpc,q) = dcv; /* Calculate the next D(i,q+1) partially: M->D only; * delay storage, holding it in dcv */ dcv = vec_madd(sv, *tp, zerov); tp++; /* Calculate and store I(i,q); assumes odds ratio for emission is 1.0 */ sv = vec_madd(mpv, *tp, zerov); tp++; IMO(dpc,q) = vec_madd(ipv, *tp, sv); tp++; } /* Now the DD paths. We would rather not serialize them but * in an accurate Forward calculation, we have few options. 
*/ /* dcv has carried through from end of q loop above; store it * in first pass, we add M->D and D->D path into DMX */ /* We're almost certainly're obligated to do at least one complete * DD path to be sure: */ dcv = vec_sld(zerov, dcv, 12); DMO(dpc,0) = (vector float) vec_splat_u32(0); tp = om->tfv + 7*Q; /* set tp to start of the DD's */ for (q = 0; q < Q; q++) { DMO(dpc,q) = vec_add(dcv, DMO(dpc,q)); dcv = vec_madd(DMO(dpc,q), *tp, zerov); tp++; /* extend DMO(q), so we include M->D and D->D paths */ } /* now. on small models, it seems best (empirically) to just go * ahead and serialize. on large models, we can do a bit better, * by testing for when dcv (DD path) accrued to DMO(q) is below * machine epsilon for all q, in which case we know DMO(q) are all * at their final values. The tradeoff point is (empirically) somewhere around M=100, * at least on my desktop. We don't worry about the conditional here; * it's outside any inner loops. */ if (om->M < 100) { /* Fully serialized version */ for (j = 1; j < 4; j++) { dcv = vec_sld(zerov, dcv, 12); tp = om->tfv + 7*Q; /* set tp to start of the DD's */ for (q = 0; q < Q; q++) { /* note, extend dcv, not DMO(q); only adding DD paths now */ DMO(dpc,q) = vec_add(dcv, DMO(dpc,q)); dcv = vec_madd(dcv, *tp, zerov); tp++; } } } else { /* Slightly parallelized version, but which incurs some overhead */ for (j = 1; j < 4; j++) { vector bool int cv; /* keeps track of whether any DD's change DMO(q) */ dcv = vec_sld(zerov, dcv, 12); tp = om->tfv + 7*Q; /* set tp to start of the DD's */ cv = (vector bool int) vec_splat_u32(0); for (q = 0; q < Q; q++) { /* using cmpgt below tests if DD changed any DMO(q) *without* conditional branch */ sv = vec_add(dcv, DMO(dpc,q)); cv = vec_or(cv, vec_cmpgt(sv, DMO(dpc,q))); DMO(dpc,q) = sv; /* store new DMO(q) */ dcv = vec_madd(dcv, *tp, zerov); tp++; /* note, extend dcv, not DMO(q) */ } /* DD's didn't change any DMO(q)? Then done, break out. */ if (vec_all_eq(cv, (vector bool int)zerov)) break; } } /* Add D's to xEv */ for (q = 0; q < Q; q++) xEv = vec_add(DMO(dpc,q), xEv); /* Finally the "special" states, which start from Mk->E (->C, ->J->B) */ /* The following incantation is a horizontal sum of xEv's elements */ /* These must follow DD calculations, because D's contribute to E in Forward * (as opposed to Viterbi) */ xE = esl_vmx_hsum_float(xEv); xN = xN * om->xf[p7O_N][p7O_LOOP]; xC = (xC * om->xf[p7O_C][p7O_LOOP]) + (xE * om->xf[p7O_E][p7O_MOVE]); xJ = (xJ * om->xf[p7O_J][p7O_LOOP]) + (xE * om->xf[p7O_E][p7O_LOOP]); xB = (xJ * om->xf[p7O_J][p7O_MOVE]) + (xN * om->xf[p7O_N][p7O_MOVE]); /* and now xB will carry over into next i, and xC carries over after i=L */ /* Sparse rescaling. xE above threshold? trigger a rescaling event. */ if (xE > 1.0e4) /* that's a little less than e^10, ~10% of our dynamic range */ { xN = xN / xE; xC = xC / xE; xJ = xJ / xE; xB = xB / xE; xEv = esl_vmx_set_float(1.0 / xE); for (q = 0; q < Q; q++) { MMO(dpc,q) = vec_madd(MMO(dpc,q), xEv, zerov); DMO(dpc,q) = vec_madd(DMO(dpc,q), xEv, zerov); IMO(dpc,q) = vec_madd(IMO(dpc,q), xEv, zerov); } ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = xE; ox->totscale += log(xE); xE = 1.0; } else ox->xmx[i*p7X_NXCELLS+p7X_SCALE] = 1.0; /* Storage of the specials. We could've stored these already * but using xE, etc. variables makes it easy to convert this * code to O(M) memory versions just by deleting storage steps. 
*/ ox->xmx[i*p7X_NXCELLS+p7X_E] = xE; ox->xmx[i*p7X_NXCELLS+p7X_N] = xN; ox->xmx[i*p7X_NXCELLS+p7X_J] = xJ; ox->xmx[i*p7X_NXCELLS+p7X_B] = xB; ox->xmx[i*p7X_NXCELLS+p7X_C] = xC; #if p7_DEBUGGING if (ox->debugging) p7_omx_DumpFBRow(ox, TRUE, i, 9, 5, xE, xN, xJ, xB, xC); /* logify=TRUE, <rowi>=i, width=8, precision=5*/ #endif } /* end loop over sequence residues 1..L */ /* finally C->T, and flip total score back to log space (nats) */ /* On overflow, xC is inf or nan (nan arises because inf*0 = nan). */ /* On an underflow (which shouldn't happen), we counterintuitively return infinity: * the effect of this is to force the caller to rescore us with full range. */ if (isnan(xC)) ESL_EXCEPTION(eslERANGE, "forward score is NaN"); else if (L>0 && xC == 0.0) ESL_EXCEPTION(eslERANGE, "forward score underflow (is 0.0)"); /* [J5/118] */ else if (isinf(xC) == 1) ESL_EXCEPTION(eslERANGE, "forward score overflow (is infinity)"); if (opt_sc != NULL) *opt_sc = ox->totscale + log(xC * om->xf[p7O_C][p7O_MOVE]); return eslOK; }
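/* Sketch of the sparse-rescaling step used in the forward engine above (a
 * hypothetical helper, not the Easel/HMMER API): once the row's probability
 * mass xE exceeds a threshold, every cell is divided by xE and log(xE) is
 * banked into a running log-scale total, so the score is recovered in log space. */
#include <math.h>

static void rescale_row(float *row, int n, float xE, double *totscale)
{
    if (xE > 1.0e4) {                 /* same threshold as the vector code */
        for (int q = 0; q < n; q++)
            row[q] /= xE;             /* the vector code multiplies by 1/xE instead */
        *totscale += log(xE);         /* accumulate the scale factor in log space */
    }
}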
void jsimd_idct_islow_altivec (void * dct_table_, JCOEFPTR coef_block, JSAMPARRAY output_buf, JDIMENSION output_col) { short *dct_table = (short *)dct_table_; int *outptr; __vector short row0, row1, row2, row3, row4, row5, row6, row7, col0, col1, col2, col3, col4, col5, col6, col7, quant0, quant1, quant2, quant3, quant4, quant5, quant6, quant7, tmp0, tmp1, tmp2, tmp3, z3, z4, z34l, z34h, col71l, col71h, col26l, col26h, col53l, col53h, row71l, row71h, row26l, row26h, row53l, row53h, out0, out1, out2, out3, out4, out5, out6, out7; __vector int tmp0l, tmp0h, tmp1l, tmp1h, tmp2l, tmp2h, tmp3l, tmp3h, tmp10l, tmp10h, tmp11l, tmp11h, tmp12l, tmp12h, tmp13l, tmp13h, z3l, z3h, z4l, z4h, out0l, out0h, out1l, out1h, out2l, out2h, out3l, out3h, out4l, out4h, out5l, out5h, out6l, out6h, out7l, out7h; __vector signed char outb; /* Constants */ __vector short pw_zero = { __8X(0) }, pw_f130_f054 = { __4X2(F_0_541 + F_0_765, F_0_541) }, pw_f054_mf130 = { __4X2(F_0_541, F_0_541 - F_1_847) }, pw_mf078_f117 = { __4X2(F_1_175 - F_1_961, F_1_175) }, pw_f117_f078 = { __4X2(F_1_175, F_1_175 - F_0_390) }, pw_mf060_mf089 = { __4X2(F_0_298 - F_0_899, -F_0_899) }, pw_mf089_f060 = { __4X2(-F_0_899, F_1_501 - F_0_899) }, pw_mf050_mf256 = { __4X2(F_2_053 - F_2_562, -F_2_562) }, pw_mf256_f050 = { __4X2(-F_2_562, F_3_072 - F_2_562) }; __vector unsigned short pass1_bits = { __8X(PASS1_BITS) }; __vector int pd_zero = { __4X(0) }, pd_descale_p1 = { __4X(1 << (DESCALE_P1 - 1)) }, pd_descale_p2 = { __4X(1 << (DESCALE_P2 - 1)) }; __vector unsigned int descale_p1 = { __4X(DESCALE_P1) }, descale_p2 = { __4X(DESCALE_P2) }, const_bits = { __4X(CONST_BITS) }; __vector signed char pb_centerjsamp = { __16X(CENTERJSAMPLE) }; /* Pass 1: process columns */ col0 = vec_ld(0, coef_block); col1 = vec_ld(16, coef_block); col2 = vec_ld(32, coef_block); col3 = vec_ld(48, coef_block); col4 = vec_ld(64, coef_block); col5 = vec_ld(80, coef_block); col6 = vec_ld(96, coef_block); col7 = vec_ld(112, coef_block); tmp1 = vec_or(col1, col2); tmp2 = vec_or(col3, col4); tmp1 = vec_or(tmp1, tmp2); tmp3 = vec_or(col5, col6); tmp3 = vec_or(tmp3, col7); tmp1 = vec_or(tmp1, tmp3); quant0 = vec_ld(0, dct_table); col0 = vec_mladd(col0, quant0, pw_zero); if (vec_all_eq(tmp1, pw_zero)) { /* AC terms all zero */ col0 = vec_sl(col0, pass1_bits); row0 = vec_splat(col0, 0); row1 = vec_splat(col0, 1); row2 = vec_splat(col0, 2); row3 = vec_splat(col0, 3); row4 = vec_splat(col0, 4); row5 = vec_splat(col0, 5); row6 = vec_splat(col0, 6); row7 = vec_splat(col0, 7); } else { quant1 = vec_ld(16, dct_table); quant2 = vec_ld(32, dct_table); quant3 = vec_ld(48, dct_table); quant4 = vec_ld(64, dct_table); quant5 = vec_ld(80, dct_table); quant6 = vec_ld(96, dct_table); quant7 = vec_ld(112, dct_table); col1 = vec_mladd(col1, quant1, pw_zero); col2 = vec_mladd(col2, quant2, pw_zero); col3 = vec_mladd(col3, quant3, pw_zero); col4 = vec_mladd(col4, quant4, pw_zero); col5 = vec_mladd(col5, quant5, pw_zero); col6 = vec_mladd(col6, quant6, pw_zero); col7 = vec_mladd(col7, quant7, pw_zero); DO_IDCT(col, 1); TRANSPOSE(out, row); } /* Pass 2: process rows */ DO_IDCT(row, 2); TRANSPOSE(out, col); outb = vec_packs(col0, col0); outb = vec_add(outb, pb_centerjsamp); outptr = (int *)(output_buf[0] + output_col); vec_ste((__vector int)outb, 0, outptr); vec_ste((__vector int)outb, 4, outptr); outb = vec_packs(col1, col1); outb = vec_add(outb, pb_centerjsamp); outptr = (int *)(output_buf[1] + output_col); vec_ste((__vector int)outb, 0, outptr); vec_ste((__vector int)outb, 4, outptr); outb = 
vec_packs(col2, col2); outb = vec_add(outb, pb_centerjsamp); outptr = (int *)(output_buf[2] + output_col); vec_ste((__vector int)outb, 0, outptr); vec_ste((__vector int)outb, 4, outptr); outb = vec_packs(col3, col3); outb = vec_add(outb, pb_centerjsamp); outptr = (int *)(output_buf[3] + output_col); vec_ste((__vector int)outb, 0, outptr); vec_ste((__vector int)outb, 4, outptr); outb = vec_packs(col4, col4); outb = vec_add(outb, pb_centerjsamp); outptr = (int *)(output_buf[4] + output_col); vec_ste((__vector int)outb, 0, outptr); vec_ste((__vector int)outb, 4, outptr); outb = vec_packs(col5, col5); outb = vec_add(outb, pb_centerjsamp); outptr = (int *)(output_buf[5] + output_col); vec_ste((__vector int)outb, 0, outptr); vec_ste((__vector int)outb, 4, outptr); outb = vec_packs(col6, col6); outb = vec_add(outb, pb_centerjsamp); outptr = (int *)(output_buf[6] + output_col); vec_ste((__vector int)outb, 0, outptr); vec_ste((__vector int)outb, 4, outptr); outb = vec_packs(col7, col7); outb = vec_add(outb, pb_centerjsamp); outptr = (int *)(output_buf[7] + output_col); vec_ste((__vector int)outb, 0, outptr); vec_ste((__vector int)outb, 4, outptr); }
int main(int argc, char **argv) { // setup, assign particles' initial positions and masses // this is done in scalar fashion, NOT SIMD // insignificant to performance since it's only done once //time_t startTime = time(NULL); //seed random generator srand( time(NULL) ); printf("\n\n\n~~~~~~~~Printing out particles and their randomly assigned positions: \n\n"); int pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { int grideSize = GRID_SIZE; // printf("\n grideSize/2: %d", grideSize/2); float xPos = (float)( rand() % grideSize - grideSize/2); float yPos = (float)( rand() % grideSize - grideSize/2); float zPos = (float)( rand() % grideSize - grideSize/2); particle_Array_PPU[pC].position[0] = xPos; particle_Array_PPU[pC].position[1] = yPos; particle_Array_PPU[pC].position[2] = zPos; particle_Array_PPU[pC].velocity[3] = PARTICLES_DEFAULTMASS; if(pC == 0) { // center, high mass particle_Array_PPU[pC].position = zeroVector; particle_Array_PPU[pC].velocity = zeroVector; //initialVelocityVector_Y_minus; printf("Earth mass: %f\n", earthMass ); particle_Array_PPU[pC].velocity[3] = earthMass; // PARTICLES_DEFAULTMASS * 500.0f; } if(pC == 1) { particle_Array_PPU[pC].position = issPosition; //initPositionVector; particle_Array_PPU[pC].velocity = issVelocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = issMass; //PARTICLES_DEFAULTMASS * 500.0f; } if(pC == 2) { particle_Array_PPU[pC].position = sat1Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat1Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } if(pC == 3) { particle_Array_PPU[pC].position = sat2Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat2Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } if(pC == 4) { particle_Array_PPU[pC].position = sat3Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat3Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } if(pC == 5) { particle_Array_PPU[pC].position = sat4Position; //initPositionVector; particle_Array_PPU[pC].velocity = sat4Velocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = satMass; } if(pC == 6) { particle_Array_PPU[pC].position = moonPosition; //initPositionVector; particle_Array_PPU[pC].velocity = moonVelocity; //initialVelocityVector_Y; particle_Array_PPU[pC].velocity[3] = moonMass; } else { } //particle_Array_PPU[pC].position = vec_splat(particle_Array_PPU[pC].position, 1); //particle_Array_PPU[pC].position = vec_splats((float)GRAVITATIONALCONSTANT); --> use splats, seems faster printf("Particle %d: ", pC ); printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[pC].position[0], particle_Array_PPU[pC].position[1], particle_Array_PPU[pC].position[2], particle_Array_PPU[pC].velocity[3]); printf("\n"); } // copy arrays into spe ones pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { spe1_Data[pC] = particle_Array_PPU[pC]; spe2_Data[pC] = particle_Array_PPU[pC]; spe3_Data[pC] = particle_Array_PPU[pC]; spe4_Data[pC] = particle_Array_PPU[pC]; spe5_Data[pC] = particle_Array_PPU[pC]; spe6_Data[pC] = particle_Array_PPU[pC]; } for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { /////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes // compare with zero vector to get on which side of each axis the particle is // 0 is negative, 1 is positive side of the axis __vector bool int axisDirection = vec_cmpgt(particle_Array_PPU[i].position, zeroVector); // need to manually set, can't cast due to size difference 
error __vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0], (unsigned int)axisDirection[1], (unsigned int)axisDirection[2], 0}; // need to do this to revert 1s into NON 2s complement form --> vec_cmpgt doc LIES shiftedAxis = vec_andc(oneVector, shiftedAxis); /* printf("Particle %d axis sign: ", i ); printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]); printf("\n"); */ // shift 3 axes simultaneously (actually only 2, 1 stays in original position) //, with intent to OR them later shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector __vector unsigned int axis_Y = vec_splats(shiftedAxis[1]); __vector unsigned int axis_Z = vec_splats(shiftedAxis[2]); // merge shifted x y z values by OR-ing // this gives the octant id, range from 0-7 (000 to 111 in binary) shiftedAxis = vec_or(shiftedAxis, axis_Y); shiftedAxis = vec_or(shiftedAxis, axis_Z); // insert octant value into last slot of position vector of particle particle_Array_PPU[i].position[3] = (float)shiftedAxis[0]; //printf("Oct ID: %d \n", shiftedAxis[0]); /////// Update octant vector by incrementing octant that the particle is in // The only possible non-SIMD line in the entire program, //irrelevant since quadrant counting should occur on PPU anyways octantCount[shiftedAxis[0]] ++ ; } i=0; printf("\n"); printf("Particle distribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); int speCount = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES,-1); /* printf("\n"); printf("%d", speCount); printf("\n"); printf("\n"); printf("--------------\n"); printf("Starting spe1 part\n"); */ /* // wait for user input, gives time to start graphics printf("Press Enter to continue\n"); getchar(); */ struct timeval start; gettimeofday(&start,NULL); int iterCount = 0; for (iterCount = 0; iterCount< ITERATION_COUNT; iterCount++) { //printf("++++++++++++++ START of ITERATION # %d of %d +++++++++++++++\n", i, ITERATION_COUNT ); int retval; pthread_t spe1_Thread; pthread_t spe2_Thread; pthread_t spe3_Thread; pthread_t spe4_Thread; pthread_t spe5_Thread; pthread_t spe6_Thread; //speData = spe1_Data; speNumber = 0; /* Create Thread */ // printf("spe1_Data value: %d\n", (int)spe1_Data ); retval = pthread_create(&spe1_Thread, // Thread object NULL, // Thread attributes spe_code_launch_1, // Thread function NULL // Thread argument ); // printf("spe2_Data value: %d\n", (int)spe2_Data ); retval = pthread_create(&spe2_Thread, // Thread object NULL, // Thread attributes spe_code_launch_2, // Thread function NULL // Thread argument ); retval = pthread_create(&spe3_Thread, // Thread object NULL, // Thread attributes spe_code_launch_3, // Thread function NULL // Thread argument ); retval = pthread_create(&spe4_Thread, // Thread object NULL, // Thread attributes spe_code_launch_4, // Thread function NULL // Thread argument ); retval = pthread_create(&spe5_Thread, // Thread object NULL, // Thread attributes spe_code_launch_5, // Thread function NULL // Thread argument ); retval = pthread_create(&spe6_Thread, // Thread object NULL, // Thread attributes spe_code_launch_6, // Thread function NULL // Thread argument ); //Wait for Thread Completion retval = pthread_join(spe1_Thread, NULL); retval = pthread_join(spe2_Thread, NULL); retval = pthread_join(spe3_Thread, NULL); retval = pthread_join(spe4_Thread, NULL); retval = 
pthread_join(spe5_Thread, NULL); retval = pthread_join(spe6_Thread, NULL); speNumber = 1; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe1_Data[i]; } speNumber = 2; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe2_Data[i]; } speNumber = 3; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe3_Data[i]; } speNumber = 4; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe4_Data[i]; } speNumber = 5; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i) { particle_Array_PPU[i] = spe5_Data[i]; } speNumber = 6; for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<PARTICLES_MAXCOUNT; ++i) { particle_Array_PPU[i] = spe6_Data[i]; } // reset spe counter speNumber = 0; // copy arrays into spe ones pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { spe1_Data[pC] = particle_Array_PPU[pC]; spe2_Data[pC] = particle_Array_PPU[pC]; spe3_Data[pC] = particle_Array_PPU[pC]; spe4_Data[pC] = particle_Array_PPU[pC]; spe5_Data[pC] = particle_Array_PPU[pC]; spe6_Data[pC] = particle_Array_PPU[pC]; // update values for shared array (graphics) /* particle_Array_Shared[pC].position[0] = particle_Array_PPU[pC].position[0]; particle_Array_Shared[pC].position[1] = particle_Array_PPU[pC].position[1]; particle_Array_Shared[pC].position[2] = particle_Array_PPU[pC].position[2]; particle_Array_Shared[pC].position[3] = particle_Array_PPU[pC].position[3]; */ /* printf("Particle %d positions: ", pC ); printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[pC].position[0], particle_Array_PPU[pC].position[1], particle_Array_PPU[pC].position[2], particle_Array_PPU[pC].velocity[3]); printf("\n"); */ fullSimilationData[iterCount].particleArray[pC]= particle_Array_PPU[pC]; } // printf("++++++++++++++ END of ITERATION # %d of %d +++++++++++++++\n", iterCount, ITERATION_COUNT ); } struct timeval end; gettimeofday(&end,NULL); float deltaTime = ((end.tv_sec - start.tv_sec)*1000.0f + (end.tv_usec -start.tv_usec)/1000.0f); printf("print out values from post spe calculations\n"); i = 0; for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { printf("Particle %d positions: ", i ); printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[i].position[0], particle_Array_PPU[i].position[1], particle_Array_PPU[i].position[2], particle_Array_PPU[i].velocity[3]); printf("\n"); } //cleaning the array octantCount = resetOctantCount; for(i = 0; i<PARTICLES_MAXCOUNT; ++i) { /////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes // compare with zero vector to get on which side of each axis the particle is // 0 is negative, 1 is positive side of the axis __vector bool int axisDirection = vec_cmpgt(particle_Array_PPU[i].position, zeroVector); // need to manually set, can't cast due to size difference error __vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0], (unsigned int)axisDirection[1], (unsigned int)axisDirection[2], 0}; // need to do this to revert 1s into NON 2s complement form --> vec_cmpgt doc LIES shiftedAxis = vec_andc(oneVector, shiftedAxis); /* printf("Particle %d axis sign: ", i ); printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]); printf("\n"); */ // shift 3 axes simultaneously (actually only 2, 1 stays in original position) //, with intent to OR them 
later shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector __vector unsigned int axis_Y = vec_splats(shiftedAxis[1]); __vector unsigned int axis_Z = vec_splats(shiftedAxis[2]); // merge shifted x y z values by OR-ing // this gives the octant id, range from 0-7 (000 to 111 in binary) shiftedAxis = vec_or(shiftedAxis, axis_Y); shiftedAxis = vec_or(shiftedAxis, axis_Z); // insert octant value into last slot of position vector of particle particle_Array_PPU[i].position[3] = (float)shiftedAxis[0]; //printf("Oct ID: %d \n", shiftedAxis[0]); /////// Update octant vector by incrementing octant that the particle is in // The only possible non-SIMD line in the entire program, //irrelevant since quadrant counting should occur on PPU anyways octantCount[shiftedAxis[0]] ++ ; } i=0; printf("\n"); printf("Particle distribution across the octants: \n"); printf("O0: %d O1: %d O2: %d O3: %d O4: %d O5: %d O6: %d O7: %d\n", octantCount[0], octantCount[1], octantCount[2], octantCount[3], octantCount[4], octantCount[5], octantCount[6], octantCount[7]); printf("\n"); /* time_t endTime = time(NULL); int deltaTime = endTime - startTime; */ // need to look into http://www.xmlsoft.org/ printf("Execution time: %f\n",deltaTime); FILE *filePointer; filePointer = fopen("fileLog1.txt","w"); //fprintf(filePointer, "<SimulationData>\n"); iterCount = 0; for (iterCount = 0; iterCount< ITERATION_COUNT; iterCount++) { //printf("Iteration: %d\n", iterCount); //fprintf(filePointer,"<Iter>\n"); fprintf(filePointer,"\n"); pC = 0; for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC) { //printf("Particle %d positions: ", pC ); // fprintf(filePointer,"<Obj>\n"); //printf("x= %f, y=%f, z=%f", fullSimilationData[iterCount].particleArray[pC].position[0], fullSimilationData[iterCount].particleArray[pC].position[1], fullSimilationData[iterCount].particleArray[pC].position[2]); //printf("\n"); /* fprintf(filePointer,"<PX>%f</PX>\n",fullSimilationData[iterCount].particleArray[pC].position[0]); fprintf(filePointer,"<PY>%f</PY>\n",fullSimilationData[iterCount].particleArray[pC].position[1]); fprintf(filePointer,"<PZ>%f</PZ>\n",fullSimilationData[iterCount].particleArray[pC].position[2]); */ fprintf(filePointer,"%f,",fullSimilationData[iterCount].particleArray[pC].position[0]); fprintf(filePointer,"%f,",fullSimilationData[iterCount].particleArray[pC].position[1]); fprintf(filePointer,"%f",fullSimilationData[iterCount].particleArray[pC].position[2]); fprintf(filePointer,"|"); //fprintf(filePointer,"</Obj>\n"); //fullSimilationData[fullDataCounter].particleArray[pC]= particle_Array_PPU[pC]; } //fprintf(filePointer,"</Iter>\n"); } //fprintf(filePointer, "</SimulationData>\n"); fclose(filePointer); return 0; }
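/* Scalar sketch of the octant classification used in the loops above (a
 * hypothetical helper, not part of the PPU/SPE code), assuming axisBitShiftMask
 * shifts the y and z lanes by 1 and 2 bits as the comments suggest.  Because
 * the per-axis bit is derived with vec_andc(oneVector, vec_cmpgt(...)), a set
 * bit marks the non-positive side of an axis; the sketch mirrors that convention. */
static inline unsigned int octant_id(const float pos[3])
{
    unsigned int x = (pos[0] > 0.0f) ? 0u : 1u;   /* andc(1, cmpgt) inverts the comparison */
    unsigned int y = (pos[1] > 0.0f) ? 0u : 1u;
    unsigned int z = (pos[2] > 0.0f) ? 0u : 1u;
    return x | (y << 1) | (z << 2);               /* vec_sl by the per-axis shift, then OR: id in 0..7 */
}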