Пример #1
0
int main ()
{
  *vecfloat++ = vec_andc(vecint[0], vecfloat[1]);
  *vecfloat++ = vec_andc(vecfloat[0], vecint[1]);
  *vecfloat++ = vec_vxor(vecint[0], vecfloat[1]);
  *vecfloat++ = vec_vxor(vecfloat[0], vecint[1]);
  *varpixel++ = vec_packpx(vecuint[0], vecuint[1]);
  *varpixel++ = vec_vpkpx(vecuint[0], vecuint[1]);
  *vecshort++ = vec_vmulosb(vecchar[0], vecchar[1]);
  *vecint++ = vec_ld(var_int[0], longp[1]);
  *vecint++ = vec_lde(var_int[0], longp[1]);
  *vecint++ = vec_ldl(var_int[0], longp[1]);
  *vecint++ = vec_lvewx(var_int[0], longp[1]);
  *vecint++ = vec_unpackh(vecshort[0]);
  *vecint++ = vec_unpackl(vecshort[0]);
  *vecushort++ = vec_andc(vecshort[0], vecushort[1]);
  *vecushort++ = vec_andc(vecushort[0], vecshort[1]);
  *vecushort++ = vec_vxor(vecshort[0], vecushort[1]);
  *vecushort++ = vec_vxor(vecushort[0], vecshort[1]);
  *vecuint++ = vec_ld(var_int[0], ulongp[1]);
  *vecuint++ = vec_lvx(var_int[0], ulongp[1]);
  *vecuint++ = vec_vmsumubm(vecuchar[0], vecuchar[1], vecuint[2]);
  *vecuchar++ = vec_xor(vecuchar[0], vecchar[1]);

  return 0;
}
Пример #2
0
void
gimp_composite_multiply_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx)
{
  const guchar *A = ctx->A;
  const guchar *B = ctx->B;
  guchar *D = ctx->D;
  guint length = ctx->n_pixels;
  vector unsigned char a,b,d,alpha_a,alpha_b,alpha;
  vector unsigned short al,ah;

  while (length >= 4)
    {
      a=LoadUnaligned(A);
      b=LoadUnaligned(B);

      al=vec_mule(a,b);
      al=vec_add(al,ox0080);
      ah=vec_mulo(a,b);
      ah=vec_add(ah,ox0080);
      al=vec_add(al,vec_sr(al,ox0008));
      ah=vec_add(ah,vec_sr(ah,ox0008));
      d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes);

      alpha_a=vec_and(a, alphamask);
      alpha_b=vec_and(b, alphamask);
      alpha=vec_min(alpha_a, alpha_b);

      d=vec_andc(d, alphamask);
      d=vec_or(d, alpha);

      StoreUnaligned(d, D);

      A+=16;
      B+=16;
      D+=16;
      length-=4;
    }
  /* process last pixels */
  length = length*4;
  a=LoadUnalignedLess(A, length);
  b=LoadUnalignedLess(B, length);

  al=vec_mule(a,b);
  al=vec_add(al,ox0080);
  ah=vec_mulo(a,b);
  ah=vec_add(ah,ox0080);
  al=vec_add(al,vec_sr(al,ox0008));
  ah=vec_add(ah,vec_sr(ah,ox0008));
  d=vec_perm((vector unsigned char)al,(vector unsigned char)ah,combine_high_bytes);

  alpha_a=vec_and(a, alphamask);
  alpha_b=vec_and(b, alphamask);
  alpha=vec_min(alpha_a, alpha_b);

  d=vec_andc(d, alphamask);
  d=vec_or(d, alpha);

  StoreUnalignedLess(d, D, length);
}
Пример #3
0
void
gimp_composite_difference_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx)
{
  const guchar *A = ctx->A;
  const guchar *B = ctx->B;
  guchar *D = ctx->D;
  guint length = ctx->n_pixels;
  vector unsigned char a,b,d,e,alpha_a,alpha_b;

  while (length >= 4)
    {
      a=LoadUnaligned(A);
      b=LoadUnaligned(B);

      alpha_a=vec_and(a, alphamask);
      alpha_b=vec_and(b, alphamask);
      d=vec_min(alpha_a, alpha_b);

      a=vec_andc(a, alphamask);
      a=vec_adds(a, d);
      b=vec_andc(b, alphamask);
      d=vec_subs(a, b);
      e=vec_subs(b, a);
      d=vec_add(d,e);

      StoreUnaligned(d, D);

      A+=16;
      B+=16;
      D+=16;
      length-=4;
    }
  /* process last pixels */
  length = length*4;
  a=LoadUnalignedLess(A, length);
  b=LoadUnalignedLess(B, length);

  alpha_a=vec_and(a,alphamask);
  alpha_b=vec_and(b,alphamask);
  d=vec_min(alpha_a,alpha_b);

  a=vec_andc(a,alphamask);
  a=vec_adds(a,d);
  b=vec_andc(b,alphamask);
  d=vec_subs(a,b);
  e=vec_subs(b, a);
  d=vec_add(d,e);

  StoreUnalignedLess(d, D, length);
}
Пример #4
0
vector signed int
test2_andc (vector signed int x, vector bool int y)
{
  vector signed int *foo;
  *foo += vec_andc (x, y);
  return *foo;
}
Пример #5
0
vector signed short
test2_andc (vector signed short x, vector bool short y)
{
  vector signed short *foo;
  *foo += vec_andc (x, y);
  return *foo;
}
Пример #6
0
vector unsigned short
test6_andc (vector unsigned short x, vector unsigned short y)
{
  vector unsigned short *foo;
  *foo += vec_andc (x, y);
  return *foo;
}
Пример #7
0
vector unsigned int
test6_andc (vector unsigned int x, vector unsigned int y)
{
  vector unsigned int *foo;
  *foo += vec_andc (x, y);
  return *foo;
}
Пример #8
0
void foo (vector bool long long *vblr,
	  vector double *vdr, vector unsigned long long *vullz,
	  vector double *vdz, vector bool char *vbcz,
	  vector signed char *vscz, vector unsigned char *vucz,
	  vector bool int *vbiz, vector int *viz,
	  vector unsigned int *vuiz, vector signed long long int *vslliz,
	  vector bool short int *vbsiz, vector signed short int *vssiz,
	  vector unsigned short int *vusiz, vector float *vfz)
{
  *vblr++ = vec_andc (vbla, vblb);
  *vdr++  = vec_double (vslla);
  *vdr++  = vec_double (vulla);

  *vblr++ = vec_mergeh (vbla, vblb);
  *vblr++ = vec_mergel (vbla, vblb);
  *vblr++ = vec_nor (vbla, vblb);
  *vblr++ = vec_or (vbla, vblb);
  *vblr++ = vec_sel (vbla, vblb, vblc);
  *vblr++ = vec_sel (vbla, vblb, vullc);
  *vblr++ = vec_xor (vbla, vblb);

  *vullz++ = vec_sel (vulla, vullb, vbllc);
  *vullz++ = vec_sel (vulla, vullb, vullc);

  *vdz++ = vec_sel(vda, vdb, vullc);

  *vbcz++ = vec_sel (vbca, vbcb, vbcc);
  *vbcz++ = vec_sel (vbca, vbcb, vucc);
  *vbcz++ = vec_xor (vbca, vbcb);
  *vscz++ = vec_sel (vsca, vscb, vbcc);
  *vscz++ = vec_sel (vsca, vscb, vucc);
  *vucz++ = vec_sel (vuca, vucb, vbcc);
  *vucz++ = vec_sel (vuca, vucb, vucc);

  *vbiz++ = vec_sel (vbia, vbib, vbic);
  *vbiz++ = vec_sel (vbia, vbib, vuic);
  *vbiz++ = vec_xor (vbia, vbib);
  *viz++ = vec_sel (vsia, vsib, vbic);
  *viz++ = vec_sel (vsia, vsib, vuic);
  *vuiz++ = vec_sel (vuia, vuib, vbic);
  *vuiz++ = vec_sel (vuia, vuib, vuic);

  *vslliz++ = vec_sel(vslla, vsllb, vbllc);
  *vslliz++ = vec_sel(vslla, vsllb, vullc);

  *vssiz++ = vec_sel(vssia, vssib, vbsic);
  *vssiz++ = vec_sel(vssia, vssib, vusic);
  *vusiz++ = vec_sel(vusia, vusib, vbsic);
  *vusiz++ = vec_sel(vusia, vusib, vusic);

  *vbsiz++ = vec_sel (vbsia, vbsib, vbsic);
  *vbsiz++ = vec_sel (vbsia, vbsib, vusic);
  *vbsiz++ = vec_xor (vbsia, vbsib);

  *vdz++ = vec_sel (vda, vdb, vbllc);
  *vfz++ = vec_sel (vfa, vfb, vbic);
  *vfz++ = vec_sel (vfa, vfb, vuic);
  *vfz++ = vec_xor (vfa, vfb);
}
Пример #9
0
void foo (vector bool long long *vblr,
	  vector double *vdr)
{
  *vblr++ = vec_andc (vbla, vblb);
  *vdr++  = vec_double (vsla);
  *vdr++  = vec_double (vula);
  *vblr++ = vec_mergeh (vbla, vblb);
  *vblr++ = vec_mergel (vbla, vblb);
  *vblr++ = vec_nor (vbla, vblb);
  *vblr++ = vec_or (vbla, vblb);
  *vblr++ = vec_sel (vbla, vblb, vblc);
  *vblr++ = vec_sel (vbla, vblb, vulc);
  *vblr++ = vec_xor (vbla, vblb);
}
Пример #10
0
static void vorbis_inverse_coupling_altivec(float *mag, float *ang,
                                            intptr_t blocksize)
{
    int i;
    vector float m, a;
    vector bool int t0, t1;
    const vector unsigned int v_31 = //XXX
        vec_add(vec_add(vec_splat_u32(15),vec_splat_u32(15)),vec_splat_u32(1));
    for (i = 0; i < blocksize; i += 4) {
        m = vec_ld(0, mag+i);
        a = vec_ld(0, ang+i);
        t0 = vec_cmple(m, (vector float)vec_splat_u32(0));
        t1 = vec_cmple(a, (vector float)vec_splat_u32(0));
        a = vec_xor(a, (vector float) vec_sl((vector unsigned int)t0, v_31));
        t0 = (vector bool int)vec_and(a, t1);
        t1 = (vector bool int)vec_andc(a, t1);
        a = vec_sub(m, (vector float)t1);
        m = vec_add(m, (vector float)t0);
        vec_stl(a, 0, ang+i);
        vec_stl(m, 0, mag+i);
    }
}
Пример #11
0
static vector unsigned int funny()
{
  vector unsigned int a;
  return vec_andc(vec_add(a,a),vec_add(a,a));
}
Пример #12
0
int main ()
{
  vector float fa = {1.0, 2.0, 3.0, -4.0};
  vector float fb = {-2.0, -3.0, -4.0, -5.0};
  vector float fc = vec_cpsgn (fa, fb);

  vector long long la = {5L, 14L};
  vector long long lb = {3L, 86L};
  vector long long lc = vec_and (la, lb);
  vector bool long long ld = {0, -1};
  vector long long le = vec_and (la, ld);
  vector long long lf = vec_and (ld, lb);

  vector unsigned long long ua = {5L, 14L};
  vector unsigned long long ub = {3L, 86L};
  vector unsigned long long uc = vec_and (ua, ub);
  vector bool long long ud = {0, -1};
  vector unsigned long long ue = vec_and (ua, ud);
  vector unsigned long long uf = vec_and (ud, ub);

  vector long long lg = vec_andc (la, lb);
  vector long long lh = vec_andc (la, ld);
  vector long long li = vec_andc (ld, lb);

  vector unsigned long long ug = vec_andc (ua, ub);
  vector unsigned long long uh = vec_andc (ua, ud);
  vector unsigned long long ui = vec_andc (ud, ub);

  vector double da = {1.0, -4.0};
  vector double db = {-2.0, 5.0};
  vector double dc = vec_cpsgn (da, db);

  vector long long lj = vec_mergeh (la, lb);
  vector long long lk = vec_mergeh (la, ld);
  vector long long ll = vec_mergeh (ld, la);

  vector unsigned long long uj = vec_mergeh (ua, ub);
  vector unsigned long long uk = vec_mergeh (ua, ud);
  vector unsigned long long ul = vec_mergeh (ud, ua);

  vector long long lm = vec_mergel (la, lb);
  vector long long ln = vec_mergel (la, ld);
  vector long long lo = vec_mergel (ld, la);

  vector unsigned long long um = vec_mergel (ua, ub);
  vector unsigned long long un = vec_mergel (ua, ud);
  vector unsigned long long uo = vec_mergel (ud, ua);

  vector long long lp = vec_nor (la, lb);
  vector long long lq = vec_nor (la, ld);
  vector long long lr = vec_nor (ld, la);

  vector unsigned long long up = vec_nor (ua, ub);
  vector unsigned long long uq = vec_nor (ua, ud);
  vector unsigned long long ur = vec_nor (ud, ua);

  vector long long ls = vec_or (la, lb);
  vector long long lt = vec_or (la, ld);
  vector long long lu = vec_or (ld, la);

  vector unsigned long long us = vec_or (ua, ub);
  vector unsigned long long ut = vec_or (ua, ud);
  vector unsigned long long uu = vec_or (ud, ua);

  vector unsigned char ca = {0,4,8,1,5,9,2,6,10,3,7,11,15,12,14,13};
  vector long long lv = vec_perm (la, lb, ca);
  vector unsigned long long uv = vec_perm (ua, ub, ca);

  vector long long lw = vec_sel (la, lb, lc);
  vector long long lx = vec_sel (la, lb, uc);
  vector long long ly = vec_sel (la, lb, ld);

  vector unsigned long long uw = vec_sel (ua, ub, lc);
  vector unsigned long long ux = vec_sel (ua, ub, uc);
  vector unsigned long long uy = vec_sel (ua, ub, ld);

  vector long long lz = vec_xor (la, lb);
  vector long long l0 = vec_xor (la, ld);
  vector long long l1 = vec_xor (ld, la);

  vector unsigned long long uz = vec_xor (ua, ub);
  vector unsigned long long u0 = vec_xor (ua, ud);
  vector unsigned long long u1 = vec_xor (ud, ua);

  int ia = vec_all_eq (ua, ub);
  int ib = vec_all_ge (ua, ub);
  int ic = vec_all_gt (ua, ub);
  int id = vec_all_le (ua, ub);
  int ie = vec_all_lt (ua, ub);
  int ig = vec_all_ne (ua, ub);

  int ih = vec_any_eq (ua, ub);
  int ii = vec_any_ge (ua, ub);
  int ij = vec_any_gt (ua, ub);
  int ik = vec_any_le (ua, ub);
  int il = vec_any_lt (ua, ub);
  int im = vec_any_ne (ua, ub);

  vector int sia = {9, 16, 25, 36};
  vector int sib = {-8, -27, -64, -125};
  vector int sic = vec_mergee (sia, sib);
  vector int sid = vec_mergeo (sia, sib);

  vector unsigned int uia = {9, 16, 25, 36};
  vector unsigned int uib = {8, 27, 64, 125};
  vector unsigned int uic = vec_mergee (uia, uib);
  vector unsigned int uid = vec_mergeo (uia, uib);

  vector bool int bia = {0, -1, -1, 0};
  vector bool int bib = {-1, -1, 0, -1};
  vector bool int bic = vec_mergee (bia, bib);
  vector bool int bid = vec_mergeo (bia, bib);

  vector unsigned int uie = vec_packsu (ua, ub);

  vector long long l2 = vec_cntlz (la);
  vector unsigned long long u2 = vec_cntlz (ua);
  vector int sie = vec_cntlz (sia);
  vector unsigned int uif = vec_cntlz (uia);
  vector short ssa = {20, -40, -60, 80, 100, -120, -140, 160};
  vector short ssb = vec_cntlz (ssa);
  vector unsigned short usa = {81, 72, 63, 54, 45, 36, 27, 18};
  vector unsigned short usb = vec_cntlz (usa);
  vector signed char sca = {-4, 3, -9, 15, -31, 31, 0, 0,
		            1, 117, -36, 99, 98, 97, 96, 95};
  vector signed char scb = vec_cntlz (sca);
  vector unsigned char cb = vec_cntlz (ca);

  vector double dd = vec_xl (0, &y);
  vec_xst (dd, 0, &z);

  vector double de = vec_round (dd);

  vector double df = vec_splat (de, 0);
  vector double dg = vec_splat (de, 1);
  vector long long l3 = vec_splat (l2, 0);
  vector long long l4 = vec_splat (l2, 1);
  vector unsigned long long u3 = vec_splat (u2, 0);
  vector unsigned long long u4 = vec_splat (u2, 1);
  vector bool long long l5 = vec_splat (ld, 0);
  vector bool long long l6 = vec_splat (ld, 1);

  vector long long l7 = vec_div (l3, l4);
  vector unsigned long long u5 = vec_div (u3, u4);

  vector long long l8 = vec_mul (l3, l4);
  vector unsigned long long u6 = vec_mul (u3, u4);

  vector double dh = vec_ctf (la, -2);
  vector double di = vec_ctf (ua, 2);
  vector long long l9 = vec_cts (dh, -2);
  vector unsigned long long u7 = vec_ctu (di, 2);

  return 0;
}
Пример #13
0
void test1() {
// CHECK-LABEL: define void @test1
// CHECK-LE-LABEL: define void @test1

  res_vf = vec_abs(vf);
// CHECK: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x float> @llvm.fabs.v4f32(<4 x float> %{{[0-9]*}})

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_add(vd, vd);
// CHECK: fadd <2 x double>
// CHECK-LE: fadd <2 x double>

  res_vd = vec_and(vbll, vd);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  res_vd = vec_and(vd, vbll);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  res_vd = vec_and(vd, vd);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_andc(vbll, vd);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_andc(vd, vbll);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>
// CHECK-LE: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK-LE: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()

  res_vd = vec_andc(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_ceil(vd);
// CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}})

  res_vf = vec_ceil(vf);
// CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpeq(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpeq(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpge(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpge(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpgt(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpgt(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmple(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmple(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmplt(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})
// CHECK-LE: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmplt(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})
// CHECK-LE: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  /* vec_cpsgn */
  res_vf = vec_cpsgn(vf, vf);
// CHECK: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}})
// CHECK-LE: call <4 x float> @llvm.copysign.v4f32(<4 x float> %{{.+}}, <4 x float> %{{.+}})

  res_vd = vec_cpsgn(vd, vd);
// CHECK: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}})
// CHECK-LE: call <2 x double> @llvm.copysign.v2f64(<2 x double> %{{.+}}, <2 x double> %{{.+}})

  /* vec_div */
  res_vsll = vec_div(vsll, vsll);
// CHECK: sdiv <2 x i64>
// CHECK-LE: sdiv <2 x i64>

  res_vull = vec_div(vull, vull);
// CHECK: udiv <2 x i64>
// CHECK-LE: udiv <2 x i64>

  res_vf = vec_div(vf, vf);
// CHECK: fdiv <4 x float>
// CHECK-LE: fdiv <4 x float>

  res_vd = vec_div(vd, vd);
// CHECK: fdiv <2 x double>
// CHECK-LE: fdiv <2 x double>

  /* vec_max */
  res_vf = vec_max(vf, vf);
// CHECK: @llvm.ppc.vsx.xvmaxsp
// CHECK-LE: @llvm.ppc.vsx.xvmaxsp

  res_vd = vec_max(vd, vd);
// CHECK: @llvm.ppc.vsx.xvmaxdp
// CHECK-LE: @llvm.ppc.vsx.xvmaxdp

  res_vf = vec_vmaxfp(vf, vf);
// CHECK: @llvm.ppc.vsx.xvmaxsp
// CHECK-LE: @llvm.ppc.vsx.xvmaxsp

  /* vec_min */
  res_vf = vec_min(vf, vf);
// CHECK: @llvm.ppc.vsx.xvminsp
// CHECK-LE: @llvm.ppc.vsx.xvminsp

  res_vd = vec_min(vd, vd);
// CHECK: @llvm.ppc.vsx.xvmindp
// CHECK-LE: @llvm.ppc.vsx.xvmindp

  res_vf = vec_vminfp(vf, vf);
// CHECK: @llvm.ppc.vsx.xvminsp
// CHECK-LE: @llvm.ppc.vsx.xvminsp

  res_d = __builtin_vsx_xsmaxdp(d, d);
// CHECK: @llvm.ppc.vsx.xsmaxdp
// CHECK-LE: @llvm.ppc.vsx.xsmaxdp

  res_d = __builtin_vsx_xsmindp(d, d);
// CHECK: @llvm.ppc.vsx.xsmindp
// CHECK-LE: @llvm.ppc.vsx.xsmindp

  /* vec_perm */
  res_vsll = vec_perm(vsll, vsll, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_perm(vull, vull, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vbll = vec_perm(vbll, vbll, vuc);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vf = vec_round(vf);
// CHECK: call <4 x float> @llvm.round.v4f32(<4 x float>
// CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float>

  res_vd = vec_round(vd);
// CHECK: call <2 x double> @llvm.round.v2f64(<2 x double>
// CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double>

  res_vd = vec_perm(vd, vd, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vd = vec_splat(vd, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x double> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vbll = vec_splat(vbll, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vsll =  vec_splat(vsll, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vull =  vec_splat(vull, 1);
// CHECK: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>
// CHECK-LE: xor <16 x i8>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: [[T2:%.+]] = bitcast <2 x i64> {{.+}} to <4 x i32>
// CHECK-LE: call <4 x i32> @llvm.ppc.altivec.vperm(<4 x i32> [[T1]], <4 x i32> [[T2]], <16 x i8>

  res_vsi = vec_pack(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vui = vec_pack(vull, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vbi = vec_pack(vbll, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_vperm(vsll, vsll, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_vperm(vull, vull, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vd = vec_vperm(vd, vd, vuc);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  /* vec_vsx_ld */

  res_vsi = vec_vsx_ld(0, &vsi);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vui = vec_vsx_ld(0, &vui);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vf = vec_vsx_ld (0, &vf);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vsll = vec_vsx_ld(0, &vsll);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vull = vec_vsx_ld(0, &vull);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vd = vec_vsx_ld(0, &vd);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vull = vec_vsx_ld(0, &vull);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vd = vec_vsx_ld(0, &vd);
// CHECK: @llvm.ppc.vsx.lxvd2x
// CHECK-LE: @llvm.ppc.vsx.lxvd2x

  res_vss = vec_vsx_ld(0, &vss);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vss = vec_vsx_ld(0, &ss);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vus = vec_vsx_ld(0, &vus);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vus = vec_vsx_ld(0, &us);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vbc = vec_vsx_ld(0, &vbc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vsc = vec_vsx_ld(0, &vsc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vuc = vec_vsx_ld(0, &vuc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vsc = vec_vsx_ld(0, &sc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  res_vuc = vec_vsx_ld(0, &uc);
// CHECK: @llvm.ppc.vsx.lxvw4x
// CHECK-LE: @llvm.ppc.vsx.lxvw4x

  /* vec_vsx_st */

  vec_vsx_st(vsi, 0, &res_vsi);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsi, 0, &res_si);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vui, 0, &res_vui);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vui, 0, &res_ui);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vf, 0, &res_vf);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsll, 0, &res_vsll);
// CHECK: @llvm.ppc.vsx.stxvd2x
// CHECK-LE: @llvm.ppc.vsx.stxvd2x

  vec_vsx_st(vull, 0, &res_vull);
// CHECK: @llvm.ppc.vsx.stxvd2x
// CHECK-LE: @llvm.ppc.vsx.stxvd2x

  vec_vsx_st(vd, 0, &res_vd);
// CHECK: @llvm.ppc.vsx.stxvd2x
// CHECK-LE: @llvm.ppc.vsx.stxvd2x

  vec_vsx_st(vss, 0, &res_vss);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vss, 0, &res_ss);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vus, 0, &res_vus);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vus, 0, &res_us);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsc, 0, &res_vsc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsc, 0, &res_sc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vuc, 0, &res_vuc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vuc, 0, &res_uc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vbc, 0, &res_vbc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vbc, 0, &res_sc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vbc, 0, &res_uc);
// CHECK: @llvm.ppc.vsx.stxvw4x
// CHECK-LE: @llvm.ppc.vsx.stxvw4x

  /* vec_and */
  res_vsll = vec_and(vsll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_and(vbll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_and(vsll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_and(vull, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_and(vbll, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_and(vull, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vbll = vec_and(vbll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  /* vec_vand */
  res_vsll = vec_vand(vsll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_vand(vbll, vsll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_vand(vsll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_vand(vull, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_vand(vbll, vull);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_vand(vull, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vbll = vec_vand(vbll, vbll);
// CHECK: and <2 x i64>
// CHECK-LE: and <2 x i64>

  /* vec_andc */
  res_vsll = vec_andc(vsll, vsll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_andc(vbll, vsll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vsll = vec_andc(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_andc(vull, vull);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_andc(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vull = vec_andc(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vbll = vec_andc(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>
// CHECK-LE: xor <2 x i64>
// CHECK-LE: and <2 x i64>

  res_vf = vec_floor(vf);
// CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_floor(vd);
// CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_madd(vf, vf, vf);
// CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})

  res_vd = vec_madd(vd, vd, vd);
// CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})

  /* vec_mergeh */
  res_vsll = vec_mergeh(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergeh(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergeh(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergeh(vull, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergeh(vull, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergeh(vbll, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  /* vec_mergel */
  res_vsll = vec_mergel(vsll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergel(vsll, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vsll = vec_mergel(vbll, vsll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergel(vull, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergel(vull, vbll);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  res_vull = vec_mergel(vbll, vull);
// CHECK: @llvm.ppc.altivec.vperm
// CHECK-LE: @llvm.ppc.altivec.vperm

  /* vec_msub */
  res_vf = vec_msub(vf, vf, vf);
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
// CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>

  res_vd = vec_msub(vd, vd, vd);
// CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
// CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>

  res_vsll = vec_mul(vsll, vsll);
// CHECK: mul <2 x i64>
// CHECK-LE: mul <2 x i64>

  res_vull = vec_mul(vull, vull);
// CHECK: mul <2 x i64>
// CHECK-LE: mul <2 x i64>

  res_vf = vec_mul(vf, vf);
// CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}}

  res_vd = vec_mul(vd, vd);
// CHECK: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}}

  res_vf = vec_nearbyint(vf);
// CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_nearbyint(vd);
// CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_nmadd(vf, vf, vf);
// CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})
// CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]]
// CHECK-LE: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})
// CHECK-LE-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]]

  res_vd = vec_nmadd(vd, vd, vd);
// CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})
// CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]
// CHECK-LE: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})
// CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]

  res_vf = vec_nmsub(vf, vf, vf);
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
// CHECK-LE: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}

  res_vd = vec_nmsub(vd, vd, vd);
// CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
// CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]
// CHECK-LE: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-LE-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
// CHECK-LE-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]

  /* vec_nor */
  res_vsll = vec_nor(vsll, vsll);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>
// CHECK-LE: or <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_nor(vull, vull);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>
// CHECK-LE: or <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_nor(vbll, vbll);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>
// CHECK-LE: or <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vd = vec_nor(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1>
// CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1>

  /* vec_or */
  res_vsll = vec_or(vsll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_or(vbll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_or(vsll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_or(vull, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_or(vbll, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_or(vull, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vbll = vec_or(vbll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vd = vec_or(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}

  res_vd = vec_or(vbll, vd);
// CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]]
// CHECK: bitcast <2 x i64> [[T2]] to <2 x double>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: [[T2:%.+]] = or <2 x i64> %{{[0-9]+}}, [[T1]]
// CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double>

  res_vd = vec_or(vd, vbll);
// CHECK: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[T2]] to <2 x double>
// CHECK-LE: [[T1:%.+]] = bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK-LE: [[T2:%.+]] = or <2 x i64> [[T1]], %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[T2]] to <2 x double>

  res_vf = vec_re(vf);
// CHECK: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float>
// CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvresp(<4 x float>

  res_vd = vec_re(vd);
// CHECK: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double>
// CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvredp(<2 x double>

  res_vf = vec_rint(vf);
// CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_rint(vd);
// CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_rsqrte(vf);
// CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}})

  res_vd = vec_rsqrte(vd);
// CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}})

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vf = vec_sel(vd, vd, vbll);
// CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64> %{{[0-9]+}},
// CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: or <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>
// CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64> %{{[0-9]+}},
// CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: or <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_sel(vd, vd, vull);
// CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64> %{{[0-9]+}},
// CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: or <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>
// CHECK-LE: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK-LE: and <2 x i64> %{{[0-9]+}},
// CHECK-LE: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: or <2 x i64>
// CHECK-LE: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>

  res_vf = vec_sqrt(vf);
// CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_sqrt(vd);
// CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}})

  res_vd = vec_sub(vd, vd);
// CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}}

  res_vf = vec_trunc(vf);
// CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}})
// CHECK-LE: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_trunc(vd);
// CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}})
// CHECK-LE: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}})

  /* vec_vor */
  res_vsll = vec_vor(vsll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_vor(vbll, vsll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vsll = vec_vor(vsll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_vor(vull, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_vor(vbll, vull);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vull = vec_vor(vull, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  res_vbll = vec_vor(vbll, vbll);
// CHECK: or <2 x i64>
// CHECK-LE: or <2 x i64>

  /* vec_xor */
  res_vsll = vec_xor(vsll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_xor(vbll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_xor(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_xor(vull, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_xor(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_xor(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vbll = vec_xor(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vd, vd);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vd, vbll);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  dummy();
// CHECK: call void @dummy()
// CHECK-LE: call void @dummy()

  res_vd = vec_xor(vbll, vd);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>
// CHECK-LE: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-LE: bitcast <2 x i64> [[X1]] to <2 x double>

  /* vec_vxor */
  res_vsll = vec_vxor(vsll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_vxor(vbll, vsll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_vxor(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_vxor(vull, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_vxor(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vull = vec_vxor(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vbll = vec_vxor(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK-LE: xor <2 x i64>

  res_vsll = vec_cts(vd, 0);
// CHECK: fmul <2 x double>
// CHECK: fptosi <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64>

  res_vsll = vec_cts(vd, 31);
// CHECK: fmul <2 x double>
// CHECK: fptosi <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptosi <2 x double> %{{.*}} to <2 x i64>

  res_vsll = vec_ctu(vd, 0);
// CHECK: fmul <2 x double>
// CHECK: fptoui <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64>

  res_vsll = vec_ctu(vd, 31);
// CHECK: fmul <2 x double>
// CHECK: fptoui <2 x double> %{{.*}} to <2 x i64>
// CHECK-LE: fmul <2 x double>
// CHECK-LE: fptoui <2 x double> %{{.*}} to <2 x i64>

  res_vd = vec_ctf(vsll, 0);
// CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>

  res_vd = vec_ctf(vsll, 31);
// CHECK: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: sitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>

  res_vd = vec_ctf(vull, 0);
// CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>

  res_vd = vec_ctf(vull, 31);
// CHECK: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK: fmul <2 x double>
// CHECK-LE: uitofp <2 x i64> %{{.*}} to <2 x double>
// CHECK-LE: fmul <2 x double>
}
Пример #14
0
void iquant_intra_m1_altivec(IQUANT_INTRA_PDECL)
{
    int i;
    vector signed short vsrc;
    uint16_t *qmat;
    vector unsigned short vqmat;
    vector unsigned short vmquant;
    vector bool short eqzero, ltzero;
    vector signed short val, t0;
    vector signed short zero, one;
    vector unsigned int four;
    vector signed short min, max;
    int offset, offset2;
    int16_t dst0;
    union {
	vector unsigned short vu16;
	unsigned short mquant;
	vector signed int vs32;
	struct {
	    signed int pad[3];
	    signed int sum;
	} s;
    } vu;
#ifdef ALTIVEC_DST
    DataStreamControl dsc;
#endif

#ifdef ALTIVEC_VERIFY /* {{{ */
    if (NOT_VECTOR_ALIGNED(wsp->intra_q_mat))
	mjpeg_error_exit1("iquant_intra_m1: wsp->intra_q_mat %% 16 != 0, (%d)",
	    wsp->intra_q_mat);

    if (NOT_VECTOR_ALIGNED(src))
	mjpeg_error_exit1("iquant_intra_m1: src %% 16 != 0, (%d)", src);

    if (NOT_VECTOR_ALIGNED(dst))
	mjpeg_error_exit1("iquant_intra_m1: dst %% 16 != 0, (%d)", dst);

    for (i = 0; i < 64; i++)
	if (src[i] < -256 || src[i] > 255)
	    mjpeg_error_exit1("iquant_intra_m2: -256 > src[%i] > 255, (%d)",
		i, src[i]);
#endif /* }}} */

    AMBER_START;

    dst0 = src[0] << (3 - dc_prec);

    qmat = (uint16_t*)wsp->intra_q_mat;

#ifdef ALTIVEC_DST
    dsc.control = DATA_STREAM_CONTROL(64/8,1,0);
    vec_dst(src, dsc.control, 0);
    vec_dst(qmat, dsc.control, 1);
#endif

    /* vmquant = (vector unsigned short)(mquant); */
    vu.mquant = (unsigned short)mquant;
    vmquant = vec_splat(vu.vu16, 0);

    zero = vec_splat_s16(0);
    one = vec_splat_s16(1);
    four = vec_splat_u32(4);
    /* max = (2047); min = (-2048); {{{ */
    vu8(max) = vec_splat_u8(0x7);
    t0 = vec_splat_s16(-1); /* 0xffff */
    vu8(max) = vec_mergeh(vu8(max), vu8(t0)); /* 0x07ff == 2047 */
    min = vec_sub(t0, max);
    /* }}} */
    offset = 0;

#if 1
    vsrc = vec_ld(offset, (signed short*)src);
    vqmat = vec_ld(offset, (unsigned short*)qmat);
    i = (64/8) - 1;
    do {
	/* intra_q[i] * mquant */
	vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

	/* save sign */
	ltzero = vec_cmplt(vsrc, zero);
	eqzero = vec_cmpeq(vsrc, zero);

	/* val = abs(src) */
	t0 = vec_sub(zero, vsrc);
	val = vec_max(t0, vsrc);

	/* val = (src * quant) >> 4 */
	vs32(t0) = vec_mule(val, vs16(vqmat));
	vs32(val) = vec_mulo(val, vs16(vqmat));
	vs32(t0) = vec_sra(vs32(t0), four);
	vs16(t0) = vec_pack(vs32(t0), vs32(t0));
	vs32(val) = vec_sra(vs32(val), four);
	vs16(val) = vec_pack(vs32(val), vs32(val));
	val = vec_mergeh(vs16(t0), vs16(val));

	offset2 = offset;
	offset += 8*sizeof(int16_t);
	vsrc = vec_ld(offset, (signed short*)src);
	vqmat = vec_ld(offset, (unsigned short*)qmat);

	/* val = val - 1&~(val|val==0) */
	t0 = vec_or(val, eqzero);
	t0 = vec_andc(one, t0);
	val = vec_sub(val, t0);

	/* restore sign */
	t0 = vec_sub(zero, val);
	val = vec_sel(val, t0, ltzero);

	/* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
	val = vec_min(val, max);
	val = vec_max(val, min);

	vec_st(val, offset2, dst);
    } while (--i);
    /* intra_q[i] * mquant */
    vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

    /* save sign */
    ltzero = vec_cmplt(vsrc, zero);
    eqzero = vec_cmpeq(vsrc, zero);

    /* val = abs(src) */
    t0 = vec_sub(zero, vsrc);
    val = vec_max(t0, vsrc);

    /* val = (src * quant) >> 4 */
    vs32(t0) = vec_mule(val, vs16(vqmat));
    vs32(val) = vec_mulo(val, vs16(vqmat));
    vs32(t0) = vec_sra(vs32(t0), four);
    vs16(t0) = vec_pack(vs32(t0), vs32(t0));
    vs32(val) = vec_sra(vs32(val), four);
    vs16(val) = vec_pack(vs32(val), vs32(val));
    val = vec_mergeh(vs16(t0), vs16(val));

    /* val = val - 1&~(val|val==0) */
    t0 = vec_or(val, eqzero);
    t0 = vec_andc(one, t0);
    val = vec_sub(val, t0);

    /* restore sign */
    t0 = vec_sub(zero, val);
    val = vec_sel(val, t0, ltzero);

    /* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
    val = vec_min(val, max);
    val = vec_max(val, min);

    vec_st(val, offset, dst);
#else
    /* {{{ */
    i = (64/8);
    do {
	vsrc = vec_ld(offset, (signed short*)src);
	vqmat = vec_ld(offset, (unsigned short*)qmat);

	/* intra_q[i] * mquant */
	vu16(vqmat) = vec_mulo(vu8(vqmat), vu8(vmquant));

	/* save sign */
	ltzero = vec_cmplt(vsrc, zero);
	eqzero = vec_cmpeq(vsrc, zero);

	/* val = abs(src) */
	t0 = vec_sub(zero, vsrc);
	val = vec_max(t0, vsrc);

	/* val = (src * quant) >> 4 */
	vs32(t0) = vec_mule(val, vs16(vqmat));
	vs32(val) = vec_mulo(val, vs16(vqmat));
	vs32(t0) = vec_sra(vs32(t0), four);
	vs16(t0) = vec_pack(vs32(t0), vs32(t0));
	vs32(val) = vec_sra(vs32(val), four);
	vs16(val) = vec_pack(vs32(val), vs32(val));
	val = vec_mergeh(vs16(t0), vs16(val));

	/* val = val - 1&~(val|val==0) */
	t0 = vec_or(val, eqzero);
	t0 = vec_andc(one, t0);
	val = vec_sub(val, t0);

	/* restore sign */
	t0 = vec_sub(zero, val);
	val = vec_sel(val, t0, ltzero);

	/* val = (val > 2047) ? ((val < -2048) ? -2048 : val); */
	val = vec_min(val, max);
	val = vec_max(val, min);

	vec_st(val, offset, dst);

	offset += 8*sizeof(int16_t);
    } while (--i);
    /* }}} */
#endif

    dst[0] = dst0;

    AMBER_STOP;
}
Пример #15
0
void test1() {
// CHECK-LABEL: define void @test1

  res_vd = vec_add(vd, vd);
// CHECK: fadd <2 x double>

  res_vd = vec_and(vbll, vd);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  res_vd = vec_and(vd, vbll);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  res_vd = vec_and(vd, vd);
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()

  res_vd = vec_andc(vbll, vd);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()

  res_vd = vec_andc(vd, vbll);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()

  res_vd = vec_andc(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]*}} to <2 x i64>
// CHECK: xor <2 x i64> %{{[0-9]*}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]*}} to <2 x double>

  dummy();
// CHECK: call void @dummy()

  res_vd = vec_ceil(vd);
// CHECK: call <2 x double> @llvm.ceil.v2f64(<2 x double> %{{[0-9]*}})

  res_vf = vec_ceil(vf);
// CHECK: call <4 x float> @llvm.ceil.v4f32(<4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpeq(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpeqdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpeq(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpeqsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpge(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpge(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmpgt(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmpgt(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmple(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgedp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmple(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgesp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  res_vbll = vec_cmplt(vd, vd);
// CHECK: call <2 x i64> @llvm.ppc.vsx.xvcmpgtdp(<2 x double> %{{[0-9]*}}, <2 x double> %{{[0-9]*}})

  res_vbi = vec_cmplt(vf, vf);
// CHECK: call <4 x i32> @llvm.ppc.vsx.xvcmpgtsp(<4 x float> %{{[0-9]*}}, <4 x float> %{{[0-9]*}})

  /* vec_div */
  res_vf = vec_div(vf, vf);
// CHECK: @llvm.ppc.vsx.xvdivsp

  res_vd = vec_div(vd, vd);
// CHECK: @llvm.ppc.vsx.xvdivdp

  /* vec_max */
  res_vf = vec_max(vf, vf);
// CHECK: @llvm.ppc.vsx.xvmaxsp

  res_vd = vec_max(vd, vd);
// CHECK: @llvm.ppc.vsx.xvmaxdp

  res_vf = vec_vmaxfp(vf, vf);
// CHECK: @llvm.ppc.vsx.xvmaxsp

  /* vec_min */
  res_vf = vec_min(vf, vf);
// CHECK: @llvm.ppc.vsx.xvminsp

  res_vd = vec_min(vd, vd);
// CHECK: @llvm.ppc.vsx.xvmindp

  res_vf = vec_vminfp(vf, vf);
// CHECK: @llvm.ppc.vsx.xvminsp

  res_d = __builtin_vsx_xsmaxdp(d, d);
// CHECK: @llvm.ppc.vsx.xsmaxdp

  res_d = __builtin_vsx_xsmindp(d, d);
// CHECK: @llvm.ppc.vsx.xsmindp

  /* vec_perm */
  res_vsll = vec_perm(vsll, vsll, vuc);
// CHECK: @llvm.ppc.altivec.vperm

  res_vull = vec_perm(vull, vull, vuc);
// CHECK: @llvm.ppc.altivec.vperm

  res_vd = vec_perm(vd, vd, vuc);
// CHECK: @llvm.ppc.altivec.vperm

  res_vsll = vec_vperm(vsll, vsll, vuc);
// CHECK: @llvm.ppc.altivec.vperm

  res_vull = vec_vperm(vull, vull, vuc);
// CHECK: @llvm.ppc.altivec.vperm

  res_vd = vec_vperm(vd, vd, vuc);
// CHECK: @llvm.ppc.altivec.vperm

  /* vec_vsx_ld */

  res_vsi = vec_vsx_ld(0, &vsi);
// CHECK: @llvm.ppc.vsx.lxvw4x

  res_vui = vec_vsx_ld(0, &vui);
// CHECK: @llvm.ppc.vsx.lxvw4x

  res_vf = vec_vsx_ld (0, &vf);
// CHECK: @llvm.ppc.vsx.lxvw4x

  res_vsll = vec_vsx_ld(0, &vsll);
// CHECK: @llvm.ppc.vsx.lxvd2x

  res_vull = vec_vsx_ld(0, &vull);
// CHECK: @llvm.ppc.vsx.lxvd2x

  res_vd = vec_vsx_ld(0, &vd);
// CHECK: @llvm.ppc.vsx.lxvd2x

  /* vec_vsx_st */

  vec_vsx_st(vsi, 0, &res_vsi);
// CHECK: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vui, 0, &res_vui);
// CHECK: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vf, 0, &res_vf);
// CHECK: @llvm.ppc.vsx.stxvw4x

  vec_vsx_st(vsll, 0, &res_vsll);
// CHECK: @llvm.ppc.vsx.stxvd2x

  vec_vsx_st(vull, 0, &res_vull);
// CHECK: @llvm.ppc.vsx.stxvd2x

  vec_vsx_st(vd, 0, &res_vd);
// CHECK: @llvm.ppc.vsx.stxvd2x

  /* vec_and */
  res_vsll = vec_and(vsll, vsll);
// CHECK: and <2 x i64>

  res_vsll = vec_and(vbll, vsll);
// CHECK: and <2 x i64>

  res_vsll = vec_and(vsll, vbll);
// CHECK: and <2 x i64>

  res_vull = vec_and(vull, vull);
// CHECK: and <2 x i64>

  res_vull = vec_and(vbll, vull);
// CHECK: and <2 x i64>

  res_vull = vec_and(vull, vbll);
// CHECK: and <2 x i64>

  res_vbll = vec_and(vbll, vbll);
// CHECK: and <2 x i64>

  /* vec_vand */
  res_vsll = vec_vand(vsll, vsll);
// CHECK: and <2 x i64>

  res_vsll = vec_vand(vbll, vsll);
// CHECK: and <2 x i64>

  res_vsll = vec_vand(vsll, vbll);
// CHECK: and <2 x i64>

  res_vull = vec_vand(vull, vull);
// CHECK: and <2 x i64>

  res_vull = vec_vand(vbll, vull);
// CHECK: and <2 x i64>

  res_vull = vec_vand(vull, vbll);
// CHECK: and <2 x i64>

  res_vbll = vec_vand(vbll, vbll);
// CHECK: and <2 x i64>

  /* vec_andc */
  res_vsll = vec_andc(vsll, vsll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>

  res_vsll = vec_andc(vbll, vsll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>

  res_vsll = vec_andc(vsll, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>

  res_vull = vec_andc(vull, vull);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>

  res_vull = vec_andc(vbll, vull);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>

  res_vull = vec_andc(vull, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>

  res_vbll = vec_andc(vbll, vbll);
// CHECK: xor <2 x i64>
// CHECK: and <2 x i64>

  res_vf = vec_floor(vf);
// CHECK: call <4 x float> @llvm.floor.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_floor(vd);
// CHECK: call <2 x double> @llvm.floor.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_madd(vf, vf, vf);
// CHECK: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})

  res_vd = vec_madd(vd, vd, vd);
// CHECK: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})

  res_vf = vec_msub(vf, vf, vf);
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>

  res_vd = vec_msub(vd, vd, vd);
// CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>

  res_vf = vec_mul(vf, vf);
// CHECK: fmul <4 x float> %{{[0-9]+}}, %{{[0-9]+}}

  res_vd = vec_mul(vd, vd);
// CHECK: fmul <2 x double> %{{[0-9]+}}, %{{[0-9]+}}

  res_vf = vec_nearbyint(vf);
// CHECK: call <4 x float> @llvm.round.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_nearbyint(vd);
// CHECK: call <2 x double> @llvm.round.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_nmadd(vf, vf, vf);
// CHECK: [[FM:[0-9]+]] = call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}})
// CHECK-NEXT: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %[[FM]]

  res_vd = vec_nmadd(vd, vd, vd);
// CHECK: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}})
// CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]

  res_vf = vec_nmsub(vf, vf, vf);
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: call <4 x float> @llvm.fma.v4f32(<4 x float> %{{[0-9]+}}, <4 x float> %{{[0-9]+}}, <4 x float>
// CHECK: fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %{{[0-9]+}}

  res_vd = vec_nmsub(vd, vd, vd);
// CHECK: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %{{[0-9]+}}
// CHECK-NEXT: [[FM:[0-9]+]] = call <2 x double> @llvm.fma.v2f64(<2 x double> %{{[0-9]+}}, <2 x double> %{{[0-9]+}}, <2 x double>
// CHECK-NEXT: fsub <2 x double> <double -0.000000e+00, double -0.000000e+00>, %[[FM]]

  /* vec_nor */
  res_vsll = vec_nor(vsll, vsll);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>

  res_vull = vec_nor(vull, vull);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>

  res_vull = vec_nor(vbll, vbll);
// CHECK: or <2 x i64>
// CHECK: xor <2 x i64>

  res_vd = vec_nor(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: [[OR:%.+]] = or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK-NEXT: xor <2 x i64> [[OR]], <i64 -1, i64 -1>

  /* vec_or */
  res_vsll = vec_or(vsll, vsll);
// CHECK: or <2 x i64>

  res_vsll = vec_or(vbll, vsll);
// CHECK: or <2 x i64>

  res_vsll = vec_or(vsll, vbll);
// CHECK: or <2 x i64>

  res_vull = vec_or(vull, vull);
// CHECK: or <2 x i64>

  res_vull = vec_or(vbll, vull);
// CHECK: or <2 x i64>

  res_vull = vec_or(vull, vbll);
// CHECK: or <2 x i64>

  res_vbll = vec_or(vbll, vbll);
// CHECK: or <2 x i64>

  res_vd = vec_or(vd, vd);
// CHECK: bitcast <2 x double> %{{[0-9]+}} to <2 x i64>
// CHECK: or <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}

  res_vf = vec_rint(vf);
// CHECK: call <4 x float> @llvm.nearbyint.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_rint(vd);
// CHECK: call <2 x double> @llvm.nearbyint.v2f64(<2 x double> %{{[0-9]+}})

  res_vf = vec_rsqrte(vf);
// CHECK: call <4 x float> @llvm.ppc.vsx.xvrsqrtesp(<4 x float> %{{[0-9]+}})

  res_vd = vec_rsqrte(vd);
// CHECK: call <2 x double> @llvm.ppc.vsx.xvrsqrtedp(<2 x double> %{{[0-9]+}})

  dummy();
// CHECK: call void @dummy()

  res_vf = vec_sel(vd, vd, vbll);
// CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64> %{{[0-9]+}},
// CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: or <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>

  dummy();
// CHECK: call void @dummy()

  res_vd = vec_sel(vd, vd, vull);
// CHECK: xor <2 x i64> %{{[0-9]+}}, <i64 -1, i64 -1>
// CHECK: and <2 x i64> %{{[0-9]+}},
// CHECK: and <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: or <2 x i64>
// CHECK: bitcast <2 x i64> %{{[0-9]+}} to <2 x double>

  res_vf = vec_sqrt(vf);
// CHECK: call <4 x float> @llvm.sqrt.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_sqrt(vd);
// CHECK: call <2 x double> @llvm.sqrt.v2f64(<2 x double> %{{[0-9]+}})

  res_vd = vec_sub(vd, vd);
// CHECK: fsub <2 x double> %{{[0-9]+}}, %{{[0-9]+}}

  res_vf = vec_trunc(vf);
// CHECK: call <4 x float> @llvm.trunc.v4f32(<4 x float> %{{[0-9]+}})

  res_vd = vec_trunc(vd);
// CHECK: call <2 x double> @llvm.trunc.v2f64(<2 x double> %{{[0-9]+}})

  /* vec_vor */
  res_vsll = vec_vor(vsll, vsll);
// CHECK: or <2 x i64>

  res_vsll = vec_vor(vbll, vsll);
// CHECK: or <2 x i64>

  res_vsll = vec_vor(vsll, vbll);
// CHECK: or <2 x i64>

  res_vull = vec_vor(vull, vull);
// CHECK: or <2 x i64>

  res_vull = vec_vor(vbll, vull);
// CHECK: or <2 x i64>

  res_vull = vec_vor(vull, vbll);
// CHECK: or <2 x i64>

  res_vbll = vec_vor(vbll, vbll);
// CHECK: or <2 x i64>

  /* vec_xor */
  res_vsll = vec_xor(vsll, vsll);
// CHECK: xor <2 x i64>

  res_vsll = vec_xor(vbll, vsll);
// CHECK: xor <2 x i64>

  res_vsll = vec_xor(vsll, vbll);
// CHECK: xor <2 x i64>

  res_vull = vec_xor(vull, vull);
// CHECK: xor <2 x i64>

  res_vull = vec_xor(vbll, vull);
// CHECK: xor <2 x i64>

  res_vull = vec_xor(vull, vbll);
// CHECK: xor <2 x i64>

  res_vbll = vec_xor(vbll, vbll);
// CHECK: xor <2 x i64>

  dummy();
// CHECK: call void @dummy()

  res_vd = vec_xor(vd, vd);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>

  dummy();
// CHECK: call void @dummy()

  res_vd = vec_xor(vd, vbll);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>

  dummy();
// CHECK: call void @dummy()

  res_vd = vec_xor(vbll, vd);
// CHECK: [[X1:%.+]] = xor <2 x i64> %{{[0-9]+}}, %{{[0-9]+}}
// CHECK: bitcast <2 x i64> [[X1]] to <2 x double>

  /* vec_vxor */
  res_vsll = vec_vxor(vsll, vsll);
// CHECK: xor <2 x i64>

  res_vsll = vec_vxor(vbll, vsll);
// CHECK: xor <2 x i64>

  res_vsll = vec_vxor(vsll, vbll);
// CHECK: xor <2 x i64>

  res_vull = vec_vxor(vull, vull);
// CHECK: xor <2 x i64>

  res_vull = vec_vxor(vbll, vull);
// CHECK: xor <2 x i64>

  res_vull = vec_vxor(vull, vbll);
// CHECK: xor <2 x i64>

  res_vbll = vec_vxor(vbll, vbll);
// CHECK: xor <2 x i64>

}
Пример #16
0
void
gimp_composite_dodge_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx)
{
  const guchar *A = ctx->A;
  const guchar *B = ctx->B;
  guchar *D = ctx->D;
  guint length = ctx->n_pixels;
  vector unsigned char a,b,d;
  vector unsigned char alpha_a,alpha_b,alpha;
  vector signed short ox0001=vec_splat_s16(1);
  union
    {
      vector signed short v;
      vector unsigned short vu;
      gushort u16[8];
    } ah,al,bh,bl;

  while (length >= 4)
    {
      a=LoadUnaligned(A);
      b=LoadUnaligned(B);

      alpha_a=vec_and(a, alphamask);
      alpha_b=vec_and(b, alphamask);
      alpha=vec_min(alpha_a, alpha_b);

      ah.v=vec_unpackh((vector signed char)a);
      ah.v=vec_sl(ah.v,ox0008);
      al.v=vec_unpackl((vector signed char)a);
      al.v=vec_sl(al.v,ox0008);

      b=vec_nor(b,b);
      bh.v=vec_unpackh((vector signed char)b);
      bh.v=vec_and(bh.v,ox00ff);
      bh.v=vec_add(bh.v,ox0001);
      bl.v=vec_unpackl((vector signed char)b);
      bl.v=vec_and(bl.v,ox00ff);
      bl.v=vec_add(bl.v,ox0001);

      ah.u16[0]=ah.u16[0]/bh.u16[0];
      ah.u16[1]=ah.u16[1]/bh.u16[1];
      ah.u16[2]=ah.u16[2]/bh.u16[2];
      ah.u16[4]=ah.u16[4]/bh.u16[4];
      ah.u16[5]=ah.u16[5]/bh.u16[5];
      ah.u16[6]=ah.u16[6]/bh.u16[6];

      al.u16[0]=al.u16[0]/bl.u16[0];
      al.u16[1]=al.u16[1]/bl.u16[1];
      al.u16[2]=al.u16[2]/bl.u16[2];
      al.u16[4]=al.u16[4]/bl.u16[4];
      al.u16[5]=al.u16[5]/bl.u16[5];
      al.u16[6]=al.u16[6]/bl.u16[6];

      d=vec_packs(ah.vu,al.vu);

      d=vec_andc(d, alphamask);
      d=vec_or(d, alpha);

      StoreUnaligned(d, D);
      A+=16;
      B+=16;
      D+=16;
      length-=4;
    }
  length = length*4;
  a=LoadUnalignedLess(A, length);
  b=LoadUnalignedLess(B, length);

  alpha_a=vec_and(a, alphamask);
  alpha_b=vec_and(b, alphamask);
  alpha=vec_min(alpha_a, alpha_b);

  ah.v=vec_unpackh((vector signed char)a);
  ah.v=vec_sl(ah.v,ox0008);
  al.v=vec_unpackl((vector signed char)a);
  al.v=vec_sl(al.v,ox0008);

  b=vec_nor(b,b);
  bh.v=vec_unpackh((vector signed char)b);
  bh.v=vec_and(bh.v,ox00ff);
  bh.v=vec_add(bh.v,ox0001);
  bl.v=vec_unpackl((vector signed char)b);
  bl.v=vec_and(bl.v,ox00ff);
  bl.v=vec_add(bl.v,ox0001);

  ah.u16[0]=ah.u16[0]/bh.u16[0];
  ah.u16[1]=ah.u16[1]/bh.u16[1];
  ah.u16[2]=ah.u16[2]/bh.u16[2];
  ah.u16[4]=ah.u16[4]/bh.u16[4];
  ah.u16[5]=ah.u16[5]/bh.u16[5];
  ah.u16[6]=ah.u16[6]/bh.u16[6];

  al.u16[0]=al.u16[0]/bl.u16[0];
  al.u16[1]=al.u16[1]/bl.u16[1];
  al.u16[2]=al.u16[2]/bl.u16[2];
  al.u16[4]=al.u16[4]/bl.u16[4];
  al.u16[5]=al.u16[5]/bl.u16[5];
  al.u16[6]=al.u16[6]/bl.u16[6];

  d=vec_packs(ah.vu,al.vu);

  d=vec_andc(d, alphamask);
  d=vec_or(d, alpha);

  StoreUnalignedLess(d, D, length);
}
Пример #17
0
void
gimp_composite_grain_extract_rgba8_rgba8_rgba8_altivec (GimpCompositeContext *ctx)
{
  const guchar *A = ctx->A;
  const guchar *B = ctx->B;
  guchar *D = ctx->D;
  guint length = ctx->n_pixels;
  vector unsigned char a,b,d,alpha_a,alpha_b,alpha;
  vector signed short ah,al,bh,bl;

  while (length >= 4)
    {
      a=LoadUnaligned(A);
      b=LoadUnaligned(B);

      alpha_a=vec_and(a, alphamask);
      alpha_b=vec_and(b, alphamask);
      alpha=vec_min(alpha_a, alpha_b);

      ah=vec_unpackh((vector signed char)a);
      ah=vec_and(ah,ox00ff);
      al=vec_unpackl((vector signed char)a);
      al=vec_and(al,ox00ff);
      bh=vec_unpackh((vector signed char)b);
      bh=vec_and(bh,ox00ff);
      bl=vec_unpackl((vector signed char)b);
      bl=vec_and(bl,ox00ff);

      ah=vec_sub(ah,bh);
      al=vec_sub(al,bl);
      ah=vec_sub(ah,oxff80);
      al=vec_sub(al,oxff80);

      d=vec_packsu(ah,al);

      d=vec_andc(d, alphamask);
      d=vec_or(d, alpha);

      StoreUnaligned(d, D);

      A+=16;
      B+=16;
      D+=16;
      length-=4;
    }
  /* process last pixels */
  length = length*4;
  a=LoadUnalignedLess(A, length);
  b=LoadUnalignedLess(B, length);

  alpha_a=vec_and(a, alphamask);
  alpha_b=vec_and(b, alphamask);
  alpha=vec_min(alpha_a, alpha_b);

  ah=vec_unpackh((vector signed char)a);
  ah=vec_and(ah,ox00ff);
  al=vec_unpackl((vector signed char)a);
  al=vec_and(al,ox00ff);
  bh=vec_unpackh((vector signed char)b);
  bh=vec_and(bh,ox00ff);
  bl=vec_unpackl((vector signed char)b);
  bl=vec_and(bl,ox00ff);

  ah=vec_sub(ah,bh);
  al=vec_sub(al,bl);
  ah=vec_sub(ah,oxff80);
  al=vec_sub(al,oxff80);

  d=vec_packsu(ah,al);

  d=vec_andc(d, alphamask);
  d=vec_or(d, alpha);

  StoreUnalignedLess(d, D, length);
}
int main(int argc, char **argv)
{

	time_t startTime = time(NULL);


// setup, assign particles initla positions and masses
// this is done in scalar fashion, NOT SIMD
// insignificant to performance since it's only done once

	struct timeval start;
	gettimeofday(&start,NULL);

	//seed random generator
	srand( time(NULL) );

	printf("\n\n\n~~~~~~~~Printing out particles and their randomly assigned positions: \n\n");

	int pC = 0;
	for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC)
	{
		int grideSize = GRID_SIZE;

	//	printf("\n grideSize/2: %d", grideSize/2);

		float xPos = (float)( rand() % grideSize  - grideSize/2);
		float yPos = (float)( rand() % grideSize  - grideSize/2);
		float zPos = (float)( rand() % grideSize  - grideSize/2);

		particle_Array[pC].position[0] = xPos;
		particle_Array[pC].position[1] = yPos;
		particle_Array[pC].position[2] = zPos;

		particle_Array[pC].velocity[3] = PARTICLES_DEFAULTMASS;

		//particle_Array[pC].position = vec_splat(particle_Array[pC].position, 1);
		//particle_Array[pC].position = vec_splats((float)GRAVITATIONALCONSTANT); --> use splats, seems faster

		printf("Particle %d:   ", pC );
		printf("x= %f, y=%f, z=%f", particle_Array[pC].position[0], particle_Array[pC].position[1], particle_Array[pC].position[2]);
		printf("\n");
	}

	


///main loop
	

	// temp particle Datas used for calculations, not pointers, purposefully passed by value
	particle_Data pDi;
	particle_Data pDj;


	//temp vectors used for calculations in loop
	__vector float tempAcceleration = {0,0,0,0};
	__vector float tempVelocity = {0,0,0,0};
	__vector float tempDistance = {0,0,0,0}; //--> use 4th element to store radius
	__vector float tempDistanceRL1 = {0,0,0,0};
	__vector float tempDistanceRL2 = {0,0,0,0};

	__vector float tempNumerator = {0,0,0,0};
	__vector float tempMassSplat = {0,0,0,0};
	__vector float tempGConstant = {GRAVITATIONALCONSTANT,GRAVITATIONALCONSTANT,GRAVITATIONALCONSTANT,GRAVITATIONALCONSTANT };
	__vector float tempDELATTIME = {DELTA_TIME, DELTA_TIME, DELTA_TIME, DELTA_TIME};
	__vector float tempEPS= {EPS, EPS, EPS, EPS};

	__vector float zeroVector = {0,0,0,0};
	__vector unsigned int oneVector = {1,1,1,1};

	__vector unsigned int axisBitShiftMask = {0,1,2,0};


	__vector unsigned char yzxwMask = { 4,5,6,7, 8,9,10,11, 0,1,2,3,  12,13,14,15};
	__vector unsigned char zxywMask = { 8,9,10,11, 0,1,2,3, 4,5,6,7,  12,13,14,15};

	__vector unsigned short resetOctantCount = {0,0,0,0,0,0,0};
	__vector unsigned short increment = {1,1,1,1,1,1,1,1};

	__vector float tempUnitVector = {0,0,0,0};
	__vector float distanceVector = {0,0,0,0};

	//stupid C99, need to declare indicies before for loops
	int i = 0;
	int j = 0;
	int it_counter = 0;

	printf("\n^^^^^^^   Now starting main loop\n\n\n");


	for(it_counter = 0; it_counter < ITERATION_COUNT; ++it_counter)
	{

		octantCount = resetOctantCount;
	//	printf("\nIteration: %d\n",it_counter );


		// this first loop is to calculate the forces/accelerations
		// NOTE ---> NO FORCES ARE APPLIED IN THIS LOOP, NO POSITIONS WILL BE CHANGED.
		// The calculated accelerations will be used to increment the particles velocity vector, NOT POSITION
		for(i = 0; i<PARTICLES_MAXCOUNT; ++i)
		{
			//cache the particle data struct to the temp declared outside the loops
			pDi = particle_Array[i];

			for(j = 0; j<PARTICLES_MAXCOUNT; ++j)
			{

				//for every particle i, calculate for all j's
				// get resultant total velocity, don't apply it in these loops,
				// apply velocities for all bodies at the same time, in seperate loop at the end.

				//cache the particle data struct to the temp declared outside the loops
				pDj = particle_Array[j];

				// Formula being used --> a = (G * m )/(r^2)

				tempDistance = vec_sub(pDj.position,pDi.position); //actual distance vector between objects i and j
				
				// save value for unit vector calculation later
				distanceVector = tempDistance;

				/* //Print distances between particles
				printf("Particle %d:   ", i );
				printf("x= %f, y=%f, z=%f", tempDistance[0], tempDistance[1], tempDistance[2]);
				printf("\n");
				*/

				//use the distance vector  right now for numerator, before we overwrite is later in the code
				// use mass of subject mass
				tempMassSplat = vec_splats((float)pDi.velocity[3]); //mass is stored in the last element (3) of velocity vector
				tempNumerator = vec_madd(tempMassSplat, tempGConstant, zeroVector);
				

				/*
				//Print numerator
				printf("Numerator %d:   ", i );
				printf("x= %f, y=%f, z=%f", tempNumerator[0], tempNumerator[1], tempNumerator[2]);
				printf("\n");
				*/
				 
				 //Assembly for vector rotate
				//__asm__("addi    4,4,1;");

				// denominator part
				// sqaure each component, x,y,z beforehand
				tempDistance = vec_madd(tempDistance, tempDistance, zeroVector);

				//using perm instead of rotate, bleurg
				tempDistanceRL1 = vec_perm(tempDistance, zeroVector, yzxwMask); // imitates lxfloat left rotate
				tempDistanceRL2 = vec_perm(tempDistance, zeroVector, zxywMask); // imitates 2xfloat left rotate

				//add both
				tempDistanceRL1 = vec_add(tempDistanceRL1, tempDistanceRL2);
				//add to original to get total ---> x+y+z
				tempDistance = vec_add(tempDistance, tempDistanceRL1); //tempDistance is now total distance squared
				
				// add EPS to avoid singularity
				tempDistance =  vec_add(tempDistance, tempEPS); //this is now the denominator value

				//save inverse magnitude for unit vector later
				tempUnitVector = vec_rsqrte(tempDistance);

				// invert vector to avoid division later
				tempDistance = vec_re(tempDistance); // this is final denominator (already inverted), only need to multiply
				// tempDistance is now eqivalent to 1/r^2 


				/*
				//Print denominator
				printf("Denominator %d:   ", i );
				printf("x= %f, y=%f, z=%f", tempDistance[0], tempDistance[1], tempDistance[2]);
				printf("\n");
				*/

				//total acceleration applied to particle i, by particle j
				tempAcceleration = vec_madd(tempDistance, tempNumerator, zeroVector);

				// create unit vector
				tempUnitVector = vec_madd(distanceVector, tempUnitVector, zeroVector);
				
				// apply unit vector to acceleration
				tempAcceleration = vec_madd(tempUnitVector, tempAcceleration, zeroVector);


				//increment velocity value of particle with a*dt
				// need to explicitly call the array, since pDi is only a temp pass by value, doesn't change the particle
				particle_Array[i].velocity = vec_madd(tempAcceleration, tempDELATTIME, particle_Array[i].velocity);

				/*
				//Print velocity
				printf("Velocity %d:   ", i );
				printf("x= %f, y=%f, z=%f", pDi.velocity[0], pDi.velocity[1], pDi.velocity[2]);
				printf("\n");
				*/


				/*

				printf("Particle %d:   ", i );
				printf("x= %f, y=%f, z=%f", pDi.velocity[0], pDi.velocity[1], pDi.velocity[2]);
				printf("\n");

				*/
				
				//end of this loop
			}
			//printf("\n");
		}

		//now that all the accelerations for all particles are calculated,
		//apply them and update velocity 
		for(i = 0; i<PARTICLES_MAXCOUNT; ++i)
		{
			//incrementing position with v*dt
			// vec_madd is awesome, it all gets done in one line! emulated the += operator, kinda, but more flexible
			particle_Array[i].position = vec_madd(particle_Array[i].velocity, tempDELATTIME, particle_Array[i].position);

		/*			
			printf("Particle %d positions:   ", i );
			printf("x= %f, y=%f, z=%f", particle_Array[i].position[0], particle_Array[i].position[1], particle_Array[i].position[2]);
			printf("\n");
		*/


			///// ALL CODE BELOW THIS SHOULD ONLY BE RUN ON PPU \\\\\\\\\\\\\\\\\\


		/////////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes 
			
			// compare with zero vector to get on which side of each axis the particle is
			// 0 is negative, 1 is positive side of the axis
			__vector bool int axisDirection = vec_cmpgt(particle_Array[i].position, zeroVector);



			// need to manually set, can't cast due to size difference error
			__vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0],
												  (unsigned int)axisDirection[1],
												  (unsigned int)axisDirection[2],
													0};
			// need to do this to revert 1s into NON 2s complement form --> vec_cmgt doc LIES
			shiftedAxis = vec_andc(oneVector, shiftedAxis);

			/*
			printf("Particle %d axis sign:   ", i );
			printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]);
			printf("\n");
			*/

			// shift 3 axies simultaneously (actually only 2, 1 stays in origina positon
			//, with intent to OR them later
			shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector

			__vector unsigned int axis_Y = vec_splats(shiftedAxis[1]);
			__vector unsigned int axis_Z = vec_splats(shiftedAxis[2]);
			// merge shhifted x y z values by OR-ing
			// this gives the octant id, range from 0-7 (000 to 111 in binary)
			shiftedAxis = vec_or(shiftedAxis, axis_Y);
			shiftedAxis = vec_or(shiftedAxis, axis_Z);
			// insert octant value into last slot of position vector of particle
			particle_Array[i].position[3] = (float)shiftedAxis[0];

			//printf("Oct ID: %d \n", shiftedAxis[0]);

			/////// Update octant vector by incrementing octant that the particle is in
			// The only possible non SIMD line in the entire program, 
			//irreleant since quadrant counting should occur on PPU anyways
			octantCount[shiftedAxis[0]] ++ ;
			
			

		}

		//end of main loop
/*
		printf("End of iteration %d --->    ",it_counter );
		printf("Particle disttribution across the octants: \n");
		printf("O0: %d    O1: %d    O2: %d    O3: %d    O4: %d    O5: %d    O6: %d    O7: %d\n",
				octantCount[0], octantCount[1], octantCount[2], octantCount[3], 
				octantCount[4],	octantCount[5], octantCount[6], octantCount[7]);
		printf("\n");
		*/
	}

/*
	printf("\n");
	for(i = 0; i<PARTICLES_MAXCOUNT; ++i)
	{
	printf("Particle %d final position:   ", i );
	printf("x= %f, y=%f, z=%f", particle_Array[i].position[0], particle_Array[i].position[1], particle_Array[i].position[2]);
	printf("\n");

	printf("End of iteration %d --->    ",it_counter );
		printf("Particle disttribution across the octants: \n");
		printf("O0: %d    O1: %d    O2: %d    O3: %d    O4: %d    O5: %d    O6: %d    O7: %d\n",
				octantCount[0], octantCount[1], octantCount[2], octantCount[3], 
				octantCount[4],	octantCount[5], octantCount[6], octantCount[7]);
		printf("\n");
	}
*/
	printf("Particle disttribution across the octants: \n");
		printf("O0: %d    O1: %d    O2: %d    O3: %d    O4: %d    O5: %d    O6: %d    O7: %d\n",
				octantCount[0], octantCount[1], octantCount[2], octantCount[3], 
				octantCount[4],	octantCount[5], octantCount[6], octantCount[7]);
		printf("\n");


		struct timeval end;
	gettimeofday(&end,NULL);
	float deltaTime = ((end.tv_sec - start.tv_sec)*1000.0f + (end.tv_usec -start.tv_usec)/1000.0f);


	printf("Execution time:    %f\n",deltaTime);
	


return 0;



}
Пример #19
0
int main(int argc, char **argv)
{
	


// setup, assign particles initla positions and masses
// this is done in scalar fashion, NOT SIMD
// insignificant to performance since it's only done once

	//time_t startTime = time(NULL);



	//seed random generator
	srand( time(NULL) );

	printf("\n\n\n~~~~~~~~Printing out particles and their randomly assigned positions: \n\n");

	int pC = 0;
	for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC)
	{
		int grideSize = GRID_SIZE;

	//	printf("\n grideSize/2: %d", grideSize/2);

		float xPos = (float)( rand() % grideSize  - grideSize/2);
		float yPos = (float)( rand() % grideSize  - grideSize/2);
		float zPos = (float)( rand() % grideSize  - grideSize/2);

		particle_Array_PPU[pC].position[0] = xPos;
		particle_Array_PPU[pC].position[1] = yPos;
		particle_Array_PPU[pC].position[2] = zPos;

		particle_Array_PPU[pC].velocity[3] = PARTICLES_DEFAULTMASS;

		if(pC == 0)
		{
			// center, high mass
			particle_Array_PPU[pC].position = zeroVector;
			particle_Array_PPU[pC].velocity = zeroVector; //initialVelocityVector_Y_minus;

			printf("Earth mass: %f\n", earthMass );
			particle_Array_PPU[pC].velocity[3] = earthMass; // PARTICLES_DEFAULTMASS * 500.0f;
		}
		if(pC == 1)
		{
			particle_Array_PPU[pC].position = issPosition; //initPositionVector;
			particle_Array_PPU[pC].velocity = issVelocity; //initialVelocityVector_Y;

			particle_Array_PPU[pC].velocity[3] = issMass; //PARTICLES_DEFAULTMASS * 500.0f;

		}
		if(pC == 2)
		{
			particle_Array_PPU[pC].position = sat1Position; //initPositionVector;
			particle_Array_PPU[pC].velocity = sat1Velocity; //initialVelocityVector_Y;


			particle_Array_PPU[pC].velocity[3] = satMass; 

		}
		if(pC == 3)
		{
			particle_Array_PPU[pC].position = sat2Position; //initPositionVector;
			particle_Array_PPU[pC].velocity = sat2Velocity; //initialVelocityVector_Y;


			particle_Array_PPU[pC].velocity[3] = satMass; 

		}
		if(pC == 4)
		{
			particle_Array_PPU[pC].position = sat3Position; //initPositionVector;
			particle_Array_PPU[pC].velocity = sat3Velocity; //initialVelocityVector_Y;


			particle_Array_PPU[pC].velocity[3] = satMass; 

		}
		if(pC == 5)
		{
			particle_Array_PPU[pC].position = sat4Position; //initPositionVector;
			particle_Array_PPU[pC].velocity = sat4Velocity; //initialVelocityVector_Y;


			particle_Array_PPU[pC].velocity[3] = satMass; 

		}
		if(pC == 6)
		{
			particle_Array_PPU[pC].position = moonPosition; //initPositionVector;
			particle_Array_PPU[pC].velocity = moonVelocity; //initialVelocityVector_Y;


			particle_Array_PPU[pC].velocity[3] = moonMass; 

		}
		else
		{



		}

		//particle_Array_PPU[pC].position = vec_splat(particle_Array_PPU[pC].position, 1);
		//particle_Array_PPU[pC].position = vec_splats((float)GRAVITATIONALCONSTANT); --> use splats, seems faster
		
		printf("Particle %d:   ", pC );
		printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[pC].position[0], particle_Array_PPU[pC].position[1], particle_Array_PPU[pC].position[2], particle_Array_PPU[pC].velocity[3]);
		printf("\n");
		
	}


	// copy arrays into spe ones
	pC = 0;
	for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC)
	{

		spe1_Data[pC] = particle_Array_PPU[pC];	
		spe2_Data[pC] = particle_Array_PPU[pC];	
		spe3_Data[pC] = particle_Array_PPU[pC];	
		spe4_Data[pC] = particle_Array_PPU[pC];	
		spe5_Data[pC] = particle_Array_PPU[pC];	
		spe6_Data[pC] = particle_Array_PPU[pC];		
	}

	for(i = 0; i<PARTICLES_MAXCOUNT; ++i)
	{
     /////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes 
		
		// compare with zero vector to get on which side of each axis the particle is
		// 0 is negative, 1 is positive side of the axis
		__vector bool int axisDirection = vec_cmpgt(particle_Array_PPU[i].position, zeroVector);



		// need to manually set, can't cast due to size difference error
		__vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0],
											  (unsigned int)axisDirection[1],
											  (unsigned int)axisDirection[2],
												0};
		// need to do this to revert 1s into NON 2s complement form --> vec_cmgt doc LIES
		shiftedAxis = vec_andc(oneVector, shiftedAxis);

		/*
		printf("Particle %d axis sign:   ", i );
		printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]);
		printf("\n");
		*/

		// shift 3 axies simultaneously (actually only 2, 1 stays in origina positon
		//, with intent to OR them later
		shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector

		__vector unsigned int axis_Y = vec_splats(shiftedAxis[1]);
		__vector unsigned int axis_Z = vec_splats(shiftedAxis[2]);
		// merge shhifted x y z values by OR-ing
		// this gives the octant id, range from 0-7 (000 to 111 in binary)
		shiftedAxis = vec_or(shiftedAxis, axis_Y);
		shiftedAxis = vec_or(shiftedAxis, axis_Z);
		// insert octant value into last slot of position vector of particle
		particle_Array_PPU[i].position[3] = (float)shiftedAxis[0];

		//printf("Oct ID: %d \n", shiftedAxis[0]);

		/////// Update octant vector by incrementing octant that the particle is in
		// The only possible non SIMD line in the entire program, 
		//irreleant since quadrant counting should occur on PPU anyways
		octantCount[shiftedAxis[0]] ++ ;
		
	}
	i=0;

	printf("\n");

		printf("Particle disttribution across the octants: \n");
		printf("O0: %d    O1: %d    O2: %d    O3: %d    O4: %d    O5: %d    O6: %d    O7: %d\n",
				octantCount[0], octantCount[1], octantCount[2], octantCount[3], 
				octantCount[4],	octantCount[5], octantCount[6], octantCount[7]);
		printf("\n");


	int speCount = spe_cpu_info_get(SPE_COUNT_PHYSICAL_SPES,-1);
/*
	printf("\n");
	printf("%d", speCount);

	printf("\n");
	printf("\n");
	printf("--------------\n");
	printf("Starting spe1 part\n");
*/
/*
	// wait for user input, gives time to start graphics
	printf("Press Enter to continue\n");

	getchar();
*/

	struct timeval start;
	gettimeofday(&start,NULL);


	int iterCount = 0;
	for (iterCount = 0; iterCount< ITERATION_COUNT; iterCount++)
	{

		//printf("++++++++++++++ START of ITERATION # %d of %d +++++++++++++++\n", i, ITERATION_COUNT );

		int retval;
		pthread_t spe1_Thread;
		pthread_t spe2_Thread;
		pthread_t spe3_Thread;
		pthread_t spe4_Thread;
		pthread_t spe5_Thread;
		pthread_t spe6_Thread;


		//speData = spe1_Data;
		speNumber = 0;
		/* Create Thread */
	//	printf("spe1_Data value: %d\n", (int)spe1_Data );
		retval = pthread_create(&spe1_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_1, // Thread function
								NULL // Thread argument
								);

	//	printf("spe2_Data value: %d\n", (int)spe2_Data );
		
		retval = pthread_create(&spe2_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_2, // Thread function
								NULL // Thread argument
								);
		
		
		retval = pthread_create(&spe3_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_3, // Thread function
								NULL // Thread argument
								);

		
		retval = pthread_create(&spe4_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_4, // Thread function
								NULL // Thread argument
								);

		retval = pthread_create(&spe5_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_5, // Thread function
								NULL // Thread argument
								);

		retval = pthread_create(&spe6_Thread, // Thread object
								NULL, // Thread attributes
								spe_code_launch_6, // Thread function
								NULL // Thread argument
								);
		


		//Wait for Thread Completion
		retval = pthread_join(spe1_Thread, NULL);


		retval = pthread_join(spe2_Thread, NULL);

		
		retval = pthread_join(spe3_Thread, NULL);

		retval = pthread_join(spe4_Thread, NULL);
		
		retval = pthread_join(spe5_Thread, NULL);
		
		retval = pthread_join(spe6_Thread, NULL);
		

		
		speNumber = 1;
		
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i)
		{
			particle_Array_PPU[i] = spe1_Data[i];
		}

		speNumber = 2;
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i)
		{
			particle_Array_PPU[i] = spe2_Data[i];
		}

		speNumber = 3;
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i)
		{
			particle_Array_PPU[i] = spe3_Data[i];
		}

		speNumber = 4;
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i)
		{
			particle_Array_PPU[i] = spe4_Data[i];
		}

		speNumber = 5;
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<speNumber*PARTICLES_MAXCOUNT/SPU_COUNT; ++i)
		{
			particle_Array_PPU[i] = spe5_Data[i];
		}

		speNumber = 6;
		for(i=(speNumber-1)*PARTICLES_MAXCOUNT/SPU_COUNT; i<PARTICLES_MAXCOUNT; ++i)
		{
			particle_Array_PPU[i] = spe6_Data[i];
		}

		// reset spe counter
		speNumber = 0;
		


		// copy arrays into spe ones
		pC = 0;
		for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC)
		{

			spe1_Data[pC] = particle_Array_PPU[pC];	
			spe2_Data[pC] = particle_Array_PPU[pC];	
			spe3_Data[pC] = particle_Array_PPU[pC];	
			spe4_Data[pC] = particle_Array_PPU[pC];	
			spe5_Data[pC] = particle_Array_PPU[pC];	
			spe6_Data[pC] = particle_Array_PPU[pC];	


			// update values for shared array (graphics)
			/*
			particle_Array_Shared[pC].position[0] = particle_Array_PPU[pC].position[0];
			particle_Array_Shared[pC].position[1] = particle_Array_PPU[pC].position[1];
			particle_Array_Shared[pC].position[2] = particle_Array_PPU[pC].position[2];
			particle_Array_Shared[pC].position[3] = particle_Array_PPU[pC].position[3];
			*/

			/*		
			printf("Particle %d positions:   ", pC );
			printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[pC].position[0], particle_Array_PPU[pC].position[1], particle_Array_PPU[pC].position[2], particle_Array_PPU[pC].velocity[3]);
			printf("\n");
			*/


			fullSimilationData[iterCount].particleArray[pC]= particle_Array_PPU[pC];
		}

		

	//	printf("++++++++++++++ END of ITERATION # %d of %d +++++++++++++++\n", iterCount, ITERATION_COUNT );


	}

	struct timeval end;
	gettimeofday(&end,NULL);
	float deltaTime = ((end.tv_sec - start.tv_sec)*1000.0f + (end.tv_usec -start.tv_usec)/1000.0f);


	printf("print out values from post spe calculations\n");
	i = 0;
	for(i = 0; i<PARTICLES_MAXCOUNT; ++i)
	{

		printf("Particle %d positions:   ", i );
		printf("x= %f, y=%f, z=%f , mass:%f", particle_Array_PPU[i].position[0], particle_Array_PPU[i].position[1], particle_Array_PPU[i].position[2], particle_Array_PPU[i].velocity[3]);
		printf("\n");
	
	}
	//cleaining the array
	octantCount = resetOctantCount;
	for(i = 0; i<PARTICLES_MAXCOUNT; ++i)
	{
     /////// INSERT QUADRANT CODE HERE , actually octant --> 8 equal sub cubes 
		
		// compare with zero vector to get on which side of each axis the particle is
		// 0 is negative, 1 is positive side of the axis
		__vector bool int axisDirection = vec_cmpgt(particle_Array_PPU[i].position, zeroVector);



		// need to manually set, can't cast due to size difference error
		__vector unsigned int shiftedAxis = { (unsigned int)axisDirection[0],
											  (unsigned int)axisDirection[1],
											  (unsigned int)axisDirection[2],
												0};
		// need to do this to revert 1s into NON 2s complement form --> vec_cmgt doc LIES
		shiftedAxis = vec_andc(oneVector, shiftedAxis);

		/*
		printf("Particle %d axis sign:   ", i );
		printf("x= %x, y=%x, z=%x", shiftedAxis[0], shiftedAxis[1], shiftedAxis[2]);
		printf("\n");
		*/

		// shift 3 axies simultaneously (actually only 2, 1 stays in origina positon
		//, with intent to OR them later
		shiftedAxis = vec_sl(shiftedAxis, axisBitShiftMask); // will also use as x vector

		__vector unsigned int axis_Y = vec_splats(shiftedAxis[1]);
		__vector unsigned int axis_Z = vec_splats(shiftedAxis[2]);
		// merge shhifted x y z values by OR-ing
		// this gives the octant id, range from 0-7 (000 to 111 in binary)
		shiftedAxis = vec_or(shiftedAxis, axis_Y);
		shiftedAxis = vec_or(shiftedAxis, axis_Z);
		// insert octant value into last slot of position vector of particle
		particle_Array_PPU[i].position[3] = (float)shiftedAxis[0];

		//printf("Oct ID: %d \n", shiftedAxis[0]);

		/////// Update octant vector by incrementing octant that the particle is in
		// The only possible non SIMD line in the entire program, 
		//irreleant since quadrant counting should occur on PPU anyways
		octantCount[shiftedAxis[0]] ++ ;
		
	}
	i=0;

	printf("\n");

		printf("Particle disttribution across the octants: \n");
		printf("O0: %d    O1: %d    O2: %d    O3: %d    O4: %d    O5: %d    O6: %d    O7: %d\n",
				octantCount[0], octantCount[1], octantCount[2], octantCount[3], 
				octantCount[4],	octantCount[5], octantCount[6], octantCount[7]);
		printf("\n");



/*
	time_t endTime = time(NULL);
	int deltaTime = endTime - startTime;
*/

	// need to look into http://www.xmlsoft.org/


	printf("Execution time:    %f\n",deltaTime);


	FILE *filePointer;
	filePointer = fopen("fileLog1.txt","w");
	//fprintf(filePointer, "<SimulationData>\n");
	

	iterCount = 0;
	for (iterCount = 0; iterCount< ITERATION_COUNT; iterCount++)
	{
		//printf("Iteration: %d\n", iterCount);
		//fprintf(filePointer,"<Iter>\n");
		fprintf(filePointer,"\n");

		pC = 0;
	    for(pC = 0; pC < PARTICLES_MAXCOUNT; ++pC)
	    {
		
			//printf("Particle %d positions:   ", pC );
		//	fprintf(filePointer,"<Obj>\n");
	    	

			//printf("x= %f, y=%f, z=%f", fullSimilationData[iterCount].particleArray[pC].position[0], fullSimilationData[iterCount].particleArray[pC].position[1], fullSimilationData[iterCount].particleArray[pC].position[2]);
			//printf("\n");
			
	    	/*
			fprintf(filePointer,"<PX>%f</PX>\n",fullSimilationData[iterCount].particleArray[pC].position[0]);
			fprintf(filePointer,"<PY>%f</PY>\n",fullSimilationData[iterCount].particleArray[pC].position[1]);
			fprintf(filePointer,"<PZ>%f</PZ>\n",fullSimilationData[iterCount].particleArray[pC].position[2]);
			*/

			fprintf(filePointer,"%f,",fullSimilationData[iterCount].particleArray[pC].position[0]);
			fprintf(filePointer,"%f,",fullSimilationData[iterCount].particleArray[pC].position[1]);
			fprintf(filePointer,"%f",fullSimilationData[iterCount].particleArray[pC].position[2]);

			fprintf(filePointer,"|");
			//fprintf(filePointer,"</Obj>\n");			
			//fullSimilationData[fullDataCounter].particleArray[pC]= particle_Array_PPU[pC];
			
		}

		//fprintf(filePointer,"</Iter>\n");


	}


	//fprintf(filePointer, "</SimulationData>\n");


	fclose(filePointer);


	return 0;
}