示例#1
0
/*
 * t1fv_3: 3-point in-place SIMD twiddle butterfly.
 *
 * Data is accessed only through ri; ii is not referenced.  Every pass loads
 * the elements at offsets 0, WS(ios, 1) and WS(ios, 2), runs the last two
 * through BYTWJ with the twiddles at W[0] and W[TWVL * 2], and writes the
 * three combined results back to those same offsets.
 *
 * One pass accounts for VL of the m requested iterations; afterwards x moves
 * on by VL * dist and W by TWVL * 4.  The advanced W pointer is returned.
 */
static const R *t1fv_3(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     int remaining;
     R *x = ri;
     BEGIN_SIMD();
     remaining = m;
     while (remaining > 0) {
          V in0, raw1, tw1, raw2, tw2, tw_sum, base, rot;

          in0 = LD(&(x[0]), dist, &(x[0]));
          raw1 = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
          tw1 = BYTWJ(&(W[0]), raw1);
          raw2 = LD(&(x[WS(ios, 2)]), dist, &(x[0]));
          tw2 = BYTWJ(&(W[TWVL * 2]), raw2);
          tw_sum = VADD(tw1, tw2);

          /* Slot 0 is overwritten here; all three inputs are already held in locals. */
          ST(&(x[0]), VADD(in0, tw_sum), dist, &(x[0]));

          base = VFNMS(LDK(KP500000000), tw_sum, in0);
          rot = VBYI(VMUL(LDK(KP866025403), VSUB(tw2, tw1)));
          ST(&(x[WS(ios, 2)]), VSUB(base, rot), dist, &(x[0]));
          ST(&(x[WS(ios, 1)]), VADD(base, rot), dist, &(x[WS(ios, 1)]));

          remaining -= VL;
          x += VL * dist;
          W += TWVL * 4;
     }
     END_SIMD();
     return W;
}
示例#2
0
/*
 * n1bv_4: 4-point SIMD butterfly, out of place.
 *
 * Reads through ii and writes through io; ri and ro are not referenced.
 * Inputs sit at WS(is, 0..3) and results go to WS(os, 0..3).  Each pass
 * covers VL of the v requested iterations, then moves the input pointer by
 * VL * ivs and the output pointer by VL * ovs.
 */
static void n1bv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     int remaining;
     const R *xi = ii;
     R *xo = io;
     BEGIN_SIMD();
     remaining = v;
     while (remaining > 0) {
          V in0, in1, in2, in3;
          V diff02, sum02, rot13, sum13;

          /* All four loads finish before the first store. */
          in0 = LD(&(xi[0]), ivs, &(xi[0]));
          in2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
          diff02 = VSUB(in0, in2);
          sum02 = VADD(in0, in2);
          in1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
          in3 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
          rot13 = VBYI(VSUB(in1, in3));
          sum13 = VADD(in1, in3);

          ST(&(xo[WS(os, 3)]), VSUB(diff02, rot13), ovs, &(xo[WS(os, 1)]));
          ST(&(xo[0]), VADD(sum02, sum13), ovs, &(xo[0]));
          ST(&(xo[WS(os, 1)]), VADD(diff02, rot13), ovs, &(xo[WS(os, 1)]));
          ST(&(xo[WS(os, 2)]), VSUB(sum02, sum13), ovs, &(xo[0]));

          remaining -= VL;
          xi += VL * ivs;
          xo += VL * ovs;
     }
     END_SIMD();
}
示例#3
0
/*
 * n2bv_8: 8-point SIMD butterfly, out of place.
 *
 * Reads through ii at WS(is, 0..7) and writes through io at the fixed even
 * offsets xo[0], xo[2], ..., xo[14]; ri, ro and os are not referenced.
 * Each pass covers VL of the v requested iterations, then moves the input
 * pointer by VL * ivs and the output pointer by VL * ovs.
 */
static void n2bv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     int remaining;
     const R *xi = ii;
     R *xo = io;
     BEGIN_SIMD();
     remaining = v;
     while (remaining > 0) {
          V in0, in1, in2, in3, in4, in5, in6, in7;
          V d26, s26, d04, s04, d15, s15, d73, s73;
          V k_diff, k_sum;
          V rot, re;
          V even_sum, odd_sum, even_diff, odd_rot;

          /* Load phase: every input is read before any output is written. */
          in2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
          in6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
          d26 = VSUB(in2, in6);
          s26 = VADD(in2, in6);
          in0 = LD(&(xi[0]), ivs, &(xi[0]));
          in4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
          d04 = VSUB(in0, in4);
          s04 = VADD(in0, in4);
          in1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
          in5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
          d15 = VSUB(in1, in5);
          in7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
          in3 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
          d73 = VSUB(in7, in3);
          k_diff = VMUL(LDK(KP707106781), VSUB(d15, d73));
          s73 = VADD(in7, in3);
          k_sum = VMUL(LDK(KP707106781), VADD(d15, d73));
          s15 = VADD(in1, in5);

          /* Slots xo[6] and xo[10]. */
          rot = VBYI(VSUB(k_diff, d26));
          re = VSUB(d04, k_sum);
          ST(&(xo[6]), VADD(rot, re), ovs, &(xo[2]));
          ST(&(xo[10]), VSUB(re, rot), ovs, &(xo[2]));

          /* Slots xo[8] and xo[0]. */
          even_sum = VADD(s04, s26);
          odd_sum = VADD(s15, s73);
          ST(&(xo[8]), VSUB(even_sum, odd_sum), ovs, &(xo[0]));
          ST(&(xo[0]), VADD(even_sum, odd_sum), ovs, &(xo[0]));

          /* Slots xo[2] and xo[14]. */
          rot = VBYI(VADD(d26, k_diff));
          re = VADD(d04, k_sum);
          ST(&(xo[2]), VADD(rot, re), ovs, &(xo[2]));
          ST(&(xo[14]), VSUB(re, rot), ovs, &(xo[2]));

          /* Slots xo[12] and xo[4]. */
          even_diff = VSUB(s04, s26);
          odd_rot = VBYI(VSUB(s15, s73));
          ST(&(xo[12]), VSUB(even_diff, odd_rot), ovs, &(xo[0]));
          ST(&(xo[4]), VADD(even_diff, odd_rot), ovs, &(xo[0]));

          remaining -= VL;
          xi += VL * ivs;
          xo += VL * ovs;
     }
     END_SIMD();
}
/*
 * m2bv_64: driver loop around m2bv_64_0.
 *
 * Calls m2bv_64_0(ii, io, is, ivs, ovs) once per group of VL iterations
 * until v iterations are covered, moving ii by VL * ivs and io by VL * ovs
 * after every call.  ri, ro and os are not referenced.
 */
static void m2bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     int done = 0;
     BEGIN_SIMD();
     while (done < v) {
          m2bv_64_0(ii, io, is, ivs, ovs);
          ii += VL * ivs;
          io += VL * ovs;
          done += VL;
     }
     END_SIMD();
}
示例#5
0
/*
 * m1fv_32: driver loop around m1fv_32_0.
 *
 * Calls m1fv_32_0(ri, ro, is, os, ivs, ovs) once per group of VL iterations
 * until v iterations are covered, moving ri by VL * ivs and ro by VL * ovs
 * after every call.  ii and io are not referenced.
 */
static void m1fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     int done = 0;
     BEGIN_SIMD();
     while (done < v) {
          m1fv_32_0(ri, ro, is, os, ivs, ovs);
          ri += VL * ivs;
          ro += VL * ovs;
          done += VL;
     }
     END_SIMD();
}
/*
 * t1bv_6: 6-point in-place SIMD twiddle butterfly (BYTW variant).
 *
 * Data is accessed only through ii; ri is not referenced.  Inputs 1..5 at
 * WS(ios, k) are passed through BYTW with the twiddle at W[TWVL * 2 * (k - 1)];
 * input 0 is used as loaded.  Results are written back to the same six
 * offsets.
 *
 * One pass accounts for VL of the m requested iterations; afterwards x moves
 * on by VL * dist and W by TWVL * 10.  The advanced W pointer is returned.
 */
static const R *t1bv_6(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     int remaining;
     R *x = ii;
     BEGIN_SIMD();
     remaining = m;
     while (remaining > 0) {
          V in0, raw1, raw2, raw3, raw4, raw5;
          V tw1, tw2, tw3, tw4, tw5;
          V d03, s03, d41, s41, d25, s25;
          V odd_rot, odd_sum, odd_base;
          V even_rot, even_sum, even_base;

          /* Pair (0, 3). */
          in0 = LD(&(x[0]), dist, &(x[0]));
          raw3 = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)]));
          tw3 = BYTW(&(W[TWVL * 4]), raw3);
          d03 = VSUB(in0, tw3);
          s03 = VADD(in0, tw3);

          /* Pair (4, 1). */
          raw4 = LD(&(x[WS(ios, 4)]), dist, &(x[0]));
          tw4 = BYTW(&(W[TWVL * 6]), raw4);
          raw1 = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
          tw1 = BYTW(&(W[0]), raw1);
          d41 = VSUB(tw4, tw1);
          s41 = VADD(tw4, tw1);

          /* Pair (2, 5). */
          raw2 = LD(&(x[WS(ios, 2)]), dist, &(x[0]));
          tw2 = BYTW(&(W[TWVL * 2]), raw2);
          raw5 = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)]));
          tw5 = BYTW(&(W[TWVL * 8]), raw5);
          d25 = VSUB(tw2, tw5);
          s25 = VADD(tw2, tw5);

          /* Odd slots are built from the pair differences. */
          odd_rot = VBYI(VMUL(LDK(KP866025403), VSUB(d25, d41)));
          odd_sum = VADD(d25, d41);
          odd_base = VFNMS(LDK(KP500000000), odd_sum, d03);
          ST(&(x[WS(ios, 1)]), VADD(odd_rot, odd_base), dist, &(x[WS(ios, 1)]));
          ST(&(x[WS(ios, 3)]), VADD(d03, odd_sum), dist, &(x[WS(ios, 1)]));
          ST(&(x[WS(ios, 5)]), VSUB(odd_base, odd_rot), dist, &(x[WS(ios, 1)]));

          /* Even slots are built from the pair sums. */
          even_rot = VBYI(VMUL(LDK(KP866025403), VSUB(s25, s41)));
          even_sum = VADD(s25, s41);
          even_base = VFNMS(LDK(KP500000000), even_sum, s03);
          ST(&(x[WS(ios, 2)]), VSUB(even_base, even_rot), dist, &(x[0]));
          ST(&(x[0]), VADD(s03, even_sum), dist, &(x[0]));
          ST(&(x[WS(ios, 4)]), VADD(even_rot, even_base), dist, &(x[0]));

          remaining -= VL;
          x += VL * dist;
          W += TWVL * 10;
     }
     END_SIMD();
     return W;
}
/*
 * t1fv_6: 6-point in-place SIMD twiddle butterfly (BYTWJ variant).
 *
 * Data is accessed only through ri; ii is not referenced.  Inputs 1..5 at
 * WS(ios, k) are passed through BYTWJ with the twiddle at
 * W[TWVL * 2 * (k - 1)]; input 0 is used as loaded.  Results are written back
 * to the same six offsets.
 *
 * One pass accounts for VL of the m requested iterations; afterwards x moves
 * on by VL * dist and W by TWVL * 10.  The advanced W pointer is returned.
 */
static const R *t1fv_6(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     int remaining;
     R *x = ri;
     BEGIN_SIMD();
     remaining = m;
     while (remaining > 0) {
          V in0, raw1, raw2, raw3, raw4, raw5;
          V tw1, tw2, tw3, tw4, tw5;
          V d03, s03, d41, s41, d25, s25;
          V odd_rot, odd_sum, odd_base;
          V even_rot, even_sum, even_base;

          /* Pair (0, 3). */
          in0 = LD(&(x[0]), dist, &(x[0]));
          raw3 = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)]));
          tw3 = BYTWJ(&(W[TWVL * 4]), raw3);
          d03 = VSUB(in0, tw3);
          s03 = VADD(in0, tw3);

          /* Pair (4, 1). */
          raw4 = LD(&(x[WS(ios, 4)]), dist, &(x[0]));
          tw4 = BYTWJ(&(W[TWVL * 6]), raw4);
          raw1 = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
          tw1 = BYTWJ(&(W[0]), raw1);
          d41 = VSUB(tw4, tw1);
          s41 = VADD(tw4, tw1);

          /* Pair (2, 5). */
          raw2 = LD(&(x[WS(ios, 2)]), dist, &(x[0]));
          tw2 = BYTWJ(&(W[TWVL * 2]), raw2);
          raw5 = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)]));
          tw5 = BYTWJ(&(W[TWVL * 8]), raw5);
          d25 = VSUB(tw2, tw5);
          s25 = VADD(tw2, tw5);

          /* Odd slots are built from the pair differences. */
          odd_rot = VBYI(VMUL(LDK(KP866025403), VSUB(d41, d25)));
          odd_sum = VADD(d25, d41);
          odd_base = VFNMS(LDK(KP500000000), odd_sum, d03);
          ST(&(x[WS(ios, 3)]), VADD(d03, odd_sum), dist, &(x[WS(ios, 1)]));
          ST(&(x[WS(ios, 1)]), VADD(odd_base, odd_rot), dist, &(x[WS(ios, 1)]));
          ST(&(x[WS(ios, 5)]), VSUB(odd_base, odd_rot), dist, &(x[WS(ios, 1)]));

          /* Even slots are built from the pair sums. */
          even_rot = VBYI(VMUL(LDK(KP866025403), VSUB(s41, s25)));
          even_sum = VADD(s25, s41);
          even_base = VFNMS(LDK(KP500000000), even_sum, s03);
          ST(&(x[0]), VADD(s03, even_sum), dist, &(x[0]));
          ST(&(x[WS(ios, 4)]), VADD(even_base, even_rot), dist, &(x[0]));
          ST(&(x[WS(ios, 2)]), VSUB(even_base, even_rot), dist, &(x[0]));

          remaining -= VL;
          x += VL * dist;
          W += TWVL * 10;
     }
     END_SIMD();
     return W;
}
示例#8
0
/*
 * n2bv_7: 7-point SIMD butterfly, out of place.
 *
 * Reads through ii at WS(is, 0..6) and writes through io at the fixed even
 * offsets xo[0], xo[2], ..., xo[12]; ri, ro and os are not referenced.
 * Inputs are folded into the pairs (1,6), (2,5) and (3,4); each pair's sum
 * and difference then feed three weighted combinations, and every
 * combination produces two output slots.
 *
 * Each pass covers VL of the v requested iterations, then moves the input
 * pointer by VL * ivs and the output pointer by VL * ovs.
 */
static void n2bv_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
     int remaining;
     const R *xi = ii;
     R *xo = io;
     BEGIN_SIMD();
     remaining = v;
     while (remaining > 0) {
          V in0, in1, in2, in3, in4, in5, in6;
          V d25, s25, d16, s16, d34, s34;
          V rot, re;

          in0 = LD(&(xi[0]), ivs, &(xi[0]));
          in2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
          in5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
          d25 = VSUB(in2, in5);
          s25 = VADD(in2, in5);
          in1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
          in6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
          d16 = VSUB(in1, in6);
          s16 = VADD(in1, in6);
          in3 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
          in4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
          d34 = VSUB(in3, in4);
          s34 = VADD(in3, in4);

          /* Slot xo[0]: plain sum of all seven inputs. */
          ST(&(xo[0]), VADD(in0, VADD(s16, VADD(s25, s34))), ovs, &(xo[0]));

          /* Slots xo[4] and xo[10]. */
          rot = VBYI(VFNMS(LDK(KP781831482), d34, VFNMS(LDK(KP433883739), d25, VMUL(LDK(KP974927912), d16))));
          re = VFMA(LDK(KP623489801), s34, VFNMS(LDK(KP900968867), s25, VFNMS(LDK(KP222520933), s16, in0)));
          ST(&(xo[4]), VADD(rot, re), ovs, &(xo[0]));
          ST(&(xo[10]), VSUB(re, rot), ovs, &(xo[2]));

          /* Slots xo[6] and xo[8]. */
          rot = VBYI(VFMA(LDK(KP433883739), d16, VFNMS(LDK(KP781831482), d25, VMUL(LDK(KP974927912), d34))));
          re = VFMA(LDK(KP623489801), s25, VFNMS(LDK(KP222520933), s34, VFNMS(LDK(KP900968867), s16, in0)));
          ST(&(xo[6]), VADD(rot, re), ovs, &(xo[2]));
          ST(&(xo[8]), VSUB(re, rot), ovs, &(xo[0]));

          /* Slots xo[2] and xo[12]. */
          rot = VBYI(VFMA(LDK(KP781831482), d16, VFMA(LDK(KP974927912), d25, VMUL(LDK(KP433883739), d34))));
          re = VFMA(LDK(KP623489801), s16, VFNMS(LDK(KP900968867), s34, VFNMS(LDK(KP222520933), s25, in0)));
          ST(&(xo[2]), VADD(rot, re), ovs, &(xo[2]));
          ST(&(xo[12]), VSUB(re, rot), ovs, &(xo[0]));

          remaining -= VL;
          xi += VL * ivs;
          xo += VL * ovs;
     }
     END_SIMD();
}
/*
 * t1fv_2: 2-point in-place SIMD twiddle butterfly.
 *
 * Data is accessed only through ri; ii is not referenced.  The element at
 * WS(ios, 1) is passed through BYTWJ with the twiddle at W[0]; the
 * difference and sum with element 0 are stored to slots 1 and 0.
 *
 * One pass accounts for VL of the m requested iterations; afterwards x moves
 * on by VL * dist and W by TWVL * 2.  The advanced W pointer is returned.
 */
static const R *t1fv_2(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
     int remaining;
     R *x = ri;
     BEGIN_SIMD();
     remaining = m;
     while (remaining > 0) {
          V in0, raw1, tw1;

          in0 = LD(&(x[0]), dist, &(x[0]));
          raw1 = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
          tw1 = BYTWJ(&(W[0]), raw1);
          ST(&(x[WS(ios, 1)]), VSUB(in0, tw1), dist, &(x[WS(ios, 1)]));
          ST(&(x[0]), VADD(in0, tw1), dist, &(x[0]));

          remaining -= VL;
          x += VL * dist;
          W += TWVL * 2;
     }
     END_SIMD();
     return W;
}
示例#10
0
/*
 * n2fv_2: 2-point SIMD butterfly, out of place.
 *
 * Reads through ri at offsets 0 and WS(is, 1); writes the sum to xo[0] and
 * the difference to xo[2] through ro.  ii, io and os are not referenced.
 * Each pass covers VL of the v requested iterations, then moves the input
 * pointer by VL * ivs and the output pointer by VL * ovs.
 */
static void n2fv_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     int remaining;
     const R *xi = ri;
     R *xo = ro;
     BEGIN_SIMD();
     remaining = v;
     while (remaining > 0) {
          V in0, in1;

          in0 = LD(&(xi[0]), ivs, &(xi[0]));
          in1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
          ST(&(xo[2]), VSUB(in0, in1), ovs, &(xo[2]));
          ST(&(xo[0]), VADD(in0, in1), ovs, &(xo[0]));

          remaining -= VL;
          xi += VL * ivs;
          xo += VL * ovs;
     }
     END_SIMD();
}
示例#11
0
/*
 * n2fv_5: 5-point SIMD butterfly, out of place.
 *
 * Reads through ri at WS(is, 0..4) and writes through ro at the fixed even
 * offsets xo[0], xo[2], ..., xo[8]; ii, io and os are not referenced.
 * Inputs are folded into the pairs (1,4) and (2,3) before being combined.
 *
 * Each pass covers VL of the v requested iterations, then moves the input
 * pointer by VL * ivs and the output pointer by VL * ovs.
 */
static void n2fv_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     int remaining;
     const R *xi = ri;
     R *xo = ro;
     BEGIN_SIMD();
     remaining = v;
     while (remaining > 0) {
          V in0, in1, in2, in3, in4;
          V s14, s23, d14, d23, pair_total, scaled_gap;
          V rot_a, rot_b, base, upper, lower;

          in0 = LD(&(xi[0]), ivs, &(xi[0]));
          in1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
          in4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
          s14 = VADD(in1, in4);
          in2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
          in3 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
          s23 = VADD(in2, in3);
          scaled_gap = VMUL(LDK(KP559016994), VSUB(s14, s23));
          d23 = VSUB(in2, in3);
          pair_total = VADD(s14, s23);
          d14 = VSUB(in1, in4);

          /* Slot xo[0]: plain sum of all five inputs. */
          ST(&(xo[0]), VADD(in0, pair_total), ovs, &(xo[0]));

          rot_a = VBYI(VFMA(LDK(KP951056516), d14, VMUL(LDK(KP587785252), d23)));
          rot_b = VBYI(VFNMS(LDK(KP587785252), d14, VMUL(LDK(KP951056516), d23)));
          base = VFNMS(LDK(KP250000000), pair_total, in0);
          upper = VADD(scaled_gap, base);
          lower = VSUB(base, scaled_gap);
          ST(&(xo[2]), VSUB(upper, rot_a), ovs, &(xo[2]));
          ST(&(xo[6]), VSUB(lower, rot_b), ovs, &(xo[2]));
          ST(&(xo[8]), VADD(rot_a, upper), ovs, &(xo[0]));
          ST(&(xo[4]), VADD(rot_b, lower), ovs, &(xo[0]));

          remaining -= VL;
          xi += VL * ivs;
          xo += VL * ovs;
     }
     END_SIMD();
}
示例#12
0
/*
 * q1fv_2: pair of 2-point in-place SIMD twiddle butterflies whose results
 * are stored crosswise.
 *
 * Data is accessed only through ri; ii is not referenced.  Two input pairs
 * are read: (x[0], x[WS(is, 1)]) and
 * (x[WS(vs, 1)], x[WS(vs, 1) + WS(is, 1)]).  Both pair sums land in the first
 * row (x[0] and x[WS(is, 1)]); both pair differences, after BYTWJ with W[0],
 * land in the second row (x[WS(vs, 1)] and x[WS(vs, 1) + WS(is, 1)]).
 *
 * The loop counts up from 0 to m in steps of VL, moving x by VL * dist and W
 * by TWVL * 2 per pass.  The advanced W pointer is returned.
 */
static const R *q1fv_2(R *ri, R *ii, const R *W, stride is, stride vs, int m, int dist)
{
     int done = 0;
     R *x = ri;
     BEGIN_SIMD();
     while (done < m) {
          V a0, a1, a_diff_tw, b0, b1, b_diff_tw;

          /* All four loads happen before the first store. */
          a0 = LD(&(x[0]), dist, &(x[0]));
          a1 = LD(&(x[WS(is, 1)]), dist, &(x[WS(is, 1)]));
          a_diff_tw = BYTWJ(&(W[0]), VSUB(a0, a1));
          b0 = LD(&(x[WS(vs, 1)]), dist, &(x[WS(vs, 1)]));
          b1 = LD(&(x[WS(vs, 1) + WS(is, 1)]), dist, &(x[WS(vs, 1) + WS(is, 1)]));
          b_diff_tw = BYTWJ(&(W[0]), VSUB(b0, b1));

          ST(&(x[WS(vs, 1)]), a_diff_tw, dist, &(x[WS(vs, 1)]));
          ST(&(x[WS(vs, 1) + WS(is, 1)]), b_diff_tw, dist, &(x[WS(vs, 1) + WS(is, 1)]));
          ST(&(x[0]), VADD(a0, a1), dist, &(x[0]));
          ST(&(x[WS(is, 1)]), VADD(b0, b1), dist, &(x[WS(is, 1)]));

          done += VL;
          x += VL * dist;
          W += TWVL * 2;
     }
     END_SIMD();
     return W;
}
示例#13
0
/*
 * n2fv_3: 3-point SIMD butterfly, out of place.
 *
 * Reads through ri at WS(is, 0..2) and writes through ro at xo[0], xo[2]
 * and xo[4]; ii, io and os are not referenced.  Each pass covers VL of the v
 * requested iterations, then moves the input pointer by VL * ivs and the
 * output pointer by VL * ovs.
 */
static void n2fv_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     int remaining;
     const R *xi = ri;
     R *xo = ro;
     BEGIN_SIMD();
     remaining = v;
     while (remaining > 0) {
          V in0, in1, in2, sum12, rot, base;

          in0 = LD(&(xi[0]), ivs, &(xi[0]));
          in1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
          in2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
          sum12 = VADD(in1, in2);
          rot = VBYI(VMUL(LDK(KP866025403), VSUB(in2, in1)));

          ST(&(xo[0]), VADD(in0, sum12), ovs, &(xo[0]));
          base = VFNMS(LDK(KP500000000), sum12, in0);
          ST(&(xo[4]), VSUB(base, rot), ovs, &(xo[0]));
          ST(&(xo[2]), VADD(base, rot), ovs, &(xo[2]));

          remaining -= VL;
          xi += VL * ivs;
          xo += VL * ovs;
     }
     END_SIMD();
}
示例#14
0
/*
 * n2bv_9: 9-point SIMD butterfly, out of place.
 *
 * Reads through ii at WS(is, 0..8) and writes through io at the fixed even
 * offsets xo[0], xo[2], ..., xo[16]; ri, ro and os are not referenced.
 * Each pass of the loop covers VL of the v requested iterations, then moves
 * xi by VL * ivs and xo by VL * ovs.
 *
 * The inputs are consumed as three groups of three -- (0,3,6), (2,5,8) and
 * (1,4,7) -- and the partial results of those groups are then combined into
 * the nine stores.
 *
 * NOTE(review): the third argument of LD/ST looks like an alignment reference
 * (odd input indices pass xi[WS(is, 1)], odd output slots pass xo[2], the
 * rest pass element 0) -- confirm against the macro definitions.
 */
static void n2bv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP342020143, +0.342020143325668733044099614682259580763083368);
     DVK(KP813797681, +0.813797681349373692844693217248393223289101568);
     DVK(KP939692620, +0.939692620785908384054109277324731469936208134);
     DVK(KP296198132, +0.296198132726023843175338011893050938967728390);
     DVK(KP642787609, +0.642787609686539326322643409907263432907559884);
     DVK(KP663413948, +0.663413948168938396205421319635891297216863310);
     DVK(KP556670399, +0.556670399226419366452912952047023132968291906);
     DVK(KP766044443, +0.766044443118978035202392650555416673935832457);
     DVK(KP984807753, +0.984807753012208059366743024589523013670643252);
     DVK(KP150383733, +0.150383733180435296639271897612501926072238258);
     DVK(KP852868531, +0.852868531952443209628250963940074071936020296);
     DVK(KP173648177, +0.173648177666930348851716626769314796000375677);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     int i;
     const R *xi;
     R *xo;
     /* Input comes from ii, output goes to io. */
     xi = ii;
     xo = io;
     BEGIN_SIMD();
     for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) {
	  V T5, Ty, Tm, Ti, Tw, Th, Tj, To, Tb, Tv, Ta, Tc, Tn;
	  /* Group (0, 3, 6): Ty is the plain sum of the three inputs. */
	  {
	       V T1, T2, T3, T4;
	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
	       T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
	       T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
	       T4 = VADD(T2, T3);
	       T5 = VFNMS(LDK(KP500000000), T4, T1);
	       Ty = VADD(T1, T4);
	       Tm = VMUL(LDK(KP866025403), VSUB(T2, T3));
	  }
	  /* Group (2, 5, 8): Tw is the plain sum of the three inputs. */
	  {
	       V Td, Tg, Te, Tf;
	       Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
	       Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
	       Tf = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
	       Tg = VADD(Te, Tf);
	       Ti = VSUB(Te, Tf);
	       Tw = VADD(Td, Tg);
	       Th = VFNMS(LDK(KP500000000), Tg, Td);
	       Tj = VFNMS(LDK(KP852868531), Ti, VMUL(LDK(KP173648177), Th));
	       To = VFMA(LDK(KP150383733), Ti, VMUL(LDK(KP984807753), Th));
	  }
	  /* Group (1, 4, 7): Tv is the plain sum of the three inputs. */
	  {
	       V T6, T9, T7, T8;
	       T6 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
	       T7 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
	       T8 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
	       T9 = VADD(T7, T8);
	       Tb = VSUB(T7, T8);
	       Tv = VADD(T6, T9);
	       Ta = VFNMS(LDK(KP500000000), T9, T6);
	       Tc = VFNMS(LDK(KP556670399), Tb, VMUL(LDK(KP766044443), Ta));
	       Tn = VFMA(LDK(KP663413948), Tb, VMUL(LDK(KP642787609), Ta));
	  }
	  /* Every load is done; combine the group results and store. */
	  {
	       V Tx, Tz, TA, Tt, Tu;
	       /* Slots xo[0], xo[6], xo[12] depend only on the three group sums. */
	       Tx = VBYI(VMUL(LDK(KP866025403), VSUB(Tv, Tw)));
	       Tz = VADD(Tv, Tw);
	       TA = VFNMS(LDK(KP500000000), Tz, Ty);
	       ST(&(xo[6]), VADD(Tx, TA), ovs, &(xo[2]));
	       ST(&(xo[0]), VADD(Ty, Tz), ovs, &(xo[0]));
	       ST(&(xo[12]), VSUB(TA, Tx), ovs, &(xo[0]));
	       /* Slots xo[14] and xo[4] share Tt and Tu with opposite signs. */
	       Tt = VFMA(LDK(KP852868531), Tb, VFMA(LDK(KP173648177), Ta, VFMA(LDK(KP296198132), Ti, VFNMS(LDK(KP939692620), Th, T5))));
	       Tu = VBYI(VSUB(VFMA(LDK(KP984807753), Ta, VFMA(LDK(KP813797681), Ti, VFNMS(LDK(KP150383733), Tb, VMUL(LDK(KP342020143), Th)))), Tm));
	       ST(&(xo[14]), VSUB(Tt, Tu), ovs, &(xo[2]));
	       ST(&(xo[4]), VADD(Tt, Tu), ovs, &(xo[0]));
	       /* Remaining slots: xo[16]/xo[2] from Tl, Tq and xo[10]/xo[8] from Ts, Tr. */
	       {
		    V Tl, Ts, Tq, Tr, Tk, Tp;
		    Tk = VADD(Tc, Tj);
		    Tl = VADD(T5, Tk);
		    Ts = VFMA(LDK(KP866025403), VSUB(To, Tn), VFNMS(LDK(KP500000000), Tk, T5));
		    Tp = VADD(Tn, To);
		    Tq = VBYI(VADD(Tm, Tp));
		    Tr = VBYI(VADD(Tm, VFNMS(LDK(KP500000000), Tp, VMUL(LDK(KP866025403), VSUB(Tc, Tj)))));
		    ST(&(xo[16]), VSUB(Tl, Tq), ovs, &(xo[0]));
		    ST(&(xo[10]), VSUB(Ts, Tr), ovs, &(xo[2]));
		    ST(&(xo[2]), VADD(Tl, Tq), ovs, &(xo[2]));
		    ST(&(xo[8]), VADD(Tr, Ts), ovs, &(xo[0]));
	       }
	  }
     }
     END_SIMD();
}
示例#15
0
/*
 * n1fv_15: 15-point SIMD butterfly, out of place.
 *
 * Reads through ri at WS(is, 0..14) and writes through ro at WS(os, 0..14);
 * ii and io are not referenced.  Each pass of the loop covers VL of the v
 * requested iterations, then moves xi by VL * ivs and xo by VL * ovs.
 *
 * The inputs are consumed as five triples -- (0,5,10), (3,8,13), (9,14,4),
 * (12,2,7) and (6,11,1).  For each triple the code forms the sum of the last
 * two members, a VFNMS of that sum against the first member, and a
 * difference of the last two members; these partial results are then
 * combined into the fifteen stores.
 *
 * NOTE(review): the third argument of LD/ST looks like an alignment reference
 * (odd indices pass the element at stride 1, even indices pass element 0) --
 * confirm against the macro definitions.
 */
static void n1fv_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP216506350, +0.216506350946109661690930792688234045867850657);
     DVK(KP509036960, +0.509036960455127183450980863393907648510733164);
     DVK(KP823639103, +0.823639103546331925877420039278190003029660514);
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP484122918, +0.484122918275927110647408174972799951354115213);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     int i;
     const R *xi;
     R *xo;
     /* Input comes from ri, output goes to ro. */
     xi = ri;
     xo = ro;
     BEGIN_SIMD();
     for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) {
	  V T5, T10, TB, TO, TU, TV, TR, Ta, Tf, Tg, Tl, Tq, Tr, TE, TH;
	  V TI, TZ, T11, T1f, T1g;
	  /* Triple (0, 5, 10): T5 is the plain sum of its three inputs. */
	  {
	       V T1, T2, T3, T4;
	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
	       T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
	       T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
	       T4 = VADD(T2, T3);
	       T5 = VADD(T1, T4);
	       T10 = VSUB(T3, T2);
	       TB = VFNMS(LDK(KP500000000), T4, T1);
	  }
	  /* The other four triples, followed by the cross-triple sums and differences. */
	  {
	       V T6, T9, TC, TP, Tm, Tp, TG, TN, Tb, Te, TD, TQ, Th, Tk, TF;
	       V TM, TX, TY;
	       /* Triples (3, 8, 13) and (9, 14, 4). */
	       {
		    V T7, T8, Tn, To;
		    T6 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
		    T7 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
		    T8 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
		    T9 = VADD(T7, T8);
		    TC = VFNMS(LDK(KP500000000), T9, T6);
		    TP = VSUB(T8, T7);
		    Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
		    Tn = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
		    To = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
		    Tp = VADD(Tn, To);
		    TG = VFNMS(LDK(KP500000000), Tp, Tm);
		    TN = VSUB(To, Tn);
	       }
	       /* Triples (12, 2, 7) and (6, 11, 1). */
	       {
		    V Tc, Td, Ti, Tj;
		    Tb = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
		    Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
		    Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
		    Te = VADD(Tc, Td);
		    TD = VFNMS(LDK(KP500000000), Te, Tb);
		    TQ = VSUB(Td, Tc);
		    Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
		    Ti = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
		    Tj = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
		    Tk = VADD(Ti, Tj);
		    TF = VFNMS(LDK(KP500000000), Tk, Th);
		    TM = VSUB(Tj, Ti);
	       }
	       /* Every load is done; pair up the per-triple results. */
	       TO = VSUB(TM, TN);
	       TU = VSUB(TF, TG);
	       TV = VSUB(TC, TD);
	       TR = VSUB(TP, TQ);
	       /* Ta, Tf, Tl, Tq are the plain three-input sums of the four triples. */
	       Ta = VADD(T6, T9);
	       Tf = VADD(Tb, Te);
	       Tg = VADD(Ta, Tf);
	       Tl = VADD(Th, Tk);
	       Tq = VADD(Tm, Tp);
	       Tr = VADD(Tl, Tq);
	       TE = VADD(TC, TD);
	       TH = VADD(TF, TG);
	       TI = VADD(TE, TH);
	       TX = VADD(TP, TQ);
	       TY = VADD(TM, TN);
	       TZ = VMUL(LDK(KP484122918), VSUB(TX, TY));
	       T11 = VADD(TX, TY);
	  }
	  /* Slots 5 and 10. */
	  T1f = VADD(TB, TI);
	  T1g = VBYI(VMUL(LDK(KP866025403), VADD(T10, T11)));
	  ST(&(xo[WS(os, 5)]), VSUB(T1f, T1g), ovs, &(xo[WS(os, 1)]));
	  ST(&(xo[WS(os, 10)]), VADD(T1f, T1g), ovs, &(xo[0]));
	  /* Slots 0, 3, 6, 9, 12: built only from the plain triple sums. */
	  {
	       V Tu, Ts, Tt, Ty, TA, Tw, Tx, Tz, Tv;
	       Tu = VMUL(LDK(KP559016994), VSUB(Tg, Tr));
	       Ts = VADD(Tg, Tr);
	       Tt = VFNMS(LDK(KP250000000), Ts, T5);
	       Tw = VSUB(Tl, Tq);
	       Tx = VSUB(Ta, Tf);
	       Ty = VBYI(VFNMS(LDK(KP587785252), Tx, VMUL(LDK(KP951056516), Tw)));
	       TA = VBYI(VFMA(LDK(KP951056516), Tx, VMUL(LDK(KP587785252), Tw)));
	       ST(&(xo[0]), VADD(T5, Ts), ovs, &(xo[0]));
	       Tz = VADD(Tu, Tt);
	       ST(&(xo[WS(os, 6)]), VSUB(Tz, TA), ovs, &(xo[0]));
	       ST(&(xo[WS(os, 9)]), VADD(TA, Tz), ovs, &(xo[WS(os, 1)]));
	       Tv = VSUB(Tt, Tu);
	       ST(&(xo[WS(os, 3)]), VSUB(Tv, Ty), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 12)]), VADD(Ty, Tv), ovs, &(xo[0]));
	  }
	  /* Remaining eight slots: 1, 2, 4, 7, 8, 11, 13, 14. */
	  {
	       V TS, TW, T1b, T18, T13, T1a, TL, T17, T12, TJ, TK;
	       TS = VFNMS(LDK(KP509036960), TR, VMUL(LDK(KP823639103), TO));
	       TW = VFNMS(LDK(KP587785252), TV, VMUL(LDK(KP951056516), TU));
	       T1b = VFMA(LDK(KP951056516), TV, VMUL(LDK(KP587785252), TU));
	       T18 = VFMA(LDK(KP823639103), TR, VMUL(LDK(KP509036960), TO));
	       T12 = VFNMS(LDK(KP216506350), T11, VMUL(LDK(KP866025403), T10));
	       T13 = VSUB(TZ, T12);
	       T1a = VADD(TZ, T12);
	       TJ = VFNMS(LDK(KP250000000), TI, TB);
	       TK = VMUL(LDK(KP559016994), VSUB(TE, TH));
	       TL = VSUB(TJ, TK);
	       T17 = VADD(TK, TJ);
	       /* Slots 8/7 and 11/4. */
	       {
		    V TT, T14, T1d, T1e;
		    TT = VSUB(TL, TS);
		    T14 = VBYI(VSUB(TW, T13));
		    ST(&(xo[WS(os, 8)]), VSUB(TT, T14), ovs, &(xo[0]));
		    ST(&(xo[WS(os, 7)]), VADD(TT, T14), ovs, &(xo[WS(os, 1)]));
		    T1d = VSUB(T17, T18);
		    T1e = VBYI(VADD(T1b, T1a));
		    ST(&(xo[WS(os, 11)]), VSUB(T1d, T1e), ovs, &(xo[WS(os, 1)]));
		    ST(&(xo[WS(os, 4)]), VADD(T1d, T1e), ovs, &(xo[0]));
	       }
	       /* Slots 13/2 and 14/1. */
	       {
		    V T15, T16, T19, T1c;
		    T15 = VADD(TL, TS);
		    T16 = VBYI(VADD(TW, T13));
		    ST(&(xo[WS(os, 13)]), VSUB(T15, T16), ovs, &(xo[WS(os, 1)]));
		    ST(&(xo[WS(os, 2)]), VADD(T15, T16), ovs, &(xo[0]));
		    T19 = VADD(T17, T18);
		    T1c = VBYI(VSUB(T1a, T1b));
		    ST(&(xo[WS(os, 14)]), VSUB(T19, T1c), ovs, &(xo[0]));
		    ST(&(xo[WS(os, 1)]), VADD(T19, T1c), ovs, &(xo[WS(os, 1)]));
	       }
	  }
     }
     END_SIMD();
}
示例#16
0
/*
 * n1fv_10: 10-point SIMD butterfly, out of place.
 *
 * Reads through ri at WS(is, 0..9) and writes through ro at WS(os, 0..9);
 * ii and io are not referenced.  Inputs are folded into the pairs (0,5),
 * (2,7), (6,1), (8,3) and (4,9).  The pair differences feed the odd output
 * slots and the pair sums feed the even output slots.
 *
 * Each pass covers VL of the v requested iterations, then moves the input
 * pointer by VL * ivs and the output pointer by VL * ovs.
 */
static void n1fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     int remaining;
     const R *xi = ri;
     R *xo = ro;
     BEGIN_SIMD();
     remaining = v;
     while (remaining > 0) {
          V in0, in1, in2, in3, in4, in5, in6, in7, in8, in9;
          V d05, s05, d27, s27, d61, s61, d83, s83, d49, s49;
          V odd_gap_a, odd_gap_b, odd_part_a, odd_part_b, odd_total;
          V even_gap_a, even_gap_b, even_part_a, even_part_b, even_total;
          V rot_a, rot_b, scaled, base, upper, lower;

          /* Load phase: every input is read before any output is written. */
          in0 = LD(&(xi[0]), ivs, &(xi[0]));
          in5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
          d05 = VSUB(in0, in5);
          s05 = VADD(in0, in5);
          in2 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
          in7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
          d27 = VSUB(in2, in7);
          s27 = VADD(in2, in7);
          in6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
          in1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
          d61 = VSUB(in6, in1);
          s61 = VADD(in6, in1);
          in8 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
          in3 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
          d83 = VSUB(in8, in3);
          s83 = VADD(in8, in3);
          in4 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
          in9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
          d49 = VSUB(in4, in9);
          s49 = VADD(in4, in9);

          odd_gap_a = VSUB(d27, d83);
          odd_gap_b = VSUB(d49, d61);
          even_gap_a = VSUB(s27, s83);
          even_gap_b = VSUB(s49, s61);
          even_part_a = VADD(s27, s83);
          even_part_b = VADD(s49, s61);
          even_total = VADD(even_part_a, even_part_b);
          odd_part_a = VADD(d27, d83);
          odd_part_b = VADD(d49, d61);
          odd_total = VADD(odd_part_a, odd_part_b);

          ST(&(xo[WS(os, 5)]), VADD(d05, odd_total), ovs, &(xo[WS(os, 1)]));
          ST(&(xo[0]), VADD(s05, even_total), ovs, &(xo[0]));

          /* Odd slots 1, 7, 9, 3. */
          rot_a = VBYI(VFMA(LDK(KP951056516), odd_gap_a, VMUL(LDK(KP587785252), odd_gap_b)));
          rot_b = VBYI(VFNMS(LDK(KP587785252), odd_gap_a, VMUL(LDK(KP951056516), odd_gap_b)));
          scaled = VMUL(LDK(KP559016994), VSUB(odd_part_a, odd_part_b));
          base = VFNMS(LDK(KP250000000), odd_total, d05);
          upper = VADD(scaled, base);
          lower = VSUB(base, scaled);
          ST(&(xo[WS(os, 1)]), VSUB(upper, rot_a), ovs, &(xo[WS(os, 1)]));
          ST(&(xo[WS(os, 7)]), VADD(rot_b, lower), ovs, &(xo[WS(os, 1)]));
          ST(&(xo[WS(os, 9)]), VADD(rot_a, upper), ovs, &(xo[WS(os, 1)]));
          ST(&(xo[WS(os, 3)]), VSUB(lower, rot_b), ovs, &(xo[WS(os, 1)]));

          /* Even slots 2, 6, 8, 4. */
          rot_a = VBYI(VFNMS(LDK(KP587785252), even_gap_a, VMUL(LDK(KP951056516), even_gap_b)));
          rot_b = VBYI(VFMA(LDK(KP951056516), even_gap_a, VMUL(LDK(KP587785252), even_gap_b)));
          base = VFNMS(LDK(KP250000000), even_total, s05);
          scaled = VMUL(LDK(KP559016994), VSUB(even_part_a, even_part_b));
          lower = VSUB(base, scaled);
          upper = VADD(scaled, base);
          ST(&(xo[WS(os, 2)]), VADD(rot_a, lower), ovs, &(xo[0]));
          ST(&(xo[WS(os, 6)]), VSUB(upper, rot_b), ovs, &(xo[0]));
          ST(&(xo[WS(os, 8)]), VSUB(lower, rot_a), ovs, &(xo[0]));
          ST(&(xo[WS(os, 4)]), VADD(rot_b, upper), ovs, &(xo[0]));

          remaining -= VL;
          xi += VL * ivs;
          xo += VL * ovs;
     }
     END_SIMD();
}
示例#17
0
/*
 * n2fv_13: 13-point SIMD butterfly, out of place.
 *
 * Reads through ri at WS(is, 0..12) and writes through ro at the fixed even
 * offsets xo[0], xo[2], ..., xo[24]; ii, io and os are not referenced.
 * Each pass of the loop covers VL of the v requested iterations, then moves
 * xi by VL * ivs and xo by VL * ovs.
 *
 * The first inner block performs every load except input 0 and reduces the
 * twelve values to a handful of sums and differences; the second block mixes
 * those with the DVK constants and issues the twelve stores other than xo[0].
 *
 * NOTE(review): the third argument of LD/ST looks like an alignment reference
 * (odd input indices pass xi[WS(is, 1)], odd output slots pass xo[2], the
 * rest pass element 0) -- confirm against the macro definitions.
 */
static void n2fv_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
     DVK(KP083333333, +0.083333333333333333333333333333333333333333333);
     DVK(KP075902986, +0.075902986037193865983102897245103540356428373);
     DVK(KP251768516, +0.251768516431883313623436926934233488546674281);
     DVK(KP132983124, +0.132983124607418643793760531921092974399165133);
     DVK(KP258260390, +0.258260390311744861420450644284508567852516811);
     DVK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
     DVK(KP300238635, +0.300238635966332641462884626667381504676006424);
     DVK(KP011599105, +0.011599105605768290721655456654083252189827041);
     DVK(KP156891391, +0.156891391051584611046832726756003269660212636);
     DVK(KP256247671, +0.256247671582936600958684654061725059144125175);
     DVK(KP174138601, +0.174138601152135905005660794929264742616964676);
     DVK(KP575140729, +0.575140729474003121368385547455453388461001608);
     DVK(KP503537032, +0.503537032863766627246873853868466977093348562);
     DVK(KP113854479, +0.113854479055790798974654345867655310534642560);
     DVK(KP265966249, +0.265966249214837287587521063842185948798330267);
     DVK(KP387390585, +0.387390585467617292130675966426762851778775217);
     DVK(KP300462606, +0.300462606288665774426601772289207995520941381);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     int i;
     const R *xi;
     R *xo;
     /* Input comes from ri, output goes to ro. */
     xi = ri;
     xo = ro;
     BEGIN_SIMD();
     for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) {
	  V TW, Tb, Tm, Tu, TC, TR, TX, TK, TU, Tz, TB, TN, TT;
	  TW = LD(&(xi[0]), ivs, &(xi[0]));
	  /* Load inputs 1..12 and reduce them to the intermediates used below. */
	  {
	       V T3, TH, Tl, Tw, Tp, Tg, Tv, To, T6, Tr, T9, Ts, Ta, TI, T1;
	       V T2, Tq, Tt;
	       /* Pair (8, 5). */
	       T1 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
	       T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
	       T3 = VSUB(T1, T2);
	       TH = VADD(T1, T2);
	       /* Triple (12, 10, 4): Tl is the plain sum of the three inputs. */
	       {
		    V Th, Ti, Tj, Tk;
		    Th = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
		    Ti = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
		    Tj = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
		    Tk = VADD(Ti, Tj);
		    Tl = VADD(Th, Tk);
		    Tw = VSUB(Ti, Tj);
		    Tp = VFNMS(LDK(KP500000000), Tk, Th);
	       }
	       /* Triple (1, 3, 9): Tg is the plain sum of the three inputs. */
	       {
		    V Tc, Td, Te, Tf;
		    Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
		    Td = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
		    Te = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
		    Tf = VADD(Td, Te);
		    Tg = VADD(Tc, Tf);
		    Tv = VSUB(Td, Te);
		    To = VFNMS(LDK(KP500000000), Tf, Tc);
	       }
	       /* Pairs (11, 6) and (7, 2). */
	       {
		    V T4, T5, T7, T8;
		    T4 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
		    T5 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
		    T6 = VSUB(T4, T5);
		    Tr = VADD(T4, T5);
		    T7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
		    T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
		    T9 = VSUB(T7, T8);
		    Ts = VADD(T7, T8);
	       }
	       /* Every load is done; what follows is arithmetic on the values above. */
	       Ta = VADD(T6, T9);
	       TI = VADD(Tr, Ts);
	       Tb = VADD(T3, Ta);
	       Tm = VSUB(Tg, Tl);
	       Tq = VSUB(To, Tp);
	       Tt = VMUL(LDK(KP866025403), VSUB(Tr, Ts));
	       Tu = VADD(Tq, Tt);
	       TC = VSUB(Tq, Tt);
	       {
		    V TP, TQ, TG, TJ;
		    TP = VADD(Tg, Tl);
		    TQ = VADD(TH, TI);
		    TR = VMUL(LDK(KP300462606), VSUB(TP, TQ));
		    /* TX is the plain sum of inputs 1..12. */
		    TX = VADD(TP, TQ);
		    TG = VADD(To, Tp);
		    TJ = VFNMS(LDK(KP500000000), TI, TH);
		    TK = VSUB(TG, TJ);
		    TU = VADD(TG, TJ);
	       }
	       {
		    V Tx, Ty, TL, TM;
		    Tx = VMUL(LDK(KP866025403), VSUB(Tv, Tw));
		    Ty = VFNMS(LDK(KP500000000), Ta, T3);
		    Tz = VSUB(Tx, Ty);
		    TB = VADD(Tx, Ty);
		    TL = VADD(Tv, Tw);
		    TM = VSUB(T6, T9);
		    TN = VSUB(TL, TM);
		    TT = VADD(TL, TM);
	       }
	  }
	  /* Slot xo[0]: plain sum of all thirteen inputs. */
	  ST(&(xo[0]), VADD(TW, TX), ovs, &(xo[0]));
	  /* Remaining twelve slots, stored as six sum/difference pairs. */
	  {
	       V T19, T1n, T14, T13, T1f, T1k, Tn, TE, T1e, T1j, TS, T1m, TZ, T1c, TA;
	       V TD;
	       {
		    V T17, T18, T11, T12;
		    T17 = VFMA(LDK(KP387390585), TN, VMUL(LDK(KP265966249), TK));
		    T18 = VFNMS(LDK(KP503537032), TU, VMUL(LDK(KP113854479), TT));
		    T19 = VSUB(T17, T18);
		    T1n = VADD(T17, T18);
		    T14 = VFMA(LDK(KP575140729), Tm, VMUL(LDK(KP174138601), Tb));
		    T11 = VFNMS(LDK(KP156891391), TB, VMUL(LDK(KP256247671), TC));
		    T12 = VFMA(LDK(KP011599105), Tz, VMUL(LDK(KP300238635), Tu));
		    T13 = VSUB(T11, T12);
		    T1f = VADD(T14, T13);
		    T1k = VMUL(LDK(KP1_732050807), VADD(T11, T12));
	       }
	       Tn = VFNMS(LDK(KP174138601), Tm, VMUL(LDK(KP575140729), Tb));
	       TA = VFNMS(LDK(KP300238635), Tz, VMUL(LDK(KP011599105), Tu));
	       TD = VFMA(LDK(KP256247671), TB, VMUL(LDK(KP156891391), TC));
	       TE = VSUB(TA, TD);
	       T1e = VMUL(LDK(KP1_732050807), VADD(TD, TA));
	       T1j = VSUB(Tn, TE);
	       {
		    V TO, T1b, TV, TY, T1a;
		    TO = VFNMS(LDK(KP132983124), TN, VMUL(LDK(KP258260390), TK));
		    T1b = VSUB(TR, TO);
		    TV = VFMA(LDK(KP251768516), TT, VMUL(LDK(KP075902986), TU));
		    TY = VFNMS(LDK(KP083333333), TX, TW);
		    T1a = VSUB(TY, TV);
		    TS = VFMA(LDK(KP2_000000000), TO, TR);
		    T1m = VADD(T1b, T1a);
		    TZ = VFMA(LDK(KP2_000000000), TV, TY);
		    T1c = VSUB(T1a, T1b);
	       }
	       {
		    V TF, T10, T1l, T1o;
		    /* Slots xo[2] and xo[24]. */
		    TF = VBYI(VFMA(LDK(KP2_000000000), TE, Tn));
		    T10 = VADD(TS, TZ);
		    ST(&(xo[2]), VADD(TF, T10), ovs, &(xo[2]));
		    ST(&(xo[24]), VSUB(T10, TF), ovs, &(xo[0]));
		    /* Slots xo[10]/xo[16] and xo[8]/xo[18]. */
		    {
			 V T15, T16, T1p, T1q;
			 T15 = VBYI(VFMS(LDK(KP2_000000000), T13, T14));
			 T16 = VSUB(TZ, TS);
			 ST(&(xo[10]), VADD(T15, T16), ovs, &(xo[2]));
			 ST(&(xo[16]), VSUB(T16, T15), ovs, &(xo[0]));
			 T1p = VADD(T1n, T1m);
			 T1q = VBYI(VADD(T1j, T1k));
			 ST(&(xo[8]), VSUB(T1p, T1q), ovs, &(xo[0]));
			 ST(&(xo[18]), VADD(T1q, T1p), ovs, &(xo[2]));
		    }
		    /* Slots xo[6] and xo[20]. */
		    T1l = VBYI(VSUB(T1j, T1k));
		    T1o = VSUB(T1m, T1n);
		    ST(&(xo[6]), VADD(T1l, T1o), ovs, &(xo[2]));
		    ST(&(xo[20]), VSUB(T1o, T1l), ovs, &(xo[0]));
		    /* Slots xo[12]/xo[14] and xo[4]/xo[22]. */
		    {
			 V T1h, T1i, T1d, T1g;
			 T1h = VBYI(VSUB(T1e, T1f));
			 T1i = VSUB(T1c, T19);
			 ST(&(xo[12]), VADD(T1h, T1i), ovs, &(xo[0]));
			 ST(&(xo[14]), VSUB(T1i, T1h), ovs, &(xo[2]));
			 T1d = VADD(T19, T1c);
			 T1g = VBYI(VADD(T1e, T1f));
			 ST(&(xo[4]), VSUB(T1d, T1g), ovs, &(xo[0]));
			 ST(&(xo[22]), VADD(T1g, T1d), ovs, &(xo[2]));
		    }
	       }
	  }
     }
     END_SIMD();
}
示例#18
0
/*
 * n1bv_11: 11-input / 11-output butterfly kernel, out of place.
 *
 * Each loop pass loads 11 values from ii at offsets WS(is, 0..10) and stores
 * 11 results to io at offsets WS(os, 0..10).  The loop starts at i = v and
 * runs while i > 0, decreasing i by VL per pass and advancing the input
 * pointer by VL * ivs and the output pointer by VL * ovs.  ri and ro are not
 * referenced in this body.
 *
 * The constants are the magnitudes of cos(2*pi*k/11) and sin(2*pi*k/11):
 *   KP841253532 = cos(2pi/11)    KP540640817 = sin(2pi/11)
 *   KP415415013 = cos(4pi/11)    KP909631995 = sin(4pi/11)
 *   KP142314838 = |cos(6pi/11)|  KP989821441 = sin(6pi/11)
 *   KP654860733 = |cos(8pi/11)|  KP755749574 = sin(8pi/11)
 *   KP959492973 = |cos(10pi/11)| KP281732556 = sin(10pi/11)
 *
 * NOTE(review): the name and the ii/io usage suggest this is the backward
 * size-11 DFT codelet of a generated SIMD family -- confirm.  V, R, stride,
 * DVK, LDK, LD, ST, WS, VL, VBYI, VFMA, VFNMS and BEGIN_SIMD/END_SIMD are
 * defined elsewhere; the last argument of LD/ST looks like an alignment
 * hint (even indices pass &x[0], odd indices pass &x[WS(.., 1)]) -- confirm.
 */
static void n1bv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP959492973, +0.959492973614497389890368057066327699062454848);
     DVK(KP654860733, +0.654860733945285064056925072466293553183791199);
     DVK(KP142314838, +0.142314838273285140443792668616369668791051361);
     DVK(KP415415013, +0.415415013001886425529274149229623203524004910);
     DVK(KP841253532, +0.841253532831181168861811648919367717513292498);
     DVK(KP540640817, +0.540640817455597582107635954318691695431770608);
     DVK(KP909631995, +0.909631995354518371411715383079028460060241051);
     DVK(KP989821441, +0.989821441880932732376092037776718787376519372);
     DVK(KP755749574, +0.755749574354258283774035843972344420179717445);
     DVK(KP281732556, +0.281732556841429697711417915346616899035777899);
     int i;
     const R *xi;
     R *xo;
     xi = ii;
     xo = io;
     BEGIN_SIMD();
     for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) {
	  V Th, T3, Tm, Tf, Ti, Tc, Tj, T9, Tk, T6, Tl, Ta, Tb, Ts, Tt;
	  /* Th is input 0.  Inputs k and 11-k are then paired into a
	     difference (T3, Tf, T6, Tc, T9 for k = 1, 2, 3, 4, 5) and a
	     sum (Tm, Ti, Tl, Tj, Tk for the same k). */
	  Th = LD(&(xi[0]), ivs, &(xi[0]));
	  {
	       V T1, T2, Td, Te;
	       T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
	       T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
	       T3 = VSUB(T1, T2);
	       Tm = VADD(T1, T2);
	       Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
	       Te = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
	       Tf = VSUB(Td, Te);
	       Ti = VADD(Td, Te);
	  }
	  Ta = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
	  Tb = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
	  Tc = VSUB(Ta, Tb);
	  Tj = VADD(Ta, Tb);
	  {
	       V T7, T8, T4, T5;
	       T7 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
	       T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
	       T9 = VSUB(T7, T8);
	       Tk = VADD(T7, T8);
	       T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
	       T5 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
	       T6 = VSUB(T4, T5);
	       Tl = VADD(T4, T5);
	  }
	  /* Output 0 is the plain sum of all eleven inputs. */
	  ST(&(xo[0]), VADD(Th, VADD(Tm, VADD(Ti, VADD(Tl, VADD(Tj, Tk))))), ovs, &(xo[0]));
	  /* Every remaining output pair (k, 11-k) is formed from one weighted
	     combination of the differences, wrapped in VBYI, and one weighted
	     combination of Th and the sums; the pair is their VADD and VSUB. */
	  {
	       V Tg, Tn, Tu, Tv;
	       /* Outputs 5 and 6. */
	       Tg = VBYI(VFMA(LDK(KP281732556), T3, VFMA(LDK(KP755749574), T6, VFNMS(LDK(KP909631995), Tc, VFNMS(LDK(KP540640817), Tf, VMUL(LDK(KP989821441), T9))))));
	       Tn = VFMA(LDK(KP841253532), Ti, VFMA(LDK(KP415415013), Tj, VFNMS(LDK(KP142314838), Tk, VFNMS(LDK(KP654860733), Tl, VFNMS(LDK(KP959492973), Tm, Th)))));
	       ST(&(xo[WS(os, 5)]), VADD(Tg, Tn), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 6)]), VSUB(Tn, Tg), ovs, &(xo[0]));
	       /* Outputs 4 and 7. */
	       Tu = VBYI(VFMA(LDK(KP755749574), T3, VFMA(LDK(KP540640817), T6, VFNMS(LDK(KP909631995), T9, VFNMS(LDK(KP989821441), Tf, VMUL(LDK(KP281732556), Tc))))));
	       Tv = VFMA(LDK(KP841253532), Tl, VFMA(LDK(KP415415013), Tk, VFNMS(LDK(KP959492973), Tj, VFNMS(LDK(KP142314838), Ti, VFNMS(LDK(KP654860733), Tm, Th)))));
	       ST(&(xo[WS(os, 4)]), VADD(Tu, Tv), ovs, &(xo[0]));
	       ST(&(xo[WS(os, 7)]), VSUB(Tv, Tu), ovs, &(xo[WS(os, 1)]));
	  }
	  /* Outputs 2 and 9. */
	  Ts = VBYI(VFMA(LDK(KP909631995), T3, VFNMS(LDK(KP540640817), T9, VFNMS(LDK(KP989821441), Tc, VFNMS(LDK(KP281732556), T6, VMUL(LDK(KP755749574), Tf))))));
	  Tt = VFMA(LDK(KP415415013), Tm, VFMA(LDK(KP841253532), Tk, VFNMS(LDK(KP142314838), Tj, VFNMS(LDK(KP959492973), Tl, VFNMS(LDK(KP654860733), Ti, Th)))));
	  ST(&(xo[WS(os, 2)]), VADD(Ts, Tt), ovs, &(xo[0]));
	  ST(&(xo[WS(os, 9)]), VSUB(Tt, Ts), ovs, &(xo[WS(os, 1)]));
	  {
	       V Tq, Tr, To, Tp;
	       /* Outputs 1 and 10. */
	       Tq = VBYI(VFMA(LDK(KP540640817), T3, VFMA(LDK(KP909631995), Tf, VFMA(LDK(KP989821441), T6, VFMA(LDK(KP755749574), Tc, VMUL(LDK(KP281732556), T9))))));
	       Tr = VFMA(LDK(KP841253532), Tm, VFMA(LDK(KP415415013), Ti, VFNMS(LDK(KP959492973), Tk, VFNMS(LDK(KP654860733), Tj, VFNMS(LDK(KP142314838), Tl, Th)))));
	       ST(&(xo[WS(os, 1)]), VADD(Tq, Tr), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 10)]), VSUB(Tr, Tq), ovs, &(xo[0]));
	       /* Outputs 3 and 8. */
	       To = VBYI(VFMA(LDK(KP989821441), T3, VFMA(LDK(KP540640817), Tc, VFNMS(LDK(KP909631995), T6, VFNMS(LDK(KP281732556), Tf, VMUL(LDK(KP755749574), T9))))));
	       Tp = VFMA(LDK(KP415415013), Tl, VFMA(LDK(KP841253532), Tj, VFNMS(LDK(KP654860733), Tk, VFNMS(LDK(KP959492973), Ti, VFNMS(LDK(KP142314838), Tm, Th)))));
	       ST(&(xo[WS(os, 3)]), VADD(To, Tp), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 8)]), VSUB(Tp, To), ovs, &(xo[0]));
	  }
     }
     END_SIMD();
}
示例#19
0
/*
 * n2fv_12: 12-input / 12-output butterfly kernel, out of place, with a
 * fixed output layout.
 *
 * Each loop pass loads 12 values from ri at offsets WS(is, 0..11) and stores
 * result k at the literal element offset xo[2 * k] relative to ro; the `os`
 * parameter is not referenced.  The loop starts at i = v and runs while
 * i > 0, decreasing i by VL per pass and advancing the input pointer by
 * VL * ivs and the output pointer by VL * ovs.  ii and io are not referenced
 * in this body.
 *
 * KP866025403 is sqrt(3)/2 and KP500000000 is 1/2.
 *
 * NOTE(review): the name and the ri/ro usage suggest this is the forward
 * size-12 DFT codelet of a generated SIMD family, "n2" being the variant
 * with hard-wired output stride -- confirm.  V, R, stride, DVK, LDK, LD,
 * ST, WS, VL, VBYI, VFNMS and BEGIN_SIMD/END_SIMD are defined elsewhere;
 * the last argument of LD/ST looks like an alignment hint -- confirm.
 */
static void n2fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     int i;
     const R *xi;
     R *xo;
     xi = ri;
     xo = ro;
     BEGIN_SIMD();
     for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) {
	  V T5, Ta, TJ, Ty, Tq, Tp, Tg, Tl, TI, TA, Tz, Tu;
	  /* Even-indexed inputs, grouped as (0; 4, 8) and (6; 10, 2). */
	  {
	       V T1, T6, T4, Tw, T9, Tx;
	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
	       T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
	       {
		    V T2, T3, T7, T8;
		    T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
		    T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
		    T4 = VADD(T2, T3);
		    Tw = VSUB(T3, T2);
		    T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
		    T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
		    T9 = VADD(T7, T8);
		    Tx = VSUB(T8, T7);
	       }
	       T5 = VADD(T1, T4);
	       Ta = VADD(T6, T9);
	       TJ = VADD(Tw, Tx);
	       Ty = VMUL(LDK(KP866025403), VSUB(Tw, Tx));
	       Tq = VFNMS(LDK(KP500000000), T9, T6);
	       Tp = VFNMS(LDK(KP500000000), T4, T1);
	  }
	  /* Odd-indexed inputs, grouped as (3; 7, 11) and (9; 1, 5). */
	  {
	       V Tc, Th, Tf, Ts, Tk, Tt;
	       Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
	       Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
	       {
		    V Td, Te, Ti, Tj;
		    Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
		    Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
		    Tf = VADD(Td, Te);
		    Ts = VSUB(Te, Td);
		    Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
		    Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
		    Tk = VADD(Ti, Tj);
		    Tt = VSUB(Tj, Ti);
	       }
	       Tg = VADD(Tc, Tf);
	       Tl = VADD(Th, Tk);
	       TI = VADD(Ts, Tt);
	       TA = VFNMS(LDK(KP500000000), Tk, Th);
	       Tz = VFNMS(LDK(KP500000000), Tf, Tc);
	       Tu = VMUL(LDK(KP866025403), VSUB(Ts, Tt));
	  }
	  /* Results 9, 3, 6 and 0 (element offsets 18, 6, 12, 0). */
	  {
	       V Tb, Tm, Tn, To;
	       Tb = VSUB(T5, Ta);
	       Tm = VBYI(VSUB(Tg, Tl));
	       ST(&(xo[18]), VSUB(Tb, Tm), ovs, &(xo[2]));
	       ST(&(xo[6]), VADD(Tb, Tm), ovs, &(xo[2]));
	       Tn = VADD(T5, Ta);
	       To = VADD(Tg, Tl);
	       ST(&(xo[12]), VSUB(Tn, To), ovs, &(xo[0]));
	       ST(&(xo[0]), VADD(Tn, To), ovs, &(xo[0]));
	  }
	  /* Results 5, 11, 7 and 1 (element offsets 10, 22, 14, 2). */
	  {
	       V Tv, TE, TC, TD, Tr, TB;
	       Tr = VSUB(Tp, Tq);
	       Tv = VSUB(Tr, Tu);
	       TE = VADD(Tr, Tu);
	       TB = VSUB(Tz, TA);
	       TC = VBYI(VADD(Ty, TB));
	       TD = VBYI(VSUB(Ty, TB));
	       ST(&(xo[10]), VSUB(Tv, TC), ovs, &(xo[2]));
	       ST(&(xo[22]), VSUB(TE, TD), ovs, &(xo[2]));
	       ST(&(xo[14]), VADD(TC, Tv), ovs, &(xo[2]));
	       ST(&(xo[2]), VADD(TD, TE), ovs, &(xo[2]));
	  }
	  /* Results 10, 4, 2 and 8 (element offsets 20, 8, 4, 16). */
	  {
	       V TK, TM, TH, TL, TF, TG;
	       TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
	       TM = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
	       TF = VADD(Tp, Tq);
	       TG = VADD(Tz, TA);
	       TH = VSUB(TF, TG);
	       TL = VADD(TF, TG);
	       ST(&(xo[20]), VSUB(TH, TK), ovs, &(xo[0]));
	       ST(&(xo[8]), VADD(TL, TM), ovs, &(xo[0]));
	       ST(&(xo[4]), VADD(TH, TK), ovs, &(xo[0]));
	       ST(&(xo[16]), VSUB(TL, TM), ovs, &(xo[0]));
	  }
     }
     END_SIMD();
}
示例#20
0
/*
 * t1bv_32: in-place 32-point butterfly kernel with per-input twiddle
 * factors.
 *
 * Each loop pass loads 32 values from ii at offsets WS(ios, 0..31), passes
 * every value except index 0 through BYTW together with the table entry at
 * W[TWVL * 2 * (k - 1)] for input k, combines them, and stores the 32
 * results back to the same 32 locations.  The loop starts at i = m and runs
 * while i > 0, decreasing i by VL per pass, advancing the data pointer by
 * VL * dist and W by TWVL * 62.  The advanced W pointer is returned.  ri is
 * not referenced in this body.
 *
 * Constants: KP707106781 = sqrt(2)/2; KP923879532 / KP382683432 =
 * cos / sin(pi/8); KP980785280 / KP195090322 = cos / sin(pi/16);
 * KP831469612 / KP555570233 = cos / sin(3*pi/16).
 *
 * NOTE(review): the name and the use of ii suggest this is the backward
 * size-32 twiddle codelet of a generated SIMD family, BYTW presumably being
 * a complex multiply by a twiddle factor -- confirm.  V, R, stride, DVK,
 * LDK, LD, ST, WS, VL, TWVL, BYTW, VBYI, VFMA, VFNMS and
 * BEGIN_SIMD/END_SIMD are defined elsewhere; the last argument of LD/ST
 * looks like an alignment hint -- confirm.
 */
static const R *t1bv_32(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
     DVK(KP195090322, +0.195090322016128267848284868477022240927691618);
     DVK(KP980785280, +0.980785280403230449126182236134239036973933731);
     DVK(KP555570233, +0.555570233019602224742830813948532874374937191);
     DVK(KP831469612, +0.831469612302545237078788377617905756738560812);
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     int i;
     R *x;
     x = ii;
     BEGIN_SIMD();
     for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 62)) {
	  V T4, T1D, T2P, T3h, Tf, T1y, T2K, T3i, TC, T1w, T2G, T3e, Tr, T1v, T2D;
	  V T3d, T1k, T20, T2y, T3a, T1r, T21, T2v, T39, TV, T1X, T2r, T37, T12, T1Y;
	  V T2o, T36;
	  /* Inputs 0, 24, 16, 8. */
	  {
	       V T1, T1C, T3, T1A, T1B, T2, T1z, T2N, T2O;
	       T1 = LD(&(x[0]), dist, &(x[0]));
	       T1B = LD(&(x[WS(ios, 24)]), dist, &(x[0]));
	       T1C = BYTW(&(W[TWVL * 46]), T1B);
	       T2 = LD(&(x[WS(ios, 16)]), dist, &(x[0]));
	       T3 = BYTW(&(W[TWVL * 30]), T2);
	       T1z = LD(&(x[WS(ios, 8)]), dist, &(x[0]));
	       T1A = BYTW(&(W[TWVL * 14]), T1z);
	       T4 = VSUB(T1, T3);
	       T1D = VSUB(T1A, T1C);
	       T2N = VADD(T1, T3);
	       T2O = VADD(T1A, T1C);
	       T2P = VSUB(T2N, T2O);
	       T3h = VADD(T2N, T2O);
	  }
	  /* Inputs 4, 12, 20, 28. */
	  {
	       V T6, Td, T8, Tb;
	       {
		    V T5, Tc, T7, Ta;
		    T5 = LD(&(x[WS(ios, 4)]), dist, &(x[0]));
		    T6 = BYTW(&(W[TWVL * 6]), T5);
		    Tc = LD(&(x[WS(ios, 12)]), dist, &(x[0]));
		    Td = BYTW(&(W[TWVL * 22]), Tc);
		    T7 = LD(&(x[WS(ios, 20)]), dist, &(x[0]));
		    T8 = BYTW(&(W[TWVL * 38]), T7);
		    Ta = LD(&(x[WS(ios, 28)]), dist, &(x[0]));
		    Tb = BYTW(&(W[TWVL * 54]), Ta);
	       }
	       {
		    V T9, Te, T2I, T2J;
		    T9 = VSUB(T6, T8);
		    Te = VSUB(Tb, Td);
		    Tf = VMUL(LDK(KP707106781), VADD(T9, Te));
		    T1y = VMUL(LDK(KP707106781), VSUB(T9, Te));
		    T2I = VADD(T6, T8);
		    T2J = VADD(Tb, Td);
		    T2K = VSUB(T2I, T2J);
		    T3i = VADD(T2I, T2J);
	       }
	  }
	  /* Inputs 6, 14, 22, 30. */
	  {
	       V Tt, TA, Tv, Ty;
	       {
		    V Ts, Tz, Tu, Tx;
		    Ts = LD(&(x[WS(ios, 6)]), dist, &(x[0]));
		    Tt = BYTW(&(W[TWVL * 10]), Ts);
		    Tz = LD(&(x[WS(ios, 14)]), dist, &(x[0]));
		    TA = BYTW(&(W[TWVL * 26]), Tz);
		    Tu = LD(&(x[WS(ios, 22)]), dist, &(x[0]));
		    Tv = BYTW(&(W[TWVL * 42]), Tu);
		    Tx = LD(&(x[WS(ios, 30)]), dist, &(x[0]));
		    Ty = BYTW(&(W[TWVL * 58]), Tx);
	       }
	       {
		    V Tw, TB, T2E, T2F;
		    Tw = VSUB(Tt, Tv);
		    TB = VSUB(Ty, TA);
		    TC = VFNMS(LDK(KP382683432), TB, VMUL(LDK(KP923879532), Tw));
		    T1w = VFMA(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw));
		    T2E = VADD(Ty, TA);
		    T2F = VADD(Tt, Tv);
		    T2G = VSUB(T2E, T2F);
		    T3e = VADD(T2E, T2F);
	       }
	  }
	  /* Inputs 2, 26, 18, 10. */
	  {
	       V Ti, Tp, Tk, Tn;
	       {
		    V Th, To, Tj, Tm;
		    Th = LD(&(x[WS(ios, 2)]), dist, &(x[0]));
		    Ti = BYTW(&(W[TWVL * 2]), Th);
		    To = LD(&(x[WS(ios, 26)]), dist, &(x[0]));
		    Tp = BYTW(&(W[TWVL * 50]), To);
		    Tj = LD(&(x[WS(ios, 18)]), dist, &(x[0]));
		    Tk = BYTW(&(W[TWVL * 34]), Tj);
		    Tm = LD(&(x[WS(ios, 10)]), dist, &(x[0]));
		    Tn = BYTW(&(W[TWVL * 18]), Tm);
	       }
	       {
		    V Tl, Tq, T2B, T2C;
		    Tl = VSUB(Ti, Tk);
		    Tq = VSUB(Tn, Tp);
		    Tr = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq));
		    T1v = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl));
		    T2B = VADD(Ti, Tk);
		    T2C = VADD(Tn, Tp);
		    T2D = VSUB(T2B, T2C);
		    T3d = VADD(T2B, T2C);
	       }
	  }
	  /* Odd inputs 7, 23, 15, 31, 27, 11, 3, 19. */
	  {
	       V T1g, T1i, T1o, T1m, T1a, T1c, T1d, T15, T17, T18;
	       {
		    V T1f, T1h, T1n, T1l;
		    T1f = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)]));
		    T1g = BYTW(&(W[TWVL * 12]), T1f);
		    T1h = LD(&(x[WS(ios, 23)]), dist, &(x[WS(ios, 1)]));
		    T1i = BYTW(&(W[TWVL * 44]), T1h);
		    T1n = LD(&(x[WS(ios, 15)]), dist, &(x[WS(ios, 1)]));
		    T1o = BYTW(&(W[TWVL * 28]), T1n);
		    T1l = LD(&(x[WS(ios, 31)]), dist, &(x[WS(ios, 1)]));
		    T1m = BYTW(&(W[TWVL * 60]), T1l);
		    {
			 V T19, T1b, T14, T16;
			 T19 = LD(&(x[WS(ios, 27)]), dist, &(x[WS(ios, 1)]));
			 T1a = BYTW(&(W[TWVL * 52]), T19);
			 T1b = LD(&(x[WS(ios, 11)]), dist, &(x[WS(ios, 1)]));
			 T1c = BYTW(&(W[TWVL * 20]), T1b);
			 T1d = VSUB(T1a, T1c);
			 T14 = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)]));
			 T15 = BYTW(&(W[TWVL * 4]), T14);
			 T16 = LD(&(x[WS(ios, 19)]), dist, &(x[WS(ios, 1)]));
			 T17 = BYTW(&(W[TWVL * 36]), T16);
			 T18 = VSUB(T15, T17);
		    }
	       }
	       {
		    V T1e, T1j, T2w, T2x;
		    T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d));
		    T1j = VSUB(T1g, T1i);
		    T1k = VSUB(T1e, T1j);
		    T20 = VADD(T1j, T1e);
		    T2w = VADD(T15, T17);
		    T2x = VADD(T1a, T1c);
		    T2y = VSUB(T2w, T2x);
		    T3a = VADD(T2w, T2x);
	       }
	       {
		    V T1p, T1q, T2t, T2u;
		    T1p = VSUB(T1m, T1o);
		    T1q = VMUL(LDK(KP707106781), VADD(T18, T1d));
		    T1r = VSUB(T1p, T1q);
		    T21 = VADD(T1p, T1q);
		    T2t = VADD(T1m, T1o);
		    T2u = VADD(T1g, T1i);
		    T2v = VSUB(T2t, T2u);
		    T39 = VADD(T2t, T2u);
	       }
	  }
	  /* Odd inputs 9, 25, 17, 1, 29, 13, 5, 21. */
	  {
	       V TR, TT, TZ, TX, TL, TN, TO, TG, TI, TJ;
	       {
		    V TQ, TS, TY, TW;
		    TQ = LD(&(x[WS(ios, 9)]), dist, &(x[WS(ios, 1)]));
		    TR = BYTW(&(W[TWVL * 16]), TQ);
		    TS = LD(&(x[WS(ios, 25)]), dist, &(x[WS(ios, 1)]));
		    TT = BYTW(&(W[TWVL * 48]), TS);
		    TY = LD(&(x[WS(ios, 17)]), dist, &(x[WS(ios, 1)]));
		    TZ = BYTW(&(W[TWVL * 32]), TY);
		    TW = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
		    TX = BYTW(&(W[0]), TW);
		    {
			 V TK, TM, TF, TH;
			 TK = LD(&(x[WS(ios, 29)]), dist, &(x[WS(ios, 1)]));
			 TL = BYTW(&(W[TWVL * 56]), TK);
			 TM = LD(&(x[WS(ios, 13)]), dist, &(x[WS(ios, 1)]));
			 TN = BYTW(&(W[TWVL * 24]), TM);
			 TO = VSUB(TL, TN);
			 TF = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)]));
			 TG = BYTW(&(W[TWVL * 8]), TF);
			 TH = LD(&(x[WS(ios, 21)]), dist, &(x[WS(ios, 1)]));
			 TI = BYTW(&(W[TWVL * 40]), TH);
			 TJ = VSUB(TG, TI);
		    }
	       }
	       {
		    V TP, TU, T2p, T2q;
		    TP = VMUL(LDK(KP707106781), VSUB(TJ, TO));
		    TU = VSUB(TR, TT);
		    TV = VSUB(TP, TU);
		    T1X = VADD(TU, TP);
		    T2p = VADD(TG, TI);
		    T2q = VADD(TL, TN);
		    T2r = VSUB(T2p, T2q);
		    T37 = VADD(T2p, T2q);
	       }
	       {
		    V T10, T11, T2m, T2n;
		    T10 = VSUB(TX, TZ);
		    T11 = VMUL(LDK(KP707106781), VADD(TJ, TO));
		    T12 = VSUB(T10, T11);
		    T1Y = VADD(T10, T11);
		    T2m = VADD(TX, TZ);
		    T2n = VADD(TR, TT);
		    T2o = VSUB(T2m, T2n);
		    T36 = VADD(T2m, T2n);
	       }
	  }
	  /* All loads are done; from here on results overwrite the inputs.
	     Outputs 24, 0, 8, 16. */
	  {
	       V T3q, T3u, T3t, T3v;
	       {
		    V T3o, T3p, T3r, T3s;
		    T3o = VADD(T3h, T3i);
		    T3p = VADD(T3d, T3e);
		    T3q = VSUB(T3o, T3p);
		    T3u = VADD(T3o, T3p);
		    T3r = VADD(T36, T37);
		    T3s = VADD(T39, T3a);
		    T3t = VBYI(VSUB(T3r, T3s));
		    T3v = VADD(T3r, T3s);
	       }
	       ST(&(x[WS(ios, 24)]), VSUB(T3q, T3t), dist, &(x[0]));
	       ST(&(x[0]), VADD(T3u, T3v), dist, &(x[0]));
	       ST(&(x[WS(ios, 8)]), VADD(T3q, T3t), dist, &(x[0]));
	       ST(&(x[WS(ios, 16)]), VSUB(T3u, T3v), dist, &(x[0]));
	  }
	  /* Outputs 12, 20, 4, 28. */
	  {
	       V T3f, T3j, T3c, T3k, T38, T3b;
	       T3f = VSUB(T3d, T3e);
	       T3j = VSUB(T3h, T3i);
	       T38 = VSUB(T36, T37);
	       T3b = VSUB(T39, T3a);
	       T3c = VMUL(LDK(KP707106781), VSUB(T38, T3b));
	       T3k = VMUL(LDK(KP707106781), VADD(T38, T3b));
	       {
		    V T3g, T3l, T3m, T3n;
		    T3g = VBYI(VSUB(T3c, T3f));
		    T3l = VSUB(T3j, T3k);
		    ST(&(x[WS(ios, 12)]), VADD(T3g, T3l), dist, &(x[0]));
		    ST(&(x[WS(ios, 20)]), VSUB(T3l, T3g), dist, &(x[0]));
		    T3m = VBYI(VADD(T3f, T3c));
		    T3n = VADD(T3j, T3k);
		    ST(&(x[WS(ios, 4)]), VADD(T3m, T3n), dist, &(x[0]));
		    ST(&(x[WS(ios, 28)]), VSUB(T3n, T3m), dist, &(x[0]));
	       }
	  }
	  /* Outputs 10, 22, 18, 14, 6, 26, 30, 2. */
	  {
	       V T2L, T31, T2R, T2Y, T2A, T2Z, T2U, T32, T2H, T2Q;
	       T2H = VMUL(LDK(KP707106781), VSUB(T2D, T2G));
	       T2L = VSUB(T2H, T2K);
	       T31 = VADD(T2K, T2H);
	       T2Q = VMUL(LDK(KP707106781), VADD(T2D, T2G));
	       T2R = VSUB(T2P, T2Q);
	       T2Y = VADD(T2P, T2Q);
	       {
		    V T2s, T2z, T2S, T2T;
		    T2s = VFNMS(LDK(KP382683432), T2r, VMUL(LDK(KP923879532), T2o));
		    T2z = VFMA(LDK(KP923879532), T2v, VMUL(LDK(KP382683432), T2y));
		    T2A = VSUB(T2s, T2z);
		    T2Z = VADD(T2s, T2z);
		    T2S = VFMA(LDK(KP382683432), T2o, VMUL(LDK(KP923879532), T2r));
		    T2T = VFNMS(LDK(KP382683432), T2v, VMUL(LDK(KP923879532), T2y));
		    T2U = VSUB(T2S, T2T);
		    T32 = VADD(T2S, T2T);
	       }
	       {
		    V T2M, T2V, T34, T35;
		    T2M = VBYI(VSUB(T2A, T2L));
		    T2V = VSUB(T2R, T2U);
		    ST(&(x[WS(ios, 10)]), VADD(T2M, T2V), dist, &(x[0]));
		    ST(&(x[WS(ios, 22)]), VSUB(T2V, T2M), dist, &(x[0]));
		    T34 = VSUB(T2Y, T2Z);
		    T35 = VBYI(VSUB(T32, T31));
		    ST(&(x[WS(ios, 18)]), VSUB(T34, T35), dist, &(x[0]));
		    ST(&(x[WS(ios, 14)]), VADD(T34, T35), dist, &(x[0]));
	       }
	       {
		    V T2W, T2X, T30, T33;
		    T2W = VBYI(VADD(T2L, T2A));
		    T2X = VADD(T2R, T2U);
		    ST(&(x[WS(ios, 6)]), VADD(T2W, T2X), dist, &(x[0]));
		    ST(&(x[WS(ios, 26)]), VSUB(T2X, T2W), dist, &(x[0]));
		    T30 = VADD(T2Y, T2Z);
		    T33 = VBYI(VADD(T31, T32));
		    ST(&(x[WS(ios, 30)]), VSUB(T30, T33), dist, &(x[0]));
		    ST(&(x[WS(ios, 2)]), VADD(T30, T33), dist, &(x[0]));
	       }
	  }
	  /* Outputs 27, 5, 3, 29, 21, 11, 13, 19. */
	  {
	       V TE, T1P, T1I, T1Q, T1t, T1M, T1F, T1N;
	       {
		    V Tg, TD, T1G, T1H;
		    Tg = VSUB(T4, Tf);
		    TD = VSUB(Tr, TC);
		    TE = VSUB(Tg, TD);
		    T1P = VADD(Tg, TD);
		    T1G = VFNMS(LDK(KP555570233), TV, VMUL(LDK(KP831469612), T12));
		    T1H = VFMA(LDK(KP555570233), T1k, VMUL(LDK(KP831469612), T1r));
		    T1I = VSUB(T1G, T1H);
		    T1Q = VADD(T1G, T1H);
	       }
	       {
		    V T13, T1s, T1x, T1E;
		    T13 = VFMA(LDK(KP831469612), TV, VMUL(LDK(KP555570233), T12));
		    T1s = VFNMS(LDK(KP555570233), T1r, VMUL(LDK(KP831469612), T1k));
		    T1t = VSUB(T13, T1s);
		    T1M = VADD(T13, T1s);
		    T1x = VSUB(T1v, T1w);
		    T1E = VSUB(T1y, T1D);
		    T1F = VSUB(T1x, T1E);
		    T1N = VADD(T1E, T1x);
	       }
	       {
		    V T1u, T1J, T1S, T1T;
		    T1u = VADD(TE, T1t);
		    T1J = VBYI(VADD(T1F, T1I));
		    ST(&(x[WS(ios, 27)]), VSUB(T1u, T1J), dist, &(x[WS(ios, 1)]));
		    ST(&(x[WS(ios, 5)]), VADD(T1u, T1J), dist, &(x[WS(ios, 1)]));
		    T1S = VBYI(VADD(T1N, T1M));
		    T1T = VADD(T1P, T1Q);
		    ST(&(x[WS(ios, 3)]), VADD(T1S, T1T), dist, &(x[WS(ios, 1)]));
		    ST(&(x[WS(ios, 29)]), VSUB(T1T, T1S), dist, &(x[WS(ios, 1)]));
	       }
	       {
		    V T1K, T1L, T1O, T1R;
		    T1K = VSUB(TE, T1t);
		    T1L = VBYI(VSUB(T1I, T1F));
		    ST(&(x[WS(ios, 21)]), VSUB(T1K, T1L), dist, &(x[WS(ios, 1)]));
		    ST(&(x[WS(ios, 11)]), VADD(T1K, T1L), dist, &(x[WS(ios, 1)]));
		    T1O = VBYI(VSUB(T1M, T1N));
		    T1R = VSUB(T1P, T1Q);
		    ST(&(x[WS(ios, 13)]), VADD(T1O, T1R), dist, &(x[WS(ios, 1)]));
		    ST(&(x[WS(ios, 19)]), VSUB(T1R, T1O), dist, &(x[WS(ios, 1)]));
	       }
	  }
	  /* Outputs 25, 7, 1, 31, 23, 9, 15, 17. */
	  {
	       V T1W, T2h, T2a, T2i, T23, T2e, T27, T2f;
	       {
		    V T1U, T1V, T28, T29;
		    T1U = VADD(T4, Tf);
		    T1V = VADD(T1v, T1w);
		    T1W = VSUB(T1U, T1V);
		    T2h = VADD(T1U, T1V);
		    T28 = VFNMS(LDK(KP195090322), T1X, VMUL(LDK(KP980785280), T1Y));
		    T29 = VFMA(LDK(KP195090322), T20, VMUL(LDK(KP980785280), T21));
		    T2a = VSUB(T28, T29);
		    T2i = VADD(T28, T29);
	       }
	       {
		    V T1Z, T22, T25, T26;
		    T1Z = VFMA(LDK(KP980785280), T1X, VMUL(LDK(KP195090322), T1Y));
		    T22 = VFNMS(LDK(KP195090322), T21, VMUL(LDK(KP980785280), T20));
		    T23 = VSUB(T1Z, T22);
		    T2e = VADD(T1Z, T22);
		    T25 = VADD(Tr, TC);
		    T26 = VADD(T1D, T1y);
		    T27 = VSUB(T25, T26);
		    T2f = VADD(T26, T25);
	       }
	       {
		    V T24, T2b, T2k, T2l;
		    T24 = VADD(T1W, T23);
		    T2b = VBYI(VADD(T27, T2a));
		    ST(&(x[WS(ios, 25)]), VSUB(T24, T2b), dist, &(x[WS(ios, 1)]));
		    ST(&(x[WS(ios, 7)]), VADD(T24, T2b), dist, &(x[WS(ios, 1)]));
		    T2k = VBYI(VADD(T2f, T2e));
		    T2l = VADD(T2h, T2i);
		    ST(&(x[WS(ios, 1)]), VADD(T2k, T2l), dist, &(x[WS(ios, 1)]));
		    ST(&(x[WS(ios, 31)]), VSUB(T2l, T2k), dist, &(x[WS(ios, 1)]));
	       }
	       {
		    V T2c, T2d, T2g, T2j;
		    T2c = VSUB(T1W, T23);
		    T2d = VBYI(VSUB(T2a, T27));
		    ST(&(x[WS(ios, 23)]), VSUB(T2c, T2d), dist, &(x[WS(ios, 1)]));
		    ST(&(x[WS(ios, 9)]), VADD(T2c, T2d), dist, &(x[WS(ios, 1)]));
		    T2g = VBYI(VSUB(T2e, T2f));
		    T2j = VSUB(T2h, T2i);
		    ST(&(x[WS(ios, 15)]), VADD(T2g, T2j), dist, &(x[WS(ios, 1)]));
		    ST(&(x[WS(ios, 17)]), VSUB(T2j, T2g), dist, &(x[WS(ios, 1)]));
	       }
	  }
     }
     END_SIMD();
     return W;
}
示例#21
0
/*
 * n1fv_16: 16-input / 16-output butterfly kernel, out of place.
 *
 * Each loop pass loads 16 values from ri at offsets WS(is, 0..15) and stores
 * 16 results to ro at offsets WS(os, 0..15).  The loop starts at i = v and
 * runs while i > 0, decreasing i by VL per pass and advancing the input
 * pointer by VL * ivs and the output pointer by VL * ovs.  ii and io are not
 * referenced in this body.
 *
 * Constants: KP707106781 = sqrt(2)/2; KP923879532 = cos(pi/8);
 * KP382683432 = sin(pi/8).
 *
 * NOTE(review): the name and the ri/ro usage suggest this is the forward
 * size-16 DFT codelet of a generated SIMD family -- confirm.  V, R, stride,
 * DVK, LDK, LD, ST, WS, VL, VBYI, VFMA, VFNMS and BEGIN_SIMD/END_SIMD are
 * defined elsewhere; the last argument of LD/ST looks like an alignment
 * hint -- confirm.
 */
static void n1fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP923879532, +0.923879532511286756128183189396788286822416626);
     DVK(KP382683432, +0.382683432365089771728459984030398866761344562);
     DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
     int i;
     const R *xi;
     R *xo;
     xi = ri;
     xo = ro;
     BEGIN_SIMD();
     for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) {
	  V Tp, T13, Tu, TN, Tm, T14, Tv, TY, T7, T17, Ty, TT, Te, T16, Tx;
	  V TQ;
	  /* Inputs 4, 12, 0, 8. */
	  {
	       V Tn, To, TM, Ts, Tt, TL;
	       Tn = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
	       To = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
	       TM = VADD(Tn, To);
	       Ts = LD(&(xi[0]), ivs, &(xi[0]));
	       Tt = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
	       TL = VADD(Ts, Tt);
	       Tp = VSUB(Tn, To);
	       T13 = VADD(TL, TM);
	       Tu = VSUB(Ts, Tt);
	       TN = VSUB(TL, TM);
	  }
	  /* Inputs 14, 6, 2, 10. */
	  {
	       V Ti, TW, Tl, TX;
	       {
		    V Tg, Th, Tj, Tk;
		    Tg = LD(&(xi[WS(is, 14)]), ivs, &(xi[0]));
		    Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
		    Ti = VSUB(Tg, Th);
		    TW = VADD(Tg, Th);
		    Tj = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
		    Tk = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
		    Tl = VSUB(Tj, Tk);
		    TX = VADD(Tj, Tk);
	       }
	       Tm = VMUL(LDK(KP707106781), VSUB(Ti, Tl));
	       T14 = VADD(TX, TW);
	       Tv = VMUL(LDK(KP707106781), VADD(Tl, Ti));
	       TY = VSUB(TW, TX);
	  }
	  /* Inputs 15, 7, 3, 11. */
	  {
	       V T3, TR, T6, TS;
	       {
		    V T1, T2, T4, T5;
		    T1 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)]));
		    T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
		    T3 = VSUB(T1, T2);
		    TR = VADD(T1, T2);
		    T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
		    T5 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
		    T6 = VSUB(T4, T5);
		    TS = VADD(T4, T5);
	       }
	       T7 = VFNMS(LDK(KP923879532), T6, VMUL(LDK(KP382683432), T3));
	       T17 = VADD(TR, TS);
	       Ty = VFMA(LDK(KP923879532), T3, VMUL(LDK(KP382683432), T6));
	       TT = VSUB(TR, TS);
	  }
	  /* Inputs 1, 9, 5, 13. */
	  {
	       V Ta, TO, Td, TP;
	       {
		    V T8, T9, Tb, Tc;
		    T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
		    T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
		    Ta = VSUB(T8, T9);
		    TO = VADD(T8, T9);
		    Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
		    Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
		    Td = VSUB(Tb, Tc);
		    TP = VADD(Tb, Tc);
	       }
	       Te = VFMA(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), Td));
	       T16 = VADD(TO, TP);
	       Tx = VFNMS(LDK(KP382683432), Td, VMUL(LDK(KP923879532), Ta));
	       TQ = VSUB(TO, TP);
	  }
	  /* Outputs 8, 0, 12, 4. */
	  {
	       V T15, T18, T19, T1a;
	       T15 = VADD(T13, T14);
	       T18 = VADD(T16, T17);
	       ST(&(xo[WS(os, 8)]), VSUB(T15, T18), ovs, &(xo[0]));
	       ST(&(xo[0]), VADD(T15, T18), ovs, &(xo[0]));
	       T19 = VSUB(T13, T14);
	       T1a = VBYI(VSUB(T17, T16));
	       ST(&(xo[WS(os, 12)]), VSUB(T19, T1a), ovs, &(xo[0]));
	       ST(&(xo[WS(os, 4)]), VADD(T19, T1a), ovs, &(xo[0]));
	  }
	  /* Outputs 14, 6, 2, 10. */
	  {
	       V TV, T11, T10, T12, TU, TZ;
	       TU = VMUL(LDK(KP707106781), VADD(TQ, TT));
	       TV = VADD(TN, TU);
	       T11 = VSUB(TN, TU);
	       TZ = VMUL(LDK(KP707106781), VSUB(TT, TQ));
	       T10 = VBYI(VADD(TY, TZ));
	       T12 = VBYI(VSUB(TZ, TY));
	       ST(&(xo[WS(os, 14)]), VSUB(TV, T10), ovs, &(xo[0]));
	       ST(&(xo[WS(os, 6)]), VADD(T11, T12), ovs, &(xo[0]));
	       ST(&(xo[WS(os, 2)]), VADD(TV, T10), ovs, &(xo[0]));
	       ST(&(xo[WS(os, 10)]), VSUB(T11, T12), ovs, &(xo[0]));
	  }
	  /* Outputs 7, 15, 9, 1. */
	  {
	       V Tr, TB, TA, TC;
	       {
		    V Tf, Tq, Tw, Tz;
		    Tf = VSUB(T7, Te);
		    Tq = VSUB(Tm, Tp);
		    Tr = VBYI(VSUB(Tf, Tq));
		    TB = VBYI(VADD(Tq, Tf));
		    Tw = VADD(Tu, Tv);
		    Tz = VADD(Tx, Ty);
		    TA = VSUB(Tw, Tz);
		    TC = VADD(Tw, Tz);
	       }
	       ST(&(xo[WS(os, 7)]), VADD(Tr, TA), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 15)]), VSUB(TC, TB), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 9)]), VSUB(TA, Tr), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 1)]), VADD(TB, TC), ovs, &(xo[WS(os, 1)]));
	  }
	  /* Outputs 13, 5, 3, 11. */
	  {
	       V TF, TJ, TI, TK;
	       {
		    V TD, TE, TG, TH;
		    TD = VSUB(Tu, Tv);
		    TE = VADD(Te, T7);
		    TF = VADD(TD, TE);
		    TJ = VSUB(TD, TE);
		    TG = VADD(Tp, Tm);
		    TH = VSUB(Ty, Tx);
		    TI = VBYI(VADD(TG, TH));
		    TK = VBYI(VSUB(TH, TG));
	       }
	       ST(&(xo[WS(os, 13)]), VSUB(TF, TI), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 5)]), VADD(TJ, TK), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 3)]), VADD(TF, TI), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 11)]), VSUB(TJ, TK), ovs, &(xo[WS(os, 1)]));
	  }
     }
     END_SIMD();
}
示例#22
0
/*
 * n1bv_12: 12-input / 12-output butterfly kernel, out of place.
 *
 * Each loop pass loads 12 values from ii at offsets WS(is, 0..11) and stores
 * 12 results to io at offsets WS(os, 0..11).  The loop starts at i = v and
 * runs while i > 0, decreasing i by VL per pass and advancing the input
 * pointer by VL * ivs and the output pointer by VL * ovs.  ri and ro are not
 * referenced in this body.
 *
 * KP866025403 is sqrt(3)/2 and KP500000000 is 1/2.
 *
 * NOTE(review): the name and the ii/io usage suggest this is the backward
 * size-12 DFT codelet of a generated SIMD family -- confirm.  V, R, stride,
 * DVK, LDK, LD, ST, WS, VL, VBYI, VFNMS and BEGIN_SIMD/END_SIMD are defined
 * elsewhere; the last argument of LD/ST looks like an alignment hint --
 * confirm.
 */
static void n1bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     int i;
     const R *xi;
     R *xo;
     xi = ii;
     xo = io;
     BEGIN_SIMD();
     for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) {
	  V T5, Ta, TG, TF, Ty, Tm, Ti, Tp, TJ, TI, Tx, Ts;
	  /* Even-indexed inputs, grouped as (0; 4, 8) and (6; 10, 2). */
	  {
	       V T1, T6, T4, Tk, T9, Tl;
	       T1 = LD(&(xi[0]), ivs, &(xi[0]));
	       T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
	       {
		    V T2, T3, T7, T8;
		    T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
		    T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
		    T4 = VADD(T2, T3);
		    Tk = VSUB(T2, T3);
		    T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
		    T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
		    T9 = VADD(T7, T8);
		    Tl = VSUB(T7, T8);
	       }
	       T5 = VFNMS(LDK(KP500000000), T4, T1);
	       Ta = VFNMS(LDK(KP500000000), T9, T6);
	       TG = VADD(T6, T9);
	       TF = VADD(T1, T4);
	       Ty = VADD(Tk, Tl);
	       Tm = VMUL(LDK(KP866025403), VSUB(Tk, Tl));
	  }
	  /* Odd-indexed inputs, grouped as (3; 7, 11) and (9; 1, 5). */
	  {
	       V Tn, Tq, Te, To, Th, Tr;
	       Tn = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
	       Tq = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
	       {
		    V Tc, Td, Tf, Tg;
		    Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
		    Td = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
		    Te = VSUB(Tc, Td);
		    To = VADD(Tc, Td);
		    Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
		    Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
		    Th = VSUB(Tf, Tg);
		    Tr = VADD(Tf, Tg);
	       }
	       Ti = VMUL(LDK(KP866025403), VSUB(Te, Th));
	       Tp = VFNMS(LDK(KP500000000), To, Tn);
	       TJ = VADD(Tq, Tr);
	       TI = VADD(Tn, To);
	       Tx = VADD(Te, Th);
	       Ts = VFNMS(LDK(KP500000000), Tr, Tq);
	  }
	  /* Outputs 3, 9, 6, 0. */
	  {
	       V TH, TK, TL, TM;
	       TH = VSUB(TF, TG);
	       TK = VBYI(VSUB(TI, TJ));
	       ST(&(xo[WS(os, 3)]), VSUB(TH, TK), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 9)]), VADD(TH, TK), ovs, &(xo[WS(os, 1)]));
	       TL = VADD(TF, TG);
	       TM = VADD(TI, TJ);
	       ST(&(xo[WS(os, 6)]), VSUB(TL, TM), ovs, &(xo[0]));
	       ST(&(xo[0]), VADD(TL, TM), ovs, &(xo[0]));
	  }
	  /* Outputs 11, 5, 1, 7. */
	  {
	       V Tj, Tv, Tu, Tw, Tb, Tt;
	       Tb = VSUB(T5, Ta);
	       Tj = VSUB(Tb, Ti);
	       Tv = VADD(Tb, Ti);
	       Tt = VSUB(Tp, Ts);
	       Tu = VBYI(VADD(Tm, Tt));
	       Tw = VBYI(VSUB(Tt, Tm));
	       ST(&(xo[WS(os, 11)]), VSUB(Tj, Tu), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 5)]), VADD(Tv, Tw), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 1)]), VADD(Tj, Tu), ovs, &(xo[WS(os, 1)]));
	       ST(&(xo[WS(os, 7)]), VSUB(Tv, Tw), ovs, &(xo[WS(os, 1)]));
	  }
	  /* Outputs 2, 8, 10, 4. */
	  {
	       V Tz, TD, TC, TE, TA, TB;
	       Tz = VBYI(VMUL(LDK(KP866025403), VSUB(Tx, Ty)));
	       TD = VBYI(VMUL(LDK(KP866025403), VADD(Ty, Tx)));
	       TA = VADD(T5, Ta);
	       TB = VADD(Tp, Ts);
	       TC = VSUB(TA, TB);
	       TE = VADD(TA, TB);
	       ST(&(xo[WS(os, 2)]), VADD(Tz, TC), ovs, &(xo[0]));
	       ST(&(xo[WS(os, 8)]), VSUB(TE, TD), ovs, &(xo[0]));
	       ST(&(xo[WS(os, 10)]), VSUB(TC, Tz), ovs, &(xo[0]));
	       ST(&(xo[WS(os, 4)]), VADD(TD, TE), ovs, &(xo[0]));
	  }
     }
     END_SIMD();
}
示例#23
0
/*
 * t1bv_12: in-place 12-point butterfly kernel with per-input twiddle
 * factors.
 *
 * Each loop pass loads 12 values from ii at offsets WS(ios, 0..11), passes
 * every value except index 0 through BYTW together with the table entry at
 * W[TWVL * 2 * (k - 1)] for input k, combines them, and stores the 12
 * results back to the same 12 locations.  The loop starts at i = m and runs
 * while i > 0, decreasing i by VL per pass, advancing the data pointer by
 * VL * dist and W by TWVL * 22.  The advanced W pointer is returned.  ri is
 * not referenced in this body.
 *
 * KP866025403 is sqrt(3)/2 and KP500000000 is 1/2.
 *
 * NOTE(review): the name and the use of ii suggest this is the backward
 * size-12 twiddle codelet of a generated SIMD family, BYTW presumably being
 * a complex multiply by a twiddle factor -- confirm.  V, R, stride, DVK,
 * LDK, LD, ST, WS, VL, TWVL, BYTW, VBYI, VFNMS and BEGIN_SIMD/END_SIMD are
 * defined elsewhere; the last argument of LD/ST looks like an alignment
 * hint -- confirm.
 */
static const R *t1bv_12(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     int i;
     R *x;
     x = ii;
     BEGIN_SIMD();
     for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 22)) {
	  V T1, Tt, T6, T7, TB, Tq, TC, TD, T9, Tu, Te, Tf, Tx, Tl, Ty;
	  V Tz;
	  /* Inputs 0, 8, 4. */
	  {
	       V T5, T3, T4, T2;
	       T1 = LD(&(x[0]), dist, &(x[0]));
	       T4 = LD(&(x[WS(ios, 8)]), dist, &(x[0]));
	       T5 = BYTW(&(W[TWVL * 14]), T4);
	       T2 = LD(&(x[WS(ios, 4)]), dist, &(x[0]));
	       T3 = BYTW(&(W[TWVL * 6]), T2);
	       Tt = VSUB(T3, T5);
	       T6 = VADD(T3, T5);
	       T7 = VFNMS(LDK(KP500000000), T6, T1);
	  }
	  /* Inputs 1, 9, 5. */
	  {
	       V Tn, Tp, Tm, TA, To;
	       Tm = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)]));
	       Tn = BYTW(&(W[0]), Tm);
	       TA = LD(&(x[WS(ios, 9)]), dist, &(x[WS(ios, 1)]));
	       TB = BYTW(&(W[TWVL * 16]), TA);
	       To = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)]));
	       Tp = BYTW(&(W[TWVL * 8]), To);
	       Tq = VSUB(Tn, Tp);
	       TC = VADD(Tn, Tp);
	       TD = VFNMS(LDK(KP500000000), TC, TB);
	  }
	  /* Inputs 6, 2, 10. */
	  {
	       V Td, Tb, T8, Tc, Ta;
	       T8 = LD(&(x[WS(ios, 6)]), dist, &(x[0]));
	       T9 = BYTW(&(W[TWVL * 10]), T8);
	       Tc = LD(&(x[WS(ios, 2)]), dist, &(x[0]));
	       Td = BYTW(&(W[TWVL * 2]), Tc);
	       Ta = LD(&(x[WS(ios, 10)]), dist, &(x[0]));
	       Tb = BYTW(&(W[TWVL * 18]), Ta);
	       Tu = VSUB(Tb, Td);
	       Te = VADD(Tb, Td);
	       Tf = VFNMS(LDK(KP500000000), Te, T9);
	  }
	  /* Inputs 7, 3, 11. */
	  {
	       V Ti, Tk, Th, Tw, Tj;
	       Th = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)]));
	       Ti = BYTW(&(W[TWVL * 12]), Th);
	       Tw = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)]));
	       Tx = BYTW(&(W[TWVL * 4]), Tw);
	       Tj = LD(&(x[WS(ios, 11)]), dist, &(x[WS(ios, 1)]));
	       Tk = BYTW(&(W[TWVL * 20]), Tj);
	       Tl = VSUB(Ti, Tk);
	       Ty = VADD(Ti, Tk);
	       Tz = VFNMS(LDK(KP500000000), Ty, Tx);
	  }
	  /* All loads are done; from here on results overwrite the inputs.
	     Outputs 11, 5, 1, 7. */
	  {
	       V Ts, TG, TF, TH;
	       {
		    V Tg, Tr, Tv, TE;
		    Tg = VSUB(T7, Tf);
		    Tr = VMUL(LDK(KP866025403), VSUB(Tl, Tq));
		    Ts = VSUB(Tg, Tr);
		    TG = VADD(Tg, Tr);
		    Tv = VMUL(LDK(KP866025403), VSUB(Tt, Tu));
		    TE = VSUB(Tz, TD);
		    TF = VBYI(VADD(Tv, TE));
		    TH = VBYI(VSUB(TE, Tv));
	       }
	       ST(&(x[WS(ios, 11)]), VSUB(Ts, TF), dist, &(x[WS(ios, 1)]));
	       ST(&(x[WS(ios, 5)]), VADD(TG, TH), dist, &(x[WS(ios, 1)]));
	       ST(&(x[WS(ios, 1)]), VADD(Ts, TF), dist, &(x[WS(ios, 1)]));
	       ST(&(x[WS(ios, 7)]), VSUB(TG, TH), dist, &(x[WS(ios, 1)]));
	  }
	  /* Outputs 3, 0, 9, 6. */
	  {
	       V TS, TW, TV, TX;
	       {
		    V TQ, TR, TT, TU;
		    TQ = VADD(T1, T6);
		    TR = VADD(T9, Te);
		    TS = VSUB(TQ, TR);
		    TW = VADD(TQ, TR);
		    TT = VADD(Tx, Ty);
		    TU = VADD(TB, TC);
		    TV = VBYI(VSUB(TT, TU));
		    TX = VADD(TT, TU);
	       }
	       ST(&(x[WS(ios, 3)]), VSUB(TS, TV), dist, &(x[WS(ios, 1)]));
	       ST(&(x[0]), VADD(TW, TX), dist, &(x[0]));
	       ST(&(x[WS(ios, 9)]), VADD(TS, TV), dist, &(x[WS(ios, 1)]));
	       ST(&(x[WS(ios, 6)]), VSUB(TW, TX), dist, &(x[0]));
	  }
	  /* Outputs 2, 8, 10, 4. */
	  {
	       V TK, TO, TN, TP;
	       {
		    V TI, TJ, TL, TM;
		    TI = VADD(Tl, Tq);
		    TJ = VADD(Tt, Tu);
		    TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ)));
		    TO = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI)));
		    TL = VADD(T7, Tf);
		    TM = VADD(Tz, TD);
		    TN = VSUB(TL, TM);
		    TP = VADD(TL, TM);
	       }
	       ST(&(x[WS(ios, 2)]), VADD(TK, TN), dist, &(x[0]));
	       ST(&(x[WS(ios, 8)]), VSUB(TP, TO), dist, &(x[0]));
	       ST(&(x[WS(ios, 10)]), VSUB(TN, TK), dist, &(x[0]));
	       ST(&(x[WS(ios, 4)]), VADD(TO, TP), dist, &(x[0]));
	  }
     }
     END_SIMD();
     return W;
}
示例#24
0
/*
 * q1fv_4: in-place pass over a 4x4 arrangement of SIMD vectors addressed as
 * x[WS(vs, row) + WS(is, col)], with x starting at ri.
 *
 * For every input row the routine forms the four size-4 butterfly outputs
 * from that row's four columns, then stores output number k of input row r
 * at x[WS(vs, k) + WS(is, r)] (row and column indices swapped).  Outputs
 * k = 1, 2, 3 are passed through BYTWJ with the twiddle entries at W[0],
 * W[TWVL * 2] and W[TWVL * 4] respectively; output k = 0 is stored as is.
 *
 * Parameters:
 *   ri    base pointer of the data that is read and overwritten
 *   ii    not referenced by this routine
 *   W     twiddle table; advanced by TWVL * 6 per loop iteration
 *   is    stride selecting the column (passed to WS)
 *   vs    stride selecting the row (passed to WS)
 *   m     loop bound; the loop counter advances by VL per iteration
 *   dist  distance between consecutive loop iterations (x advances VL * dist)
 *
 * Returns the twiddle pointer W as advanced by the loop.
 */
static const R *q1fv_4(R *ri, R *ii, const R *W, stride is, stride vs, int m, int dist)
{
     int i;
     R *x;

     x = ri;
     BEGIN_SIMD();
     for (i = 0; i < m; i = i + VL, x = x + (VL * dist), W = W + (TWVL * 6)) {
          /*
           * Per input row r (offset WS(vs, r)):
           *   dif02_r = col0 - col2          sum02_r = col0 + col2
           *   rot13_r = VBYI(col1 - col3)    sum13_r = col1 + col3
           */
          V dif02_r0, sum02_r0, rot13_r0, sum13_r0;
          V dif02_r1, sum02_r1, rot13_r1, sum13_r1;
          V dif02_r2, sum02_r2, rot13_r2, sum13_r2;
          V dif02_r3, sum02_r3, rot13_r3, sum13_r3;

          /*
           * Every load happens before the first store: the stores below
           * overwrite locations that are inputs of other rows (for example
           * x[WS(vs, 1)] is an input of row 1 and receives a row-0 result).
           */
          {
               /* input row 0 */
               V in0, in1, in2, in3;
               in0 = LD(&(x[0]), dist, &(x[0]));
               in2 = LD(&(x[WS(is, 2)]), dist, &(x[0]));
               dif02_r0 = VSUB(in0, in2);
               sum02_r0 = VADD(in0, in2);
               in1 = LD(&(x[WS(is, 1)]), dist, &(x[WS(is, 1)]));
               in3 = LD(&(x[WS(is, 3)]), dist, &(x[WS(is, 1)]));
               rot13_r0 = VBYI(VSUB(in1, in3));
               sum13_r0 = VADD(in1, in3);
          }
          {
               /* input row 1 */
               V in0, in1, in2, in3;
               in0 = LD(&(x[WS(vs, 1)]), dist, &(x[WS(vs, 1)]));
               in2 = LD(&(x[WS(vs, 1) + WS(is, 2)]), dist, &(x[WS(vs, 1)]));
               dif02_r1 = VSUB(in0, in2);
               sum02_r1 = VADD(in0, in2);
               in1 = LD(&(x[WS(vs, 1) + WS(is, 1)]), dist, &(x[WS(vs, 1) + WS(is, 1)]));
               in3 = LD(&(x[WS(vs, 1) + WS(is, 3)]), dist, &(x[WS(vs, 1) + WS(is, 1)]));
               rot13_r1 = VBYI(VSUB(in1, in3));
               sum13_r1 = VADD(in1, in3);
          }
          {
               /* input row 2 */
               V in0, in1, in2, in3;
               in0 = LD(&(x[WS(vs, 2)]), dist, &(x[WS(vs, 2)]));
               in2 = LD(&(x[WS(vs, 2) + WS(is, 2)]), dist, &(x[WS(vs, 2)]));
               dif02_r2 = VSUB(in0, in2);
               sum02_r2 = VADD(in0, in2);
               in1 = LD(&(x[WS(vs, 2) + WS(is, 1)]), dist, &(x[WS(vs, 2) + WS(is, 1)]));
               in3 = LD(&(x[WS(vs, 2) + WS(is, 3)]), dist, &(x[WS(vs, 2) + WS(is, 1)]));
               rot13_r2 = VBYI(VSUB(in1, in3));
               sum13_r2 = VADD(in1, in3);
          }
          {
               /* input row 3 */
               V in0, in1, in2, in3;
               in0 = LD(&(x[WS(vs, 3)]), dist, &(x[WS(vs, 3)]));
               in2 = LD(&(x[WS(vs, 3) + WS(is, 2)]), dist, &(x[WS(vs, 3)]));
               dif02_r3 = VSUB(in0, in2);
               sum02_r3 = VADD(in0, in2);
               in1 = LD(&(x[WS(vs, 3) + WS(is, 1)]), dist, &(x[WS(vs, 3) + WS(is, 1)]));
               in3 = LD(&(x[WS(vs, 3) + WS(is, 3)]), dist, &(x[WS(vs, 3) + WS(is, 1)]));
               rot13_r3 = VBYI(VSUB(in1, in3));
               sum13_r3 = VADD(in1, in3);
          }

          /* Output 0 of each row: plain sum, no twiddle; lands in output row 0. */
          ST(&(x[0]), VADD(sum02_r0, sum13_r0), dist, &(x[0]));
          ST(&(x[WS(is, 1)]), VADD(sum02_r1, sum13_r1), dist, &(x[WS(is, 1)]));
          ST(&(x[WS(is, 2)]), VADD(sum02_r2, sum13_r2), dist, &(x[0]));
          ST(&(x[WS(is, 3)]), VADD(sum02_r3, sum13_r3), dist, &(x[WS(is, 1)]));

          {
               /* Output 1 of each row: (dif02 - rot13) through BYTWJ with W[0]. */
               V out_r0, out_r1, out_r2, out_r3;
               out_r0 = BYTWJ(&(W[0]), VSUB(dif02_r0, rot13_r0));
               ST(&(x[WS(vs, 1)]), out_r0, dist, &(x[WS(vs, 1)]));
               out_r1 = BYTWJ(&(W[0]), VSUB(dif02_r1, rot13_r1));
               ST(&(x[WS(vs, 1) + WS(is, 1)]), out_r1, dist, &(x[WS(vs, 1) + WS(is, 1)]));
               out_r2 = BYTWJ(&(W[0]), VSUB(dif02_r2, rot13_r2));
               ST(&(x[WS(vs, 1) + WS(is, 2)]), out_r2, dist, &(x[WS(vs, 1)]));
               out_r3 = BYTWJ(&(W[0]), VSUB(dif02_r3, rot13_r3));
               ST(&(x[WS(vs, 1) + WS(is, 3)]), out_r3, dist, &(x[WS(vs, 1) + WS(is, 1)]));
          }
          {
               /* Output 3 of each row: (dif02 + rot13) through BYTWJ with W[TWVL * 4]. */
               V out_r0, out_r1, out_r2, out_r3;
               out_r0 = BYTWJ(&(W[TWVL * 4]), VADD(dif02_r0, rot13_r0));
               ST(&(x[WS(vs, 3)]), out_r0, dist, &(x[WS(vs, 3)]));
               out_r1 = BYTWJ(&(W[TWVL * 4]), VADD(dif02_r1, rot13_r1));
               ST(&(x[WS(vs, 3) + WS(is, 1)]), out_r1, dist, &(x[WS(vs, 3) + WS(is, 1)]));
               out_r2 = BYTWJ(&(W[TWVL * 4]), VADD(dif02_r2, rot13_r2));
               ST(&(x[WS(vs, 3) + WS(is, 2)]), out_r2, dist, &(x[WS(vs, 3)]));
               out_r3 = BYTWJ(&(W[TWVL * 4]), VADD(dif02_r3, rot13_r3));
               ST(&(x[WS(vs, 3) + WS(is, 3)]), out_r3, dist, &(x[WS(vs, 3) + WS(is, 1)]));
          }
          {
               /* Output 2 of each row: (sum02 - sum13) through BYTWJ with W[TWVL * 2]. */
               V out_r0, out_r1, out_r2, out_r3;
               out_r0 = BYTWJ(&(W[TWVL * 2]), VSUB(sum02_r0, sum13_r0));
               ST(&(x[WS(vs, 2)]), out_r0, dist, &(x[WS(vs, 2)]));
               out_r1 = BYTWJ(&(W[TWVL * 2]), VSUB(sum02_r1, sum13_r1));
               ST(&(x[WS(vs, 2) + WS(is, 1)]), out_r1, dist, &(x[WS(vs, 2) + WS(is, 1)]));
               out_r2 = BYTWJ(&(W[TWVL * 2]), VSUB(sum02_r2, sum13_r2));
               ST(&(x[WS(vs, 2) + WS(is, 2)]), out_r2, dist, &(x[WS(vs, 2)]));
               out_r3 = BYTWJ(&(W[TWVL * 2]), VSUB(sum02_r3, sum13_r3));
               ST(&(x[WS(vs, 2) + WS(is, 3)]), out_r3, dist, &(x[WS(vs, 2) + WS(is, 1)]));
          }
     }
     END_SIMD();
     return W;
}
示例#25
0
/*
 * n2bv_14: size-14 no-twiddle SIMD kernel.
 *
 * Reads the 14 inputs xi[WS(is, 0)] .. xi[WS(is, 13)] (xi starts at ii) and
 * writes 14 results to the fixed even offsets xo[0], xo[2], ..., xo[26]
 * (xo starts at io).
 *
 * Parameters:
 *   ri, ro  not referenced by this routine
 *   ii      input base pointer
 *   io      output base pointer
 *   is      input stride (passed to WS)
 *   os      not referenced; output offsets are the literal constants above
 *   v       loop count; the counter is decremented by VL per iteration
 *   ivs     input distance between iterations (xi advances VL * ivs)
 *   ovs     output distance between iterations (xo advances VL * ovs)
 *
 * The KP constants are, numerically, cos(k*pi/7) and sin(k*pi/7) for
 * k = 1, 2, 3.
 */
static void n2bv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
     DVK(KP900968867, +0.900968867902419126236102319507445051165919162);
     DVK(KP222520933, +0.222520933956314404288902564496794759466355569);
     DVK(KP623489801, +0.623489801858733530525004884004239810632274731);
     DVK(KP781831482, +0.781831482468029808708444526674057750232334519);
     DVK(KP974927912, +0.974927912181823607018131682993931217232785801);
     DVK(KP433883739, +0.433883739117558120475768332848358754609990728);
     int i;
     const R *xi;
     R *xo;

     xi = ii;
     xo = io;
     BEGIN_SIMD();
     for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) {
          /*
           * Inputs are paired as (n, n+7 mod 14).  "hd" values are built from
           * the pair differences and feed the outputs stored relative to
           * &(xo[2]); "hs" values are built from the pair sums and feed the
           * outputs stored relative to &(xo[0]).
           */
          V hd0, hs0;                       /* pair (0, 7)                     */
          V hd_sub_a, hd_add_a, hs_sub_a, hs_add_a;   /* pairs (2, 9), (12, 5)  */
          V hd_sub_b, hd_add_b, hs_sub_b, hs_add_b;   /* pairs (4, 11), (10, 3) */
          V hd_sub_c, hd_add_c, hs_sub_c, hs_add_c;   /* pairs (6, 13), (8, 1)  */

          {
               V in00, in07;
               in00 = LD(&(xi[0]), ivs, &(xi[0]));
               in07 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)]));
               hd0 = VSUB(in00, in07);
               hs0 = VADD(in00, in07);
          }
          {
               V dif_lo, sum_lo, dif_hi, sum_hi;
               {
                    V in02, in09, in12, in05;
                    in02 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0]));
                    in09 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)]));
                    dif_lo = VSUB(in02, in09);
                    sum_lo = VADD(in02, in09);
                    in12 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0]));
                    in05 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)]));
                    dif_hi = VSUB(in12, in05);
                    sum_hi = VADD(in12, in05);
               }
               hd_sub_a = VSUB(dif_lo, dif_hi);
               hs_sub_a = VSUB(sum_lo, sum_hi);
               hd_add_a = VADD(dif_lo, dif_hi);
               hs_add_a = VADD(sum_lo, sum_hi);
          }
          {
               V dif_lo, sum_lo, dif_hi, sum_hi;
               {
                    V in04, in11, in10, in03;
                    in04 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0]));
                    in11 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)]));
                    dif_lo = VSUB(in04, in11);
                    sum_lo = VADD(in04, in11);
                    in10 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0]));
                    in03 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)]));
                    dif_hi = VSUB(in10, in03);
                    sum_hi = VADD(in10, in03);
               }
               hd_sub_b = VSUB(dif_lo, dif_hi);
               /* note the operand order: second pair's sum minus the first's */
               hs_sub_b = VSUB(sum_hi, sum_lo);
               hd_add_b = VADD(dif_lo, dif_hi);
               hs_add_b = VADD(sum_lo, sum_hi);
          }
          {
               V dif_lo, sum_lo, dif_hi, sum_hi;
               {
                    V in06, in13, in08, in01;
                    in06 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0]));
                    in13 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)]));
                    dif_lo = VSUB(in06, in13);
                    sum_lo = VADD(in06, in13);
                    in08 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0]));
                    in01 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)]));
                    dif_hi = VSUB(in08, in01);
                    sum_hi = VADD(in08, in01);
               }
               hd_sub_c = VSUB(dif_lo, dif_hi);
               /* note the operand order: second pair's sum minus the first's */
               hs_sub_c = VSUB(sum_hi, sum_lo);
               hd_add_c = VADD(dif_lo, dif_hi);
               hs_add_c = VADD(sum_lo, sum_hi);
          }

          /* The two outputs that need no constants. */
          ST(&(xo[14]), VADD(hd0, VADD(hd_add_a, VADD(hd_add_b, hd_add_c))), ovs, &(xo[2]));
          ST(&(xo[0]), VADD(hs0, VADD(hs_add_a, VADD(hs_add_b, hs_add_c))), ovs, &(xo[0]));

          /*
           * Remaining outputs come in pairs sharing one "plain" combination
           * and one VBYI-wrapped combination; the pair is plain +/- byi.
           */
          {
               V byi06, plain06, byi04, plain04;
               /* xo[6] / xo[22] */
               byi06 = VBYI(VFMA(LDK(KP433883739), hd_sub_a, VFNMS(LDK(KP781831482), hd_sub_b, VMUL(LDK(KP974927912), hd_sub_c))));
               plain06 = VFMA(LDK(KP623489801), hd_add_b, VFNMS(LDK(KP222520933), hd_add_c, VFNMS(LDK(KP900968867), hd_add_a, hd0)));
               ST(&(xo[6]), VADD(byi06, plain06), ovs, &(xo[2]));
               ST(&(xo[22]), VSUB(plain06, byi06), ovs, &(xo[2]));
               /* xo[24] / xo[4] */
               byi04 = VBYI(VFMA(LDK(KP974927912), hs_sub_a, VFMA(LDK(KP433883739), hs_sub_b, VMUL(LDK(KP781831482), hs_sub_c))));
               plain04 = VFMA(LDK(KP623489801), hs_add_c, VFNMS(LDK(KP900968867), hs_add_b, VFNMS(LDK(KP222520933), hs_add_a, hs0)));
               ST(&(xo[24]), VSUB(plain04, byi04), ovs, &(xo[0]));
               ST(&(xo[4]), VADD(plain04, byi04), ovs, &(xo[0]));
          }
          {
               V byi02, plain02, byi12, plain12;
               /* xo[2] / xo[26] */
               byi02 = VBYI(VFMA(LDK(KP781831482), hd_sub_a, VFMA(LDK(KP974927912), hd_sub_b, VMUL(LDK(KP433883739), hd_sub_c))));
               plain02 = VFMA(LDK(KP623489801), hd_add_a, VFNMS(LDK(KP900968867), hd_add_c, VFNMS(LDK(KP222520933), hd_add_b, hd0)));
               ST(&(xo[2]), VADD(byi02, plain02), ovs, &(xo[2]));
               ST(&(xo[26]), VSUB(plain02, byi02), ovs, &(xo[2]));
               /* xo[12] / xo[16] */
               byi12 = VBYI(VFNMS(LDK(KP433883739), hs_sub_c, VFNMS(LDK(KP974927912), hs_sub_b, VMUL(LDK(KP781831482), hs_sub_a))));
               plain12 = VFMA(LDK(KP623489801), hs_add_a, VFNMS(LDK(KP900968867), hs_add_c, VFNMS(LDK(KP222520933), hs_add_b, hs0)));
               ST(&(xo[12]), VSUB(plain12, byi12), ovs, &(xo[0]));
               ST(&(xo[16]), VADD(plain12, byi12), ovs, &(xo[0]));
          }
          {
               V byi08, plain08, byi10, plain10;
               /* xo[8] / xo[20] */
               byi08 = VBYI(VFMA(LDK(KP433883739), hs_sub_a, VFNMS(LDK(KP974927912), hs_sub_c, VMUL(LDK(KP781831482), hs_sub_b))));
               plain08 = VFMA(LDK(KP623489801), hs_add_b, VFNMS(LDK(KP222520933), hs_add_c, VFNMS(LDK(KP900968867), hs_add_a, hs0)));
               ST(&(xo[8]), VSUB(plain08, byi08), ovs, &(xo[0]));
               ST(&(xo[20]), VADD(plain08, byi08), ovs, &(xo[0]));
               /* xo[10] / xo[18] */
               byi10 = VBYI(VFNMS(LDK(KP781831482), hd_sub_c, VFNMS(LDK(KP433883739), hd_sub_b, VMUL(LDK(KP974927912), hd_sub_a))));
               plain10 = VFMA(LDK(KP623489801), hd_add_c, VFNMS(LDK(KP900968867), hd_add_b, VFNMS(LDK(KP222520933), hd_add_a, hd0)));
               ST(&(xo[10]), VSUB(plain10, byi10), ovs, &(xo[2]));
               ST(&(xo[18]), VADD(byi10, plain10), ovs, &(xo[2]));
          }
     }
     END_SIMD();
}