/*
 * t1fv_3: in-place three-input SIMD kernel with twiddle factors.
 *
 * Each pass loads three vectors from the ri buffer at offsets 0,
 * WS(ios, 1) and WS(ios, 2), passes the second and third through BYTWJ
 * with the twiddle entries at W[0] and W[TWVL * 2], combines them with
 * the 0.5 and 0.866... constants and stores three results back to the
 * same offsets.  The counter starts at m and drops by VL per pass; the
 * buffer advances by VL * dist and W by TWVL * 4.
 *
 * ii is not referenced.  Returns W as advanced by the loop.
 * NOTE(review): presumably a generated size-3 forward DFT codelet --
 * confirm against the generator and the SIMD macro definitions.
 */
static const R *t1fv_3(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
    DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
    DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
    int remaining;
    R *buf;

    buf = ri;
    BEGIN_SIMD();
    remaining = m;
    while (remaining > 0) {
        V in0, in1, in2, tw1, tw2, tw_sum, centre, rot;

        in0 = LD(&(buf[0]), dist, &(buf[0]));
        in1 = LD(&(buf[WS(ios, 1)]), dist, &(buf[WS(ios, 1)]));
        tw1 = BYTWJ(&(W[0]), in1);
        in2 = LD(&(buf[WS(ios, 2)]), dist, &(buf[0]));
        tw2 = BYTWJ(&(W[TWVL * 2]), in2);

        tw_sum = VADD(tw1, tw2);
        ST(&(buf[0]), VADD(in0, tw_sum), dist, &(buf[0]));

        /* centre = in0 - 0.5 * tw_sum; rot carries the 0.866 term. */
        centre = VFNMS(LDK(KP500000000), tw_sum, in0);
        rot = VBYI(VMUL(LDK(KP866025403), VSUB(tw2, tw1)));
        ST(&(buf[WS(ios, 2)]), VSUB(centre, rot), dist, &(buf[0]));
        ST(&(buf[WS(ios, 1)]), VADD(centre, rot), dist, &(buf[WS(ios, 1)]));

        remaining -= VL;
        buf += VL * dist;
        W += TWVL * 4;
    }
    END_SIMD();
    return W;
}
/*
 * n1bv_4: four-input SIMD kernel without twiddle factors.
 *
 * Each pass loads four vectors from the ii array at offsets 0 and
 * WS(is, 1..3), forms sums/differences of the (0,2) and (1,3) pairs
 * (the odd difference goes through VBYI) and stores four results to the
 * io array at offsets 0 and WS(os, 1..3).  The counter starts at v and
 * drops by VL; input/output advance by VL * ivs and VL * ovs.
 *
 * ri and ro are not referenced.
 * NOTE(review): presumably a generated size-4 backward DFT codelet --
 * confirm against the generator.
 */
static void n1bv_4(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
    int remaining;
    const R *src;
    R *dst;

    src = ii;
    dst = io;
    BEGIN_SIMD();
    remaining = v;
    while (remaining > 0) {
        V in0, in1, in2, in3;
        V diff02, sum02, rot13, sum13;

        in0 = LD(&(src[0]), ivs, &(src[0]));
        in2 = LD(&(src[WS(is, 2)]), ivs, &(src[0]));
        diff02 = VSUB(in0, in2);
        sum02 = VADD(in0, in2);

        in1 = LD(&(src[WS(is, 1)]), ivs, &(src[WS(is, 1)]));
        in3 = LD(&(src[WS(is, 3)]), ivs, &(src[WS(is, 1)]));
        rot13 = VBYI(VSUB(in1, in3));
        sum13 = VADD(in1, in3);

        ST(&(dst[WS(os, 3)]), VSUB(diff02, rot13), ovs, &(dst[WS(os, 1)]));
        ST(&(dst[0]), VADD(sum02, sum13), ovs, &(dst[0]));
        ST(&(dst[WS(os, 1)]), VADD(diff02, rot13), ovs, &(dst[WS(os, 1)]));
        ST(&(dst[WS(os, 2)]), VSUB(sum02, sum13), ovs, &(dst[0]));

        remaining -= VL;
        src += VL * ivs;
        dst += VL * ovs;
    }
    END_SIMD();
}
/*
 * n2bv_8: eight-input SIMD kernel without twiddle factors.
 *
 * Each pass loads eight vectors from the ii array at offsets 0 and
 * WS(is, 1..7), combines them using the 0.7071... constant and VBYI,
 * and stores eight results to the io array at the fixed offsets
 * xo[0], xo[2], ..., xo[14].  The counter starts at v and drops by VL;
 * input/output advance by VL * ivs and VL * ovs.
 *
 * ri, ro and os are not referenced.
 * NOTE(review): presumably a generated size-8 backward DFT codelet with
 * a fixed output layout -- confirm against the generator.
 */
static void n2bv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
    DVK(KP707106781, +0.707106781186547524400844362104849039284835938);
    int remaining;
    const R *src;
    R *dst;

    src = ii;
    dst = io;
    BEGIN_SIMD();
    remaining = v;
    while (remaining > 0) {
        V in0, in1, in2, in3, in4, in5, in6, in7;
        V diff26, sum26, diff04, sum04;
        V diff15, sum15, diff73, sum73;
        V diag_diff, diag_sum;
        V rot_a, base_a, even_sum, odd_sum;
        V rot_b, base_b, even_diff, odd_rot;

        /* Even-indexed inputs. */
        in2 = LD(&(src[WS(is, 2)]), ivs, &(src[0]));
        in6 = LD(&(src[WS(is, 6)]), ivs, &(src[0]));
        diff26 = VSUB(in2, in6);
        sum26 = VADD(in2, in6);
        in0 = LD(&(src[0]), ivs, &(src[0]));
        in4 = LD(&(src[WS(is, 4)]), ivs, &(src[0]));
        diff04 = VSUB(in0, in4);
        sum04 = VADD(in0, in4);

        /* Odd-indexed inputs. */
        in1 = LD(&(src[WS(is, 1)]), ivs, &(src[WS(is, 1)]));
        in5 = LD(&(src[WS(is, 5)]), ivs, &(src[WS(is, 1)]));
        diff15 = VSUB(in1, in5);
        in7 = LD(&(src[WS(is, 7)]), ivs, &(src[WS(is, 1)]));
        in3 = LD(&(src[WS(is, 3)]), ivs, &(src[WS(is, 1)]));
        diff73 = VSUB(in7, in3);
        diag_diff = VMUL(LDK(KP707106781), VSUB(diff15, diff73));
        sum73 = VADD(in7, in3);
        diag_sum = VMUL(LDK(KP707106781), VADD(diff15, diff73));
        sum15 = VADD(in1, in5);

        rot_a = VBYI(VSUB(diag_diff, diff26));
        base_a = VSUB(diff04, diag_sum);
        ST(&(dst[6]), VADD(rot_a, base_a), ovs, &(dst[2]));
        ST(&(dst[10]), VSUB(base_a, rot_a), ovs, &(dst[2]));
        even_sum = VADD(sum04, sum26);
        odd_sum = VADD(sum15, sum73);
        ST(&(dst[8]), VSUB(even_sum, odd_sum), ovs, &(dst[0]));
        ST(&(dst[0]), VADD(even_sum, odd_sum), ovs, &(dst[0]));

        rot_b = VBYI(VADD(diff26, diag_diff));
        base_b = VADD(diff04, diag_sum);
        ST(&(dst[2]), VADD(rot_b, base_b), ovs, &(dst[2]));
        ST(&(dst[14]), VSUB(base_b, rot_b), ovs, &(dst[2]));
        even_diff = VSUB(sum04, sum26);
        odd_rot = VBYI(VSUB(sum15, sum73));
        ST(&(dst[12]), VSUB(even_diff, odd_rot), ovs, &(dst[0]));
        ST(&(dst[4]), VADD(even_diff, odd_rot), ovs, &(dst[0]));

        remaining -= VL;
        src += VL * ivs;
        dst += VL * ovs;
    }
    END_SIMD();
}
/*
 * m2bv_64: vector-loop driver around m2bv_64_0.
 *
 * Calls m2bv_64_0(ii, io, is, ivs, ovs) once per step while a counter
 * runs from 0 up to (but not including) v in increments of VL; after
 * each call ii advances by VL * ivs and io by VL * ovs.
 *
 * ri, ro and os are not referenced.
 */
static void m2bv_64(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
    int done;

    BEGIN_SIMD();
    for (done = 0; done < v; done += VL, ii += VL * ivs, io += VL * ovs) {
        m2bv_64_0(ii, io, is, ivs, ovs);
    }
    END_SIMD();
}
/*
 * m1fv_32: vector-loop driver around m1fv_32_0.
 *
 * Calls m1fv_32_0(ri, ro, is, os, ivs, ovs) once per step while a
 * counter runs from 0 up to (but not including) v in increments of VL;
 * after each call ri advances by VL * ivs and ro by VL * ovs.
 *
 * ii and io are not referenced.
 */
static void m1fv_32(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
    int done;

    BEGIN_SIMD();
    for (done = 0; done < v; done += VL, ri += VL * ivs, ro += VL * ovs) {
        m1fv_32_0(ri, ro, is, os, ivs, ovs);
    }
    END_SIMD();
}
/*
 * t1bv_6: in-place six-input SIMD kernel with twiddle factors.
 *
 * Each pass loads six vectors from the ii buffer at offsets 0 and
 * WS(ios, 1..5); inputs 1..5 are passed through BYTW with the twiddle
 * entries at W[0], W[TWVL * 2], W[TWVL * 4], W[TWVL * 6], W[TWVL * 8].
 * The values are paired as (0,3), (4,1), (2,5), combined with the 0.5
 * and 0.866... constants and stored back to the same six offsets.
 * The counter starts at m and drops by VL per pass; the buffer advances
 * by VL * dist and W by TWVL * 10.
 *
 * ri is not referenced.  Returns W as advanced by the loop.
 * NOTE(review): presumably a generated size-6 backward DFT codelet --
 * confirm against the generator.
 */
static const R *t1bv_6(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
    DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
    DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
    int remaining;
    R *buf;

    buf = ii;
    BEGIN_SIMD();
    remaining = m;
    while (remaining > 0) {
        V in0, raw1, raw2, raw3, raw4, raw5;
        V tw1, tw2, tw3, tw4, tw5;
        V diff03, sum03, diff41, sum41, diff25, sum25;
        V odd_rot, odd_sum, odd_mid;
        V even_rot, even_sum, even_mid;

        in0 = LD(&(buf[0]), dist, &(buf[0]));
        raw3 = LD(&(buf[WS(ios, 3)]), dist, &(buf[WS(ios, 1)]));
        tw3 = BYTW(&(W[TWVL * 4]), raw3);
        diff03 = VSUB(in0, tw3);
        sum03 = VADD(in0, tw3);

        raw4 = LD(&(buf[WS(ios, 4)]), dist, &(buf[0]));
        tw4 = BYTW(&(W[TWVL * 6]), raw4);
        raw1 = LD(&(buf[WS(ios, 1)]), dist, &(buf[WS(ios, 1)]));
        tw1 = BYTW(&(W[0]), raw1);
        diff41 = VSUB(tw4, tw1);
        sum41 = VADD(tw4, tw1);

        raw2 = LD(&(buf[WS(ios, 2)]), dist, &(buf[0]));
        tw2 = BYTW(&(W[TWVL * 2]), raw2);
        raw5 = LD(&(buf[WS(ios, 5)]), dist, &(buf[WS(ios, 1)]));
        tw5 = BYTW(&(W[TWVL * 8]), raw5);
        diff25 = VSUB(tw2, tw5);
        sum25 = VADD(tw2, tw5);

        /* Outputs 1, 3, 5 come from the pair differences. */
        odd_rot = VBYI(VMUL(LDK(KP866025403), VSUB(diff25, diff41)));
        odd_sum = VADD(diff25, diff41);
        odd_mid = VFNMS(LDK(KP500000000), odd_sum, diff03);
        ST(&(buf[WS(ios, 1)]), VADD(odd_rot, odd_mid), dist, &(buf[WS(ios, 1)]));
        ST(&(buf[WS(ios, 3)]), VADD(diff03, odd_sum), dist, &(buf[WS(ios, 1)]));
        ST(&(buf[WS(ios, 5)]), VSUB(odd_mid, odd_rot), dist, &(buf[WS(ios, 1)]));

        /* Outputs 0, 2, 4 come from the pair sums. */
        even_rot = VBYI(VMUL(LDK(KP866025403), VSUB(sum25, sum41)));
        even_sum = VADD(sum25, sum41);
        even_mid = VFNMS(LDK(KP500000000), even_sum, sum03);
        ST(&(buf[WS(ios, 2)]), VSUB(even_mid, even_rot), dist, &(buf[0]));
        ST(&(buf[0]), VADD(sum03, even_sum), dist, &(buf[0]));
        ST(&(buf[WS(ios, 4)]), VADD(even_rot, even_mid), dist, &(buf[0]));

        remaining -= VL;
        buf += VL * dist;
        W += TWVL * 10;
    }
    END_SIMD();
    return W;
}
/*
 * t1fv_6: in-place six-input SIMD kernel with twiddle factors.
 *
 * Each pass loads six vectors from the ri buffer at offsets 0 and
 * WS(ios, 1..5); inputs 1..5 are passed through BYTWJ with the twiddle
 * entries at W[0], W[TWVL * 2], W[TWVL * 4], W[TWVL * 6], W[TWVL * 8].
 * The values are paired as (0,3), (4,1), (2,5), combined with the 0.5
 * and 0.866... constants and stored back to the same six offsets.
 * The counter starts at m and drops by VL per pass; the buffer advances
 * by VL * dist and W by TWVL * 10.
 *
 * ii is not referenced.  Returns W as advanced by the loop.
 * NOTE(review): presumably a generated size-6 forward DFT codelet --
 * confirm against the generator.
 */
static const R *t1fv_6(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
    DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
    DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
    int remaining;
    R *buf;

    buf = ri;
    BEGIN_SIMD();
    remaining = m;
    while (remaining > 0) {
        V in0, raw1, raw2, raw3, raw4, raw5;
        V tw1, tw2, tw3, tw4, tw5;
        V diff03, sum03, diff41, sum41, diff25, sum25;
        V odd_rot, odd_sum, odd_mid;
        V even_rot, even_sum, even_mid;

        in0 = LD(&(buf[0]), dist, &(buf[0]));
        raw3 = LD(&(buf[WS(ios, 3)]), dist, &(buf[WS(ios, 1)]));
        tw3 = BYTWJ(&(W[TWVL * 4]), raw3);
        diff03 = VSUB(in0, tw3);
        sum03 = VADD(in0, tw3);

        raw4 = LD(&(buf[WS(ios, 4)]), dist, &(buf[0]));
        tw4 = BYTWJ(&(W[TWVL * 6]), raw4);
        raw1 = LD(&(buf[WS(ios, 1)]), dist, &(buf[WS(ios, 1)]));
        tw1 = BYTWJ(&(W[0]), raw1);
        diff41 = VSUB(tw4, tw1);
        sum41 = VADD(tw4, tw1);

        raw2 = LD(&(buf[WS(ios, 2)]), dist, &(buf[0]));
        tw2 = BYTWJ(&(W[TWVL * 2]), raw2);
        raw5 = LD(&(buf[WS(ios, 5)]), dist, &(buf[WS(ios, 1)]));
        tw5 = BYTWJ(&(W[TWVL * 8]), raw5);
        diff25 = VSUB(tw2, tw5);
        sum25 = VADD(tw2, tw5);

        /* Outputs 3, 1, 5 come from the pair differences. */
        odd_rot = VBYI(VMUL(LDK(KP866025403), VSUB(diff41, diff25)));
        odd_sum = VADD(diff25, diff41);
        odd_mid = VFNMS(LDK(KP500000000), odd_sum, diff03);
        ST(&(buf[WS(ios, 3)]), VADD(diff03, odd_sum), dist, &(buf[WS(ios, 1)]));
        ST(&(buf[WS(ios, 1)]), VADD(odd_mid, odd_rot), dist, &(buf[WS(ios, 1)]));
        ST(&(buf[WS(ios, 5)]), VSUB(odd_mid, odd_rot), dist, &(buf[WS(ios, 1)]));

        /* Outputs 0, 4, 2 come from the pair sums. */
        even_rot = VBYI(VMUL(LDK(KP866025403), VSUB(sum41, sum25)));
        even_sum = VADD(sum25, sum41);
        even_mid = VFNMS(LDK(KP500000000), even_sum, sum03);
        ST(&(buf[0]), VADD(sum03, even_sum), dist, &(buf[0]));
        ST(&(buf[WS(ios, 4)]), VADD(even_mid, even_rot), dist, &(buf[0]));
        ST(&(buf[WS(ios, 2)]), VSUB(even_mid, even_rot), dist, &(buf[0]));

        remaining -= VL;
        buf += VL * dist;
        W += TWVL * 10;
    }
    END_SIMD();
    return W;
}
static void n2bv_7(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs) { DVK(KP222520933, +0.222520933956314404288902564496794759466355569); DVK(KP900968867, +0.900968867902419126236102319507445051165919162); DVK(KP623489801, +0.623489801858733530525004884004239810632274731); DVK(KP433883739, +0.433883739117558120475768332848358754609990728); DVK(KP781831482, +0.781831482468029808708444526674057750232334519); DVK(KP974927912, +0.974927912181823607018131682993931217232785801); int i; const R *xi; R *xo; xi = ii; xo = io; BEGIN_SIMD(); for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) { V Tb, T9, Tc, T3, Te, T6, Td, T7, T8, Ti, Tj; Tb = LD(&(xi[0]), ivs, &(xi[0])); T7 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); T8 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); T9 = VSUB(T7, T8); Tc = VADD(T7, T8); { V T1, T2, T4, T5; T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); T2 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); T3 = VSUB(T1, T2); Te = VADD(T1, T2); T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); T5 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); T6 = VSUB(T4, T5); Td = VADD(T4, T5); } ST(&(xo[0]), VADD(Tb, VADD(Te, VADD(Tc, Td))), ovs, &(xo[0])); Ti = VBYI(VFNMS(LDK(KP781831482), T6, VFNMS(LDK(KP433883739), T9, VMUL(LDK(KP974927912), T3)))); Tj = VFMA(LDK(KP623489801), Td, VFNMS(LDK(KP900968867), Tc, VFNMS(LDK(KP222520933), Te, Tb))); ST(&(xo[4]), VADD(Ti, Tj), ovs, &(xo[0])); ST(&(xo[10]), VSUB(Tj, Ti), ovs, &(xo[2])); { V Ta, Tf, Tg, Th; Ta = VBYI(VFMA(LDK(KP433883739), T3, VFNMS(LDK(KP781831482), T9, VMUL(LDK(KP974927912), T6)))); Tf = VFMA(LDK(KP623489801), Tc, VFNMS(LDK(KP222520933), Td, VFNMS(LDK(KP900968867), Te, Tb))); ST(&(xo[6]), VADD(Ta, Tf), ovs, &(xo[2])); ST(&(xo[8]), VSUB(Tf, Ta), ovs, &(xo[0])); Tg = VBYI(VFMA(LDK(KP781831482), T3, VFMA(LDK(KP974927912), T9, VMUL(LDK(KP433883739), T6)))); Th = VFMA(LDK(KP623489801), Te, VFNMS(LDK(KP900968867), Td, VFNMS(LDK(KP222520933), Tc, Tb))); ST(&(xo[2]), VADD(Tg, Th), 
ovs, &(xo[2])); ST(&(xo[12]), VSUB(Th, Tg), ovs, &(xo[0])); } } END_SIMD(); }
/*
 * t1fv_2: in-place two-input SIMD kernel with a twiddle factor.
 *
 * Each pass loads two vectors from the ri buffer at offsets 0 and
 * WS(ios, 1), passes the second through BYTWJ with the twiddle entry at
 * W[0], and stores the difference at WS(ios, 1) and the sum at 0.
 * The counter starts at m and drops by VL per pass; the buffer advances
 * by VL * dist and W by TWVL * 2.
 *
 * ii is not referenced.  Returns W as advanced by the loop.
 */
static const R *t1fv_2(R *ri, R *ii, const R *W, stride ios, int m, int dist)
{
    int remaining;
    R *buf;

    buf = ri;
    BEGIN_SIMD();
    remaining = m;
    while (remaining > 0) {
        V in0, raw1, tw1;

        in0 = LD(&(buf[0]), dist, &(buf[0]));
        raw1 = LD(&(buf[WS(ios, 1)]), dist, &(buf[WS(ios, 1)]));
        tw1 = BYTWJ(&(W[0]), raw1);
        ST(&(buf[WS(ios, 1)]), VSUB(in0, tw1), dist, &(buf[WS(ios, 1)]));
        ST(&(buf[0]), VADD(in0, tw1), dist, &(buf[0]));

        remaining -= VL;
        buf += VL * dist;
        W += TWVL * 2;
    }
    END_SIMD();
    return W;
}
/*
 * n2fv_2: two-input SIMD kernel without twiddle factors.
 *
 * Each pass loads two vectors from the ri array at offsets 0 and
 * WS(is, 1), then stores their difference at xo[2] and their sum at
 * xo[0] of the ro array.  The counter starts at v and drops by VL;
 * input/output advance by VL * ivs and VL * ovs.
 *
 * ii, io and os are not referenced.
 */
static void n2fv_2(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
    int remaining;
    const R *src;
    R *dst;

    src = ri;
    dst = ro;
    BEGIN_SIMD();
    remaining = v;
    while (remaining > 0) {
        V in0, in1;

        in0 = LD(&(src[0]), ivs, &(src[0]));
        in1 = LD(&(src[WS(is, 1)]), ivs, &(src[WS(is, 1)]));
        ST(&(dst[2]), VSUB(in0, in1), ovs, &(dst[2]));
        ST(&(dst[0]), VADD(in0, in1), ovs, &(dst[0]));

        remaining -= VL;
        src += VL * ivs;
        dst += VL * ovs;
    }
    END_SIMD();
}
/*
 * n2fv_5: five-input SIMD kernel without twiddle factors.
 *
 * Each pass loads five vectors from the ri array at offsets 0 and
 * WS(is, 1..4), pairs them as (1,4) and (2,3), combines the pair
 * sums/differences with the four DVK constants, and stores five results
 * to the ro array at the fixed offsets xo[0], xo[2], ..., xo[8].
 * The counter starts at v and drops by VL; input/output advance by
 * VL * ivs and VL * ovs.
 *
 * ii, io and os are not referenced.
 * NOTE(review): presumably a generated size-5 forward DFT codelet with
 * a fixed output layout -- confirm against the generator.
 */
static void n2fv_5(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
    DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
    DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
    DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
    DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
    int remaining;
    const R *src;
    R *dst;

    src = ri;
    dst = ro;
    BEGIN_SIMD();
    remaining = v;
    while (remaining > 0) {
        V in0, in1, in2, in3, in4;
        V sum14, sum23, diff14, diff23, scaled, total;
        V rot_a, rot_b, mid, base_a, base_b;

        in0 = LD(&(src[0]), ivs, &(src[0]));

        in1 = LD(&(src[WS(is, 1)]), ivs, &(src[WS(is, 1)]));
        in4 = LD(&(src[WS(is, 4)]), ivs, &(src[0]));
        sum14 = VADD(in1, in4);
        in2 = LD(&(src[WS(is, 2)]), ivs, &(src[0]));
        in3 = LD(&(src[WS(is, 3)]), ivs, &(src[WS(is, 1)]));
        sum23 = VADD(in2, in3);
        scaled = VMUL(LDK(KP559016994), VSUB(sum14, sum23));
        diff23 = VSUB(in2, in3);
        total = VADD(sum14, sum23);
        diff14 = VSUB(in1, in4);

        ST(&(dst[0]), VADD(in0, total), ovs, &(dst[0]));

        rot_a = VBYI(VFMA(LDK(KP951056516), diff14, VMUL(LDK(KP587785252), diff23)));
        rot_b = VBYI(VFNMS(LDK(KP587785252), diff14, VMUL(LDK(KP951056516), diff23)));
        mid = VFNMS(LDK(KP250000000), total, in0);
        base_a = VADD(scaled, mid);
        base_b = VSUB(mid, scaled);
        ST(&(dst[2]), VSUB(base_a, rot_a), ovs, &(dst[2]));
        ST(&(dst[6]), VSUB(base_b, rot_b), ovs, &(dst[2]));
        ST(&(dst[8]), VADD(rot_a, base_a), ovs, &(dst[0]));
        ST(&(dst[4]), VADD(rot_b, base_b), ovs, &(dst[0]));

        remaining -= VL;
        src += VL * ivs;
        dst += VL * ovs;
    }
    END_SIMD();
}
/*
 * q1fv_2: in-place SIMD kernel over a 2 x 2 group of vectors.
 *
 * Each pass loads four vectors from the ri buffer at offsets 0,
 * WS(is, 1), WS(vs, 1) and WS(vs, 1) + WS(is, 1).  For each of the two
 * (is-adjacent) pairs it forms the sum and the difference, the latter
 * passed through BYTWJ with the twiddle entry at W[0].  The results are
 * written back with the is/vs roles swapped: the first pair's twiddled
 * difference goes to WS(vs, 1) and the second pair's sum to WS(is, 1).
 * A counter runs from 0 up to m in steps of VL; the buffer advances by
 * VL * dist and W by TWVL * 2 per pass.
 *
 * ii is not referenced.  Returns W as advanced by the loop.
 */
static const R *q1fv_2(R *ri, R *ii, const R *W, stride is, stride vs, int m, int dist)
{
    int done;
    R *buf;

    buf = ri;
    BEGIN_SIMD();
    for (done = 0; done < m; done += VL) {
        V a0, a1, a_tw, b0, b1, b_tw;

        a0 = LD(&(buf[0]), dist, &(buf[0]));
        a1 = LD(&(buf[WS(is, 1)]), dist, &(buf[WS(is, 1)]));
        a_tw = BYTWJ(&(W[0]), VSUB(a0, a1));

        b0 = LD(&(buf[WS(vs, 1)]), dist, &(buf[WS(vs, 1)]));
        b1 = LD(&(buf[WS(vs, 1) + WS(is, 1)]), dist, &(buf[WS(vs, 1) + WS(is, 1)]));
        b_tw = BYTWJ(&(W[0]), VSUB(b0, b1));

        /* All loads are complete before any location is overwritten. */
        ST(&(buf[WS(vs, 1)]), a_tw, dist, &(buf[WS(vs, 1)]));
        ST(&(buf[WS(vs, 1) + WS(is, 1)]), b_tw, dist, &(buf[WS(vs, 1) + WS(is, 1)]));
        ST(&(buf[0]), VADD(a0, a1), dist, &(buf[0]));
        ST(&(buf[WS(is, 1)]), VADD(b0, b1), dist, &(buf[WS(is, 1)]));

        buf += VL * dist;
        W += TWVL * 2;
    }
    END_SIMD();
    return W;
}
/*
 * n2fv_3: three-input SIMD kernel without twiddle factors.
 *
 * Each pass loads three vectors from the ri array at offsets 0,
 * WS(is, 1) and WS(is, 2), combines them with the 0.5 and 0.866...
 * constants, and stores three results to the ro array at the fixed
 * offsets xo[0], xo[4] and xo[2].  The counter starts at v and drops by
 * VL; input/output advance by VL * ivs and VL * ovs.
 *
 * ii, io and os are not referenced.
 * NOTE(review): presumably a generated size-3 forward DFT codelet with
 * a fixed output layout -- confirm against the generator.
 */
static void n2fv_3(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs)
{
    DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
    DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
    int remaining;
    const R *src;
    R *dst;

    src = ri;
    dst = ro;
    BEGIN_SIMD();
    remaining = v;
    while (remaining > 0) {
        V in0, in1, in2, sum12, rot, centre;

        in0 = LD(&(src[0]), ivs, &(src[0]));
        in1 = LD(&(src[WS(is, 1)]), ivs, &(src[WS(is, 1)]));
        in2 = LD(&(src[WS(is, 2)]), ivs, &(src[0]));
        sum12 = VADD(in1, in2);
        rot = VBYI(VMUL(LDK(KP866025403), VSUB(in2, in1)));
        ST(&(dst[0]), VADD(in0, sum12), ovs, &(dst[0]));

        /* centre = in0 - 0.5 * sum12 */
        centre = VFNMS(LDK(KP500000000), sum12, in0);
        ST(&(dst[4]), VSUB(centre, rot), ovs, &(dst[0]));
        ST(&(dst[2]), VADD(centre, rot), ovs, &(dst[2]));

        remaining -= VL;
        src += VL * ivs;
        dst += VL * ovs;
    }
    END_SIMD();
}
/*
 * n2bv_9: nine-input SIMD kernel without twiddle factors.
 *
 * Each loop pass loads nine vectors from the ii array at xi[0] and
 * xi[WS(is, 1)] .. xi[WS(is, 8)], grouped as (0,3,6), (2,5,8) and
 * (1,4,7), combines them using the DVK constants declared at the top,
 * and stores nine result vectors to the io array at the fixed offsets
 * xo[0], xo[2], ..., xo[16].  The counter starts at v and drops by VL
 * per pass while xi and xo advance by VL * ivs and VL * ovs.
 *
 * ri, ro and os are not referenced in the body.
 * NOTE(review): name and constants suggest a generated size-9 backward
 * DFT codelet; the last LD/ST argument looks like an alignment
 * reference (element 0 for even indices, element 1 / xo[2] for odd
 * ones) -- confirm against the SIMD macro definitions.
 */
static void n2bv_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs) { DVK(KP342020143, +0.342020143325668733044099614682259580763083368); DVK(KP813797681, +0.813797681349373692844693217248393223289101568); DVK(KP939692620, +0.939692620785908384054109277324731469936208134); DVK(KP296198132, +0.296198132726023843175338011893050938967728390); DVK(KP642787609, +0.642787609686539326322643409907263432907559884); DVK(KP663413948, +0.663413948168938396205421319635891297216863310); DVK(KP556670399, +0.556670399226419366452912952047023132968291906); DVK(KP766044443, +0.766044443118978035202392650555416673935832457); DVK(KP984807753, +0.984807753012208059366743024589523013670643252); DVK(KP150383733, +0.150383733180435296639271897612501926072238258); DVK(KP852868531, +0.852868531952443209628250963940074071936020296); DVK(KP173648177, +0.173648177666930348851716626769314796000375677); DVK(KP866025403, +0.866025403784438646763723170752936183471402627); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); int i; const R *xi; R *xo; xi = ii; xo = io; BEGIN_SIMD(); for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) { V T5, Ty, Tm, Ti, Tw, Th, Tj, To, Tb, Tv, Ta, Tc, Tn; { V T1, T2, T3, T4; T1 = LD(&(xi[0]), ivs, &(xi[0])); T2 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); T3 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); T4 = VADD(T2, T3); T5 = VFNMS(LDK(KP500000000), T4, T1); Ty = VADD(T1, T4); Tm = VMUL(LDK(KP866025403), VSUB(T2, T3)); } { V Td, Tg, Te, Tf; Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); Tf = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); Tg = VADD(Te, Tf); Ti = VSUB(Te, Tf); Tw = VADD(Td, Tg); Th = VFNMS(LDK(KP500000000), Tg, Td); Tj = VFNMS(LDK(KP852868531), Ti, VMUL(LDK(KP173648177), Th)); To = VFMA(LDK(KP150383733), Ti, VMUL(LDK(KP984807753), Th)); } { V T6, T9, T7, T8; T6 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); T7 = LD(&(xi[WS(is, 4)]), ivs, 
&(xi[0])); T8 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); T9 = VADD(T7, T8); Tb = VSUB(T7, T8); Tv = VADD(T6, T9); Ta = VFNMS(LDK(KP500000000), T9, T6); Tc = VFNMS(LDK(KP556670399), Tb, VMUL(LDK(KP766044443), Ta)); Tn = VFMA(LDK(KP663413948), Tb, VMUL(LDK(KP642787609), Ta)); } { V Tx, Tz, TA, Tt, Tu; Tx = VBYI(VMUL(LDK(KP866025403), VSUB(Tv, Tw))); Tz = VADD(Tv, Tw); TA = VFNMS(LDK(KP500000000), Tz, Ty); ST(&(xo[6]), VADD(Tx, TA), ovs, &(xo[2])); ST(&(xo[0]), VADD(Ty, Tz), ovs, &(xo[0])); ST(&(xo[12]), VSUB(TA, Tx), ovs, &(xo[0])); Tt = VFMA(LDK(KP852868531), Tb, VFMA(LDK(KP173648177), Ta, VFMA(LDK(KP296198132), Ti, VFNMS(LDK(KP939692620), Th, T5)))); Tu = VBYI(VSUB(VFMA(LDK(KP984807753), Ta, VFMA(LDK(KP813797681), Ti, VFNMS(LDK(KP150383733), Tb, VMUL(LDK(KP342020143), Th)))), Tm)); ST(&(xo[14]), VSUB(Tt, Tu), ovs, &(xo[2])); ST(&(xo[4]), VADD(Tt, Tu), ovs, &(xo[0])); { V Tl, Ts, Tq, Tr, Tk, Tp; Tk = VADD(Tc, Tj); Tl = VADD(T5, Tk); Ts = VFMA(LDK(KP866025403), VSUB(To, Tn), VFNMS(LDK(KP500000000), Tk, T5)); Tp = VADD(Tn, To); Tq = VBYI(VADD(Tm, Tp)); Tr = VBYI(VADD(Tm, VFNMS(LDK(KP500000000), Tp, VMUL(LDK(KP866025403), VSUB(Tc, Tj))))); ST(&(xo[16]), VSUB(Tl, Tq), ovs, &(xo[0])); ST(&(xo[10]), VSUB(Ts, Tr), ovs, &(xo[2])); ST(&(xo[2]), VADD(Tl, Tq), ovs, &(xo[2])); ST(&(xo[8]), VADD(Tr, Ts), ovs, &(xo[0])); } } } END_SIMD(); }
/*
 * n1fv_15: fifteen-input SIMD kernel without twiddle factors.
 *
 * Each loop pass loads fifteen vectors from the ri array at xi[0] and
 * xi[WS(is, 1)] .. xi[WS(is, 14)], grouped in triples (0,5,10),
 * (3,8,13), (9,14,4), (12,2,7) and (6,11,1), combines them using the
 * DVK constants declared at the top, and stores fifteen result vectors
 * to the ro array at xo[0] and xo[WS(os, 1)] .. xo[WS(os, 14)].
 * The counter starts at v and drops by VL per pass while xi and xo
 * advance by VL * ivs and VL * ovs.
 *
 * ii and io are not referenced in the body.
 * NOTE(review): name and constants suggest a generated size-15 forward
 * DFT codelet; the last LD/ST argument looks like an alignment
 * reference (element 0 for even indices, element 1 for odd ones) --
 * confirm against the SIMD macro definitions.
 */
static void n1fv_15(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs) { DVK(KP216506350, +0.216506350946109661690930792688234045867850657); DVK(KP509036960, +0.509036960455127183450980863393907648510733164); DVK(KP823639103, +0.823639103546331925877420039278190003029660514); DVK(KP587785252, +0.587785252292473129168705954639072768597652438); DVK(KP951056516, +0.951056516295153572116439333379382143405698634); DVK(KP250000000, +0.250000000000000000000000000000000000000000000); DVK(KP559016994, +0.559016994374947424102293417182819058860154590); DVK(KP866025403, +0.866025403784438646763723170752936183471402627); DVK(KP484122918, +0.484122918275927110647408174972799951354115213); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); int i; const R *xi; R *xo; xi = ri; xo = ro; BEGIN_SIMD(); for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) { V T5, T10, TB, TO, TU, TV, TR, Ta, Tf, Tg, Tl, Tq, Tr, TE, TH; V TI, TZ, T11, T1f, T1g; { V T1, T2, T3, T4; T1 = LD(&(xi[0]), ivs, &(xi[0])); T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); T3 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0])); T4 = VADD(T2, T3); T5 = VADD(T1, T4); T10 = VSUB(T3, T2); TB = VFNMS(LDK(KP500000000), T4, T1); } { V T6, T9, TC, TP, Tm, Tp, TG, TN, Tb, Te, TD, TQ, Th, Tk, TF; V TM, TX, TY; { V T7, T8, Tn, To; T6 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); T7 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); T8 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)])); T9 = VADD(T7, T8); TC = VFNMS(LDK(KP500000000), T9, T6); TP = VSUB(T8, T7); Tm = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); Tn = LD(&(xi[WS(is, 14)]), ivs, &(xi[0])); To = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); Tp = VADD(Tn, To); TG = VFNMS(LDK(KP500000000), Tp, Tm); TN = VSUB(To, Tn); } { V Tc, Td, Ti, Tj; Tb = LD(&(xi[WS(is, 12)]), ivs, &(xi[0])); Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); Te = VADD(Tc, Td); TD = VFNMS(LDK(KP500000000), Te, Tb); 
TQ = VSUB(Td, Tc); Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); Ti = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)])); Tj = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); Tk = VADD(Ti, Tj); TF = VFNMS(LDK(KP500000000), Tk, Th); TM = VSUB(Tj, Ti); } TO = VSUB(TM, TN); TU = VSUB(TF, TG); TV = VSUB(TC, TD); TR = VSUB(TP, TQ); Ta = VADD(T6, T9); Tf = VADD(Tb, Te); Tg = VADD(Ta, Tf); Tl = VADD(Th, Tk); Tq = VADD(Tm, Tp); Tr = VADD(Tl, Tq); TE = VADD(TC, TD); TH = VADD(TF, TG); TI = VADD(TE, TH); TX = VADD(TP, TQ); TY = VADD(TM, TN); TZ = VMUL(LDK(KP484122918), VSUB(TX, TY)); T11 = VADD(TX, TY); } T1f = VADD(TB, TI); T1g = VBYI(VMUL(LDK(KP866025403), VADD(T10, T11))); ST(&(xo[WS(os, 5)]), VSUB(T1f, T1g), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 10)]), VADD(T1f, T1g), ovs, &(xo[0])); { V Tu, Ts, Tt, Ty, TA, Tw, Tx, Tz, Tv; Tu = VMUL(LDK(KP559016994), VSUB(Tg, Tr)); Ts = VADD(Tg, Tr); Tt = VFNMS(LDK(KP250000000), Ts, T5); Tw = VSUB(Tl, Tq); Tx = VSUB(Ta, Tf); Ty = VBYI(VFNMS(LDK(KP587785252), Tx, VMUL(LDK(KP951056516), Tw))); TA = VBYI(VFMA(LDK(KP951056516), Tx, VMUL(LDK(KP587785252), Tw))); ST(&(xo[0]), VADD(T5, Ts), ovs, &(xo[0])); Tz = VADD(Tu, Tt); ST(&(xo[WS(os, 6)]), VSUB(Tz, TA), ovs, &(xo[0])); ST(&(xo[WS(os, 9)]), VADD(TA, Tz), ovs, &(xo[WS(os, 1)])); Tv = VSUB(Tt, Tu); ST(&(xo[WS(os, 3)]), VSUB(Tv, Ty), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 12)]), VADD(Ty, Tv), ovs, &(xo[0])); } { V TS, TW, T1b, T18, T13, T1a, TL, T17, T12, TJ, TK; TS = VFNMS(LDK(KP509036960), TR, VMUL(LDK(KP823639103), TO)); TW = VFNMS(LDK(KP587785252), TV, VMUL(LDK(KP951056516), TU)); T1b = VFMA(LDK(KP951056516), TV, VMUL(LDK(KP587785252), TU)); T18 = VFMA(LDK(KP823639103), TR, VMUL(LDK(KP509036960), TO)); T12 = VFNMS(LDK(KP216506350), T11, VMUL(LDK(KP866025403), T10)); T13 = VSUB(TZ, T12); T1a = VADD(TZ, T12); TJ = VFNMS(LDK(KP250000000), TI, TB); TK = VMUL(LDK(KP559016994), VSUB(TE, TH)); TL = VSUB(TJ, TK); T17 = VADD(TK, TJ); { V TT, T14, T1d, T1e; TT = VSUB(TL, TS); T14 = VBYI(VSUB(TW, T13)); 
ST(&(xo[WS(os, 8)]), VSUB(TT, T14), ovs, &(xo[0])); ST(&(xo[WS(os, 7)]), VADD(TT, T14), ovs, &(xo[WS(os, 1)])); T1d = VSUB(T17, T18); T1e = VBYI(VADD(T1b, T1a)); ST(&(xo[WS(os, 11)]), VSUB(T1d, T1e), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 4)]), VADD(T1d, T1e), ovs, &(xo[0])); } { V T15, T16, T19, T1c; T15 = VADD(TL, TS); T16 = VBYI(VADD(TW, T13)); ST(&(xo[WS(os, 13)]), VSUB(T15, T16), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 2)]), VADD(T15, T16), ovs, &(xo[0])); T19 = VADD(T17, T18); T1c = VBYI(VSUB(T1a, T1b)); ST(&(xo[WS(os, 14)]), VSUB(T19, T1c), ovs, &(xo[0])); ST(&(xo[WS(os, 1)]), VADD(T19, T1c), ovs, &(xo[WS(os, 1)])); } } } END_SIMD(); }
/*
 * n1fv_10: ten-input SIMD kernel without twiddle factors.
 *
 * Each loop pass loads ten vectors from the ri array at xi[0] and
 * xi[WS(is, 1)] .. xi[WS(is, 9)], paired as (0,5), (2,7), (6,1), (8,3)
 * and (4,9).  The pair differences feed the odd-numbered outputs
 * (1, 3, 5, 7, 9) and the pair sums feed the even-numbered outputs
 * (0, 2, 4, 6, 8); results are stored to the ro array at xo[0] and
 * xo[WS(os, 1)] .. xo[WS(os, 9)].  The counter starts at v and drops by
 * VL per pass while xi and xo advance by VL * ivs and VL * ovs.
 *
 * ii and io are not referenced in the body.
 * NOTE(review): name and constants suggest a generated size-10 forward
 * DFT codelet; the last LD/ST argument looks like an alignment
 * reference (element 0 for even indices, element 1 for odd ones) --
 * confirm against the SIMD macro definitions.
 */
static void n1fv_10(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs) { DVK(KP250000000, +0.250000000000000000000000000000000000000000000); DVK(KP559016994, +0.559016994374947424102293417182819058860154590); DVK(KP587785252, +0.587785252292473129168705954639072768597652438); DVK(KP951056516, +0.951056516295153572116439333379382143405698634); int i; const R *xi; R *xo; xi = ri; xo = ro; BEGIN_SIMD(); for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) { V Ti, Ty, Tm, Tn, Tw, Tt, Tz, TA, TB, T7, Te, Tj, Tg, Th; Tg = LD(&(xi[0]), ivs, &(xi[0])); Th = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); Ti = VSUB(Tg, Th); Ty = VADD(Tg, Th); { V T3, Tu, Td, Ts, T6, Tv, Ta, Tr; { V T1, T2, Tb, Tc; T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); T3 = VSUB(T1, T2); Tu = VADD(T1, T2); Tb = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); Td = VSUB(Tb, Tc); Ts = VADD(Tb, Tc); } { V T4, T5, T8, T9; T4 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); T5 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); T6 = VSUB(T4, T5); Tv = VADD(T4, T5); T8 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); Ta = VSUB(T8, T9); Tr = VADD(T8, T9); } Tm = VSUB(T3, T6); Tn = VSUB(Ta, Td); Tw = VSUB(Tu, Tv); Tt = VSUB(Tr, Ts); Tz = VADD(Tu, Tv); TA = VADD(Tr, Ts); TB = VADD(Tz, TA); T7 = VADD(T3, T6); Te = VADD(Ta, Td); Tj = VADD(T7, Te); } ST(&(xo[WS(os, 5)]), VADD(Ti, Tj), ovs, &(xo[WS(os, 1)])); ST(&(xo[0]), VADD(Ty, TB), ovs, &(xo[0])); { V To, Tq, Tl, Tp, Tf, Tk; To = VBYI(VFMA(LDK(KP951056516), Tm, VMUL(LDK(KP587785252), Tn))); Tq = VBYI(VFNMS(LDK(KP587785252), Tm, VMUL(LDK(KP951056516), Tn))); Tf = VMUL(LDK(KP559016994), VSUB(T7, Te)); Tk = VFNMS(LDK(KP250000000), Tj, Ti); Tl = VADD(Tf, Tk); Tp = VSUB(Tk, Tf); ST(&(xo[WS(os, 1)]), VSUB(Tl, To), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 7)]), VADD(Tq, Tp), ovs, &(xo[WS(os, 1)])); 
ST(&(xo[WS(os, 9)]), VADD(To, Tl), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 3)]), VSUB(Tp, Tq), ovs, &(xo[WS(os, 1)])); } { V Tx, TF, TE, TG, TC, TD; Tx = VBYI(VFNMS(LDK(KP587785252), Tw, VMUL(LDK(KP951056516), Tt))); TF = VBYI(VFMA(LDK(KP951056516), Tw, VMUL(LDK(KP587785252), Tt))); TC = VFNMS(LDK(KP250000000), TB, Ty); TD = VMUL(LDK(KP559016994), VSUB(Tz, TA)); TE = VSUB(TC, TD); TG = VADD(TD, TC); ST(&(xo[WS(os, 2)]), VADD(Tx, TE), ovs, &(xo[0])); ST(&(xo[WS(os, 6)]), VSUB(TG, TF), ovs, &(xo[0])); ST(&(xo[WS(os, 8)]), VSUB(TE, Tx), ovs, &(xo[0])); ST(&(xo[WS(os, 4)]), VADD(TF, TG), ovs, &(xo[0])); } } END_SIMD(); }
/*
 * n2fv_13: thirteen-input SIMD kernel without twiddle factors.
 *
 * Each loop pass loads thirteen vectors from the ri array at xi[0] and
 * xi[WS(is, 1)] .. xi[WS(is, 12)], combines them through a sequence of
 * sums, differences and multiply-adds with the DVK constants declared
 * at the top, and stores thirteen result vectors to the ro array at the
 * fixed offsets xo[0], xo[2], ..., xo[24].  The counter starts at v and
 * drops by VL per pass while xi and xo advance by VL * ivs and
 * VL * ovs.
 *
 * ii, io and os are not referenced in the body.
 * NOTE(review): name and constants suggest a generated size-13 forward
 * DFT codelet with a fixed output layout; the arithmetic has not been
 * independently re-derived here.  The last LD/ST argument looks like an
 * alignment reference (element 0 / xo[0] for even indices, element 1 /
 * xo[2] for odd ones) -- confirm against the SIMD macro definitions.
 */
static void n2fv_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs) { DVK(KP2_000000000, +2.000000000000000000000000000000000000000000000); DVK(KP083333333, +0.083333333333333333333333333333333333333333333); DVK(KP075902986, +0.075902986037193865983102897245103540356428373); DVK(KP251768516, +0.251768516431883313623436926934233488546674281); DVK(KP132983124, +0.132983124607418643793760531921092974399165133); DVK(KP258260390, +0.258260390311744861420450644284508567852516811); DVK(KP1_732050807, +1.732050807568877293527446341505872366942805254); DVK(KP300238635, +0.300238635966332641462884626667381504676006424); DVK(KP011599105, +0.011599105605768290721655456654083252189827041); DVK(KP156891391, +0.156891391051584611046832726756003269660212636); DVK(KP256247671, +0.256247671582936600958684654061725059144125175); DVK(KP174138601, +0.174138601152135905005660794929264742616964676); DVK(KP575140729, +0.575140729474003121368385547455453388461001608); DVK(KP503537032, +0.503537032863766627246873853868466977093348562); DVK(KP113854479, +0.113854479055790798974654345867655310534642560); DVK(KP265966249, +0.265966249214837287587521063842185948798330267); DVK(KP387390585, +0.387390585467617292130675966426762851778775217); DVK(KP300462606, +0.300462606288665774426601772289207995520941381); DVK(KP866025403, +0.866025403784438646763723170752936183471402627); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); int i; const R *xi; R *xo; xi = ri; xo = ro; BEGIN_SIMD(); for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) { V TW, Tb, Tm, Tu, TC, TR, TX, TK, TU, Tz, TB, TN, TT; TW = LD(&(xi[0]), ivs, &(xi[0])); { V T3, TH, Tl, Tw, Tp, Tg, Tv, To, T6, Tr, T9, Ts, Ta, TI, T1; V T2, Tq, Tt; T1 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); T3 = VSUB(T1, T2); TH = VADD(T1, T2); { V Th, Ti, Tj, Tk; Th = LD(&(xi[WS(is, 12)]), ivs, &(xi[0])); Ti = LD(&(xi[WS(is, 10)]), ivs, 
&(xi[0])); Tj = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); Tk = VADD(Ti, Tj); Tl = VADD(Th, Tk); Tw = VSUB(Ti, Tj); Tp = VFNMS(LDK(KP500000000), Tk, Th); } { V Tc, Td, Te, Tf; Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); Td = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); Te = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); Tf = VADD(Td, Te); Tg = VADD(Tc, Tf); Tv = VSUB(Td, Te); To = VFNMS(LDK(KP500000000), Tf, Tc); } { V T4, T5, T7, T8; T4 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)])); T5 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); T6 = VSUB(T4, T5); Tr = VADD(T4, T5); T7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); T9 = VSUB(T7, T8); Ts = VADD(T7, T8); } Ta = VADD(T6, T9); TI = VADD(Tr, Ts); Tb = VADD(T3, Ta); Tm = VSUB(Tg, Tl); Tq = VSUB(To, Tp); Tt = VMUL(LDK(KP866025403), VSUB(Tr, Ts)); Tu = VADD(Tq, Tt); TC = VSUB(Tq, Tt); { V TP, TQ, TG, TJ; TP = VADD(Tg, Tl); TQ = VADD(TH, TI); TR = VMUL(LDK(KP300462606), VSUB(TP, TQ)); TX = VADD(TP, TQ); TG = VADD(To, Tp); TJ = VFNMS(LDK(KP500000000), TI, TH); TK = VSUB(TG, TJ); TU = VADD(TG, TJ); } { V Tx, Ty, TL, TM; Tx = VMUL(LDK(KP866025403), VSUB(Tv, Tw)); Ty = VFNMS(LDK(KP500000000), Ta, T3); Tz = VSUB(Tx, Ty); TB = VADD(Tx, Ty); TL = VADD(Tv, Tw); TM = VSUB(T6, T9); TN = VSUB(TL, TM); TT = VADD(TL, TM); } } ST(&(xo[0]), VADD(TW, TX), ovs, &(xo[0])); { V T19, T1n, T14, T13, T1f, T1k, Tn, TE, T1e, T1j, TS, T1m, TZ, T1c, TA; V TD; { V T17, T18, T11, T12; T17 = VFMA(LDK(KP387390585), TN, VMUL(LDK(KP265966249), TK)); T18 = VFNMS(LDK(KP503537032), TU, VMUL(LDK(KP113854479), TT)); T19 = VSUB(T17, T18); T1n = VADD(T17, T18); T14 = VFMA(LDK(KP575140729), Tm, VMUL(LDK(KP174138601), Tb)); T11 = VFNMS(LDK(KP156891391), TB, VMUL(LDK(KP256247671), TC)); T12 = VFMA(LDK(KP011599105), Tz, VMUL(LDK(KP300238635), Tu)); T13 = VSUB(T11, T12); T1f = VADD(T14, T13); T1k = VMUL(LDK(KP1_732050807), VADD(T11, T12)); } Tn = VFNMS(LDK(KP174138601), Tm, VMUL(LDK(KP575140729), Tb)); TA = VFNMS(LDK(KP300238635), 
Tz, VMUL(LDK(KP011599105), Tu)); TD = VFMA(LDK(KP256247671), TB, VMUL(LDK(KP156891391), TC)); TE = VSUB(TA, TD); T1e = VMUL(LDK(KP1_732050807), VADD(TD, TA)); T1j = VSUB(Tn, TE); { V TO, T1b, TV, TY, T1a; TO = VFNMS(LDK(KP132983124), TN, VMUL(LDK(KP258260390), TK)); T1b = VSUB(TR, TO); TV = VFMA(LDK(KP251768516), TT, VMUL(LDK(KP075902986), TU)); TY = VFNMS(LDK(KP083333333), TX, TW); T1a = VSUB(TY, TV); TS = VFMA(LDK(KP2_000000000), TO, TR); T1m = VADD(T1b, T1a); TZ = VFMA(LDK(KP2_000000000), TV, TY); T1c = VSUB(T1a, T1b); } { V TF, T10, T1l, T1o; TF = VBYI(VFMA(LDK(KP2_000000000), TE, Tn)); T10 = VADD(TS, TZ); ST(&(xo[2]), VADD(TF, T10), ovs, &(xo[2])); ST(&(xo[24]), VSUB(T10, TF), ovs, &(xo[0])); { V T15, T16, T1p, T1q; T15 = VBYI(VFMS(LDK(KP2_000000000), T13, T14)); T16 = VSUB(TZ, TS); ST(&(xo[10]), VADD(T15, T16), ovs, &(xo[2])); ST(&(xo[16]), VSUB(T16, T15), ovs, &(xo[0])); T1p = VADD(T1n, T1m); T1q = VBYI(VADD(T1j, T1k)); ST(&(xo[8]), VSUB(T1p, T1q), ovs, &(xo[0])); ST(&(xo[18]), VADD(T1q, T1p), ovs, &(xo[2])); } T1l = VBYI(VSUB(T1j, T1k)); T1o = VSUB(T1m, T1n); ST(&(xo[6]), VADD(T1l, T1o), ovs, &(xo[2])); ST(&(xo[20]), VSUB(T1o, T1l), ovs, &(xo[0])); { V T1h, T1i, T1d, T1g; T1h = VBYI(VSUB(T1e, T1f)); T1i = VSUB(T1c, T19); ST(&(xo[12]), VADD(T1h, T1i), ovs, &(xo[0])); ST(&(xo[14]), VSUB(T1i, T1h), ovs, &(xo[2])); T1d = VADD(T19, T1c); T1g = VBYI(VADD(T1e, T1f)); ST(&(xo[4]), VSUB(T1d, T1g), ovs, &(xo[0])); ST(&(xo[22]), VADD(T1g, T1d), ovs, &(xo[2])); } } } } END_SIMD(); }
/*
 * n1bv_11: eleven-input SIMD kernel without twiddle factors.
 *
 * Each loop pass loads eleven vectors from the ii array at xi[0] and
 * xi[WS(is, 1)] .. xi[WS(is, 10)], paired as (1,10), (2,9), (4,7),
 * (5,6) and (3,8).  Output 0 is the plain sum of all inputs; the other
 * outputs are produced in pairs (k, 11 - k): a chain of multiply-adds
 * over the pair sums plus input 0, added to / subtracted from a VBYI'd
 * chain over the pair differences.  Results are stored to the io array
 * at xo[0] and xo[WS(os, 1)] .. xo[WS(os, 10)].  The counter starts at
 * v and drops by VL per pass while xi and xo advance by VL * ivs and
 * VL * ovs.
 *
 * ri and ro are not referenced in the body.
 * NOTE(review): name and constants suggest a generated size-11 backward
 * DFT codelet; the last LD/ST argument looks like an alignment
 * reference (element 0 for even indices, element 1 for odd ones) --
 * confirm against the SIMD macro definitions.
 */
static void n1bv_11(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs) { DVK(KP959492973, +0.959492973614497389890368057066327699062454848); DVK(KP654860733, +0.654860733945285064056925072466293553183791199); DVK(KP142314838, +0.142314838273285140443792668616369668791051361); DVK(KP415415013, +0.415415013001886425529274149229623203524004910); DVK(KP841253532, +0.841253532831181168861811648919367717513292498); DVK(KP540640817, +0.540640817455597582107635954318691695431770608); DVK(KP909631995, +0.909631995354518371411715383079028460060241051); DVK(KP989821441, +0.989821441880932732376092037776718787376519372); DVK(KP755749574, +0.755749574354258283774035843972344420179717445); DVK(KP281732556, +0.281732556841429697711417915346616899035777899); int i; const R *xi; R *xo; xi = ii; xo = io; BEGIN_SIMD(); for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) { V Th, T3, Tm, Tf, Ti, Tc, Tj, T9, Tk, T6, Tl, Ta, Tb, Ts, Tt; Th = LD(&(xi[0]), ivs, &(xi[0])); { V T1, T2, Td, Te; T1 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); T2 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0])); T3 = VSUB(T1, T2); Tm = VADD(T1, T2); Td = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); Te = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); Tf = VSUB(Td, Te); Ti = VADD(Td, Te); } Ta = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); Tb = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); Tc = VSUB(Ta, Tb); Tj = VADD(Ta, Tb); { V T7, T8, T4, T5; T7 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); T9 = VSUB(T7, T8); Tk = VADD(T7, T8); T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); T5 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); T6 = VSUB(T4, T5); Tl = VADD(T4, T5); } ST(&(xo[0]), VADD(Th, VADD(Tm, VADD(Ti, VADD(Tl, VADD(Tj, Tk))))), ovs, &(xo[0])); { V Tg, Tn, Tu, Tv; Tg = VBYI(VFMA(LDK(KP281732556), T3, VFMA(LDK(KP755749574), T6, VFNMS(LDK(KP909631995), Tc, VFNMS(LDK(KP540640817), Tf, VMUL(LDK(KP989821441), T9)))))); Tn = VFMA(LDK(KP841253532), 
Ti, VFMA(LDK(KP415415013), Tj, VFNMS(LDK(KP142314838), Tk, VFNMS(LDK(KP654860733), Tl, VFNMS(LDK(KP959492973), Tm, Th))))); ST(&(xo[WS(os, 5)]), VADD(Tg, Tn), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 6)]), VSUB(Tn, Tg), ovs, &(xo[0])); Tu = VBYI(VFMA(LDK(KP755749574), T3, VFMA(LDK(KP540640817), T6, VFNMS(LDK(KP909631995), T9, VFNMS(LDK(KP989821441), Tf, VMUL(LDK(KP281732556), Tc)))))); Tv = VFMA(LDK(KP841253532), Tl, VFMA(LDK(KP415415013), Tk, VFNMS(LDK(KP959492973), Tj, VFNMS(LDK(KP142314838), Ti, VFNMS(LDK(KP654860733), Tm, Th))))); ST(&(xo[WS(os, 4)]), VADD(Tu, Tv), ovs, &(xo[0])); ST(&(xo[WS(os, 7)]), VSUB(Tv, Tu), ovs, &(xo[WS(os, 1)])); } Ts = VBYI(VFMA(LDK(KP909631995), T3, VFNMS(LDK(KP540640817), T9, VFNMS(LDK(KP989821441), Tc, VFNMS(LDK(KP281732556), T6, VMUL(LDK(KP755749574), Tf)))))); Tt = VFMA(LDK(KP415415013), Tm, VFMA(LDK(KP841253532), Tk, VFNMS(LDK(KP142314838), Tj, VFNMS(LDK(KP959492973), Tl, VFNMS(LDK(KP654860733), Ti, Th))))); ST(&(xo[WS(os, 2)]), VADD(Ts, Tt), ovs, &(xo[0])); ST(&(xo[WS(os, 9)]), VSUB(Tt, Ts), ovs, &(xo[WS(os, 1)])); { V Tq, Tr, To, Tp; Tq = VBYI(VFMA(LDK(KP540640817), T3, VFMA(LDK(KP909631995), Tf, VFMA(LDK(KP989821441), T6, VFMA(LDK(KP755749574), Tc, VMUL(LDK(KP281732556), T9)))))); Tr = VFMA(LDK(KP841253532), Tm, VFMA(LDK(KP415415013), Ti, VFNMS(LDK(KP959492973), Tk, VFNMS(LDK(KP654860733), Tj, VFNMS(LDK(KP142314838), Tl, Th))))); ST(&(xo[WS(os, 1)]), VADD(Tq, Tr), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 10)]), VSUB(Tr, Tq), ovs, &(xo[0])); To = VBYI(VFMA(LDK(KP989821441), T3, VFMA(LDK(KP540640817), Tc, VFNMS(LDK(KP909631995), T6, VFNMS(LDK(KP281732556), Tf, VMUL(LDK(KP755749574), T9)))))); Tp = VFMA(LDK(KP415415013), Tl, VFMA(LDK(KP841253532), Tj, VFNMS(LDK(KP654860733), Tk, VFNMS(LDK(KP959492973), Ti, VFNMS(LDK(KP142314838), Tm, Th))))); ST(&(xo[WS(os, 3)]), VADD(To, Tp), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 8)]), VSUB(Tp, To), ovs, &(xo[0])); } } END_SIMD(); }
static void n2fv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs) { DVK(KP500000000, +0.500000000000000000000000000000000000000000000); DVK(KP866025403, +0.866025403784438646763723170752936183471402627); int i; const R *xi; R *xo; xi = ri; xo = ro; BEGIN_SIMD(); for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) { V T5, Ta, TJ, Ty, Tq, Tp, Tg, Tl, TI, TA, Tz, Tu; { V T1, T6, T4, Tw, T9, Tx; T1 = LD(&(xi[0]), ivs, &(xi[0])); T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); { V T2, T3, T7, T8; T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); T4 = VADD(T2, T3); Tw = VSUB(T3, T2); T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0])); T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); T9 = VADD(T7, T8); Tx = VSUB(T8, T7); } T5 = VADD(T1, T4); Ta = VADD(T6, T9); TJ = VADD(Tw, Tx); Ty = VMUL(LDK(KP866025403), VSUB(Tw, Tx)); Tq = VFNMS(LDK(KP500000000), T9, T6); Tp = VFNMS(LDK(KP500000000), T4, T1); } { V Tc, Th, Tf, Ts, Tk, Tt; Tc = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); Th = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); { V Td, Te, Ti, Tj; Td = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); Te = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)])); Tf = VADD(Td, Te); Ts = VSUB(Te, Td); Ti = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); Tj = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); Tk = VADD(Ti, Tj); Tt = VSUB(Tj, Ti); } Tg = VADD(Tc, Tf); Tl = VADD(Th, Tk); TI = VADD(Ts, Tt); TA = VFNMS(LDK(KP500000000), Tk, Th); Tz = VFNMS(LDK(KP500000000), Tf, Tc); Tu = VMUL(LDK(KP866025403), VSUB(Ts, Tt)); } { V Tb, Tm, Tn, To; Tb = VSUB(T5, Ta); Tm = VBYI(VSUB(Tg, Tl)); ST(&(xo[18]), VSUB(Tb, Tm), ovs, &(xo[2])); ST(&(xo[6]), VADD(Tb, Tm), ovs, &(xo[2])); Tn = VADD(T5, Ta); To = VADD(Tg, Tl); ST(&(xo[12]), VSUB(Tn, To), ovs, &(xo[0])); ST(&(xo[0]), VADD(Tn, To), ovs, &(xo[0])); } { V Tv, TE, TC, TD, Tr, TB; Tr = VSUB(Tp, Tq); Tv = VSUB(Tr, Tu); TE = VADD(Tr, Tu); TB = VSUB(Tz, TA); TC = VBYI(VADD(Ty, TB)); TD = 
VBYI(VSUB(Ty, TB)); ST(&(xo[10]), VSUB(Tv, TC), ovs, &(xo[2])); ST(&(xo[22]), VSUB(TE, TD), ovs, &(xo[2])); ST(&(xo[14]), VADD(TC, Tv), ovs, &(xo[2])); ST(&(xo[2]), VADD(TD, TE), ovs, &(xo[2])); } { V TK, TM, TH, TL, TF, TG; TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ))); TM = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI))); TF = VADD(Tp, Tq); TG = VADD(Tz, TA); TH = VSUB(TF, TG); TL = VADD(TF, TG); ST(&(xo[20]), VSUB(TH, TK), ovs, &(xo[0])); ST(&(xo[8]), VADD(TL, TM), ovs, &(xo[0])); ST(&(xo[4]), VADD(TH, TK), ovs, &(xo[0])); ST(&(xo[16]), VSUB(TL, TM), ovs, &(xo[0])); } } END_SIMD(); }
/*
 * t1bv_32 -- in-place 32-point vector butterfly with twiddle multiplication.
 * NOTE(review): the "b" presumably marks the backward (inverse) direction
 * and "t1" a twiddle (decimation-in-time) kernel; confirm against the
 * generator before relying on it.
 *
 * Parameters
 *   ri        accepted for signature compatibility; never referenced here.
 *   ii        base of the data, transformed in place (x = ii).
 *   W         twiddle table; consumed TWVL * 62 entries per pass.
 *   ios       stride passed to WS() to reach element k (k = 0..31).
 *   m         loop count; each pass retires VL of them.
 *   dist      per-pass advance of the data pointer (times VL).
 * Returns the twiddle pointer as advanced by the loop, i.e. just past the
 * last entries used.
 *
 * One pass of the loop:
 *   - loads all 32 elements; element 0 is used as is, every other element
 *     k is first passed through BYTW() with the table entry
 *     W[TWVL * 2 * (k - 1)];
 *   - combines them through a tree of add/subtract butterflies scaled by
 *     the KP* constants, which are numerically sqrt(1/2), cos/sin(pi/8),
 *     cos/sin(pi/16) and cos/sin(3*pi/16);
 *   - stores the 32 results back to the same 32 positions.
 * Because input and output share storage, every LD() must stay ahead of
 * the first ST(); the statement order below guarantees that.
 *
 * The last argument of LD()/ST() is element 0 for even indices and element
 * 1 for odd ones -- presumably an alignment hint for the SIMD macros
 * (verify).
 */
static const R *t1bv_32(R *ri, R *ii, const R *W, stride ios, int m, int dist) { DVK(KP195090322, +0.195090322016128267848284868477022240927691618); DVK(KP980785280, +0.980785280403230449126182236134239036973933731); DVK(KP555570233, +0.555570233019602224742830813948532874374937191); DVK(KP831469612, +0.831469612302545237078788377617905756738560812); DVK(KP382683432, +0.382683432365089771728459984030398866761344562); DVK(KP923879532, +0.923879532511286756128183189396788286822416626); DVK(KP707106781, +0.707106781186547524400844362104849039284835938); int i; R *x; x = ii; BEGIN_SIMD(); for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 62)) { V T4, T1D, T2P, T3h, Tf, T1y, T2K, T3i, TC, T1w, T2G, T3e, Tr, T1v, T2D; V T3d, T1k, T20, T2y, T3a, T1r, T21, T2v, T39, TV, T1X, T2r, T37, T12, T1Y; V T2o, T36; { V T1, T1C, T3, T1A, T1B, T2, T1z, T2N, T2O; T1 = LD(&(x[0]), dist, &(x[0])); T1B = LD(&(x[WS(ios, 24)]), dist, &(x[0])); T1C = BYTW(&(W[TWVL * 46]), T1B); T2 = LD(&(x[WS(ios, 16)]), dist, &(x[0])); T3 = BYTW(&(W[TWVL * 30]), T2); T1z = LD(&(x[WS(ios, 8)]), dist, &(x[0])); T1A = BYTW(&(W[TWVL * 14]), T1z); T4 = VSUB(T1, T3); T1D = VSUB(T1A, T1C); T2N = VADD(T1, T3); T2O = VADD(T1A, T1C); T2P = VSUB(T2N, T2O); T3h = VADD(T2N, T2O); } { V T6, Td, T8, Tb; { V T5, Tc, T7, Ta; T5 = LD(&(x[WS(ios, 4)]), dist, &(x[0])); T6 = BYTW(&(W[TWVL * 6]), T5); Tc = LD(&(x[WS(ios, 12)]), dist, &(x[0])); Td = BYTW(&(W[TWVL * 22]), Tc); T7 = LD(&(x[WS(ios, 20)]), dist, &(x[0])); T8 = BYTW(&(W[TWVL * 38]), T7); Ta = LD(&(x[WS(ios, 28)]), dist, &(x[0])); Tb = BYTW(&(W[TWVL * 54]), Ta); } { V T9, Te, T2I, T2J; T9 = VSUB(T6, T8); Te = VSUB(Tb, Td); Tf = VMUL(LDK(KP707106781), VADD(T9, Te)); T1y = VMUL(LDK(KP707106781), VSUB(T9, Te)); T2I = VADD(T6, T8); T2J = VADD(Tb, Td); T2K = VSUB(T2I, T2J); T3i = VADD(T2I, T2J); } } { V Tt, TA, Tv, Ty; { V Ts, Tz, Tu, Tx; Ts = LD(&(x[WS(ios, 6)]), dist, &(x[0])); Tt = BYTW(&(W[TWVL * 10]), Ts); Tz = LD(&(x[WS(ios, 14)]), dist, &(x[0])); 
TA = BYTW(&(W[TWVL * 26]), Tz); Tu = LD(&(x[WS(ios, 22)]), dist, &(x[0])); Tv = BYTW(&(W[TWVL * 42]), Tu); Tx = LD(&(x[WS(ios, 30)]), dist, &(x[0])); Ty = BYTW(&(W[TWVL * 58]), Tx); } { V Tw, TB, T2E, T2F; Tw = VSUB(Tt, Tv); TB = VSUB(Ty, TA); TC = VFNMS(LDK(KP382683432), TB, VMUL(LDK(KP923879532), Tw)); T1w = VFMA(LDK(KP923879532), TB, VMUL(LDK(KP382683432), Tw)); T2E = VADD(Ty, TA); T2F = VADD(Tt, Tv); T2G = VSUB(T2E, T2F); T3e = VADD(T2E, T2F); } } { V Ti, Tp, Tk, Tn; { V Th, To, Tj, Tm; Th = LD(&(x[WS(ios, 2)]), dist, &(x[0])); Ti = BYTW(&(W[TWVL * 2]), Th); To = LD(&(x[WS(ios, 26)]), dist, &(x[0])); Tp = BYTW(&(W[TWVL * 50]), To); Tj = LD(&(x[WS(ios, 18)]), dist, &(x[0])); Tk = BYTW(&(W[TWVL * 34]), Tj); Tm = LD(&(x[WS(ios, 10)]), dist, &(x[0])); Tn = BYTW(&(W[TWVL * 18]), Tm); } { V Tl, Tq, T2B, T2C; Tl = VSUB(Ti, Tk); Tq = VSUB(Tn, Tp); Tr = VFMA(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tq)); T1v = VFNMS(LDK(KP382683432), Tq, VMUL(LDK(KP923879532), Tl)); T2B = VADD(Ti, Tk); T2C = VADD(Tn, Tp); T2D = VSUB(T2B, T2C); T3d = VADD(T2B, T2C); } } { V T1g, T1i, T1o, T1m, T1a, T1c, T1d, T15, T17, T18; { V T1f, T1h, T1n, T1l; T1f = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)])); T1g = BYTW(&(W[TWVL * 12]), T1f); T1h = LD(&(x[WS(ios, 23)]), dist, &(x[WS(ios, 1)])); T1i = BYTW(&(W[TWVL * 44]), T1h); T1n = LD(&(x[WS(ios, 15)]), dist, &(x[WS(ios, 1)])); T1o = BYTW(&(W[TWVL * 28]), T1n); T1l = LD(&(x[WS(ios, 31)]), dist, &(x[WS(ios, 1)])); T1m = BYTW(&(W[TWVL * 60]), T1l); { V T19, T1b, T14, T16; T19 = LD(&(x[WS(ios, 27)]), dist, &(x[WS(ios, 1)])); T1a = BYTW(&(W[TWVL * 52]), T19); T1b = LD(&(x[WS(ios, 11)]), dist, &(x[WS(ios, 1)])); T1c = BYTW(&(W[TWVL * 20]), T1b); T1d = VSUB(T1a, T1c); T14 = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)])); T15 = BYTW(&(W[TWVL * 4]), T14); T16 = LD(&(x[WS(ios, 19)]), dist, &(x[WS(ios, 1)])); T17 = BYTW(&(W[TWVL * 36]), T16); T18 = VSUB(T15, T17); } } { V T1e, T1j, T2w, T2x; T1e = VMUL(LDK(KP707106781), VSUB(T18, T1d)); T1j = VSUB(T1g, 
T1i); T1k = VSUB(T1e, T1j); T20 = VADD(T1j, T1e); T2w = VADD(T15, T17); T2x = VADD(T1a, T1c); T2y = VSUB(T2w, T2x); T3a = VADD(T2w, T2x); } { V T1p, T1q, T2t, T2u; T1p = VSUB(T1m, T1o); T1q = VMUL(LDK(KP707106781), VADD(T18, T1d)); T1r = VSUB(T1p, T1q); T21 = VADD(T1p, T1q); T2t = VADD(T1m, T1o); T2u = VADD(T1g, T1i); T2v = VSUB(T2t, T2u); T39 = VADD(T2t, T2u); } } { V TR, TT, TZ, TX, TL, TN, TO, TG, TI, TJ; { V TQ, TS, TY, TW; TQ = LD(&(x[WS(ios, 9)]), dist, &(x[WS(ios, 1)])); TR = BYTW(&(W[TWVL * 16]), TQ); TS = LD(&(x[WS(ios, 25)]), dist, &(x[WS(ios, 1)])); TT = BYTW(&(W[TWVL * 48]), TS); TY = LD(&(x[WS(ios, 17)]), dist, &(x[WS(ios, 1)])); TZ = BYTW(&(W[TWVL * 32]), TY); TW = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)])); TX = BYTW(&(W[0]), TW); { V TK, TM, TF, TH; TK = LD(&(x[WS(ios, 29)]), dist, &(x[WS(ios, 1)])); TL = BYTW(&(W[TWVL * 56]), TK); TM = LD(&(x[WS(ios, 13)]), dist, &(x[WS(ios, 1)])); TN = BYTW(&(W[TWVL * 24]), TM); TO = VSUB(TL, TN); TF = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)])); TG = BYTW(&(W[TWVL * 8]), TF); TH = LD(&(x[WS(ios, 21)]), dist, &(x[WS(ios, 1)])); TI = BYTW(&(W[TWVL * 40]), TH); TJ = VSUB(TG, TI); } } { V TP, TU, T2p, T2q; TP = VMUL(LDK(KP707106781), VSUB(TJ, TO)); TU = VSUB(TR, TT); TV = VSUB(TP, TU); T1X = VADD(TU, TP); T2p = VADD(TG, TI); T2q = VADD(TL, TN); T2r = VSUB(T2p, T2q); T37 = VADD(T2p, T2q); } { V T10, T11, T2m, T2n; T10 = VSUB(TX, TZ); T11 = VMUL(LDK(KP707106781), VADD(TJ, TO)); T12 = VSUB(T10, T11); T1Y = VADD(T10, T11); T2m = VADD(TX, TZ); T2n = VADD(TR, TT); T2o = VSUB(T2m, T2n); T36 = VADD(T2m, T2n); } } { V T3q, T3u, T3t, T3v; { V T3o, T3p, T3r, T3s; T3o = VADD(T3h, T3i); T3p = VADD(T3d, T3e); T3q = VSUB(T3o, T3p); T3u = VADD(T3o, T3p); T3r = VADD(T36, T37); T3s = VADD(T39, T3a); T3t = VBYI(VSUB(T3r, T3s)); T3v = VADD(T3r, T3s); } ST(&(x[WS(ios, 24)]), VSUB(T3q, T3t), dist, &(x[0])); ST(&(x[0]), VADD(T3u, T3v), dist, &(x[0])); ST(&(x[WS(ios, 8)]), VADD(T3q, T3t), dist, &(x[0])); ST(&(x[WS(ios, 16)]), VSUB(T3u, 
T3v), dist, &(x[0])); } { V T3f, T3j, T3c, T3k, T38, T3b; T3f = VSUB(T3d, T3e); T3j = VSUB(T3h, T3i); T38 = VSUB(T36, T37); T3b = VSUB(T39, T3a); T3c = VMUL(LDK(KP707106781), VSUB(T38, T3b)); T3k = VMUL(LDK(KP707106781), VADD(T38, T3b)); { V T3g, T3l, T3m, T3n; T3g = VBYI(VSUB(T3c, T3f)); T3l = VSUB(T3j, T3k); ST(&(x[WS(ios, 12)]), VADD(T3g, T3l), dist, &(x[0])); ST(&(x[WS(ios, 20)]), VSUB(T3l, T3g), dist, &(x[0])); T3m = VBYI(VADD(T3f, T3c)); T3n = VADD(T3j, T3k); ST(&(x[WS(ios, 4)]), VADD(T3m, T3n), dist, &(x[0])); ST(&(x[WS(ios, 28)]), VSUB(T3n, T3m), dist, &(x[0])); } } { V T2L, T31, T2R, T2Y, T2A, T2Z, T2U, T32, T2H, T2Q; T2H = VMUL(LDK(KP707106781), VSUB(T2D, T2G)); T2L = VSUB(T2H, T2K); T31 = VADD(T2K, T2H); T2Q = VMUL(LDK(KP707106781), VADD(T2D, T2G)); T2R = VSUB(T2P, T2Q); T2Y = VADD(T2P, T2Q); { V T2s, T2z, T2S, T2T; T2s = VFNMS(LDK(KP382683432), T2r, VMUL(LDK(KP923879532), T2o)); T2z = VFMA(LDK(KP923879532), T2v, VMUL(LDK(KP382683432), T2y)); T2A = VSUB(T2s, T2z); T2Z = VADD(T2s, T2z); T2S = VFMA(LDK(KP382683432), T2o, VMUL(LDK(KP923879532), T2r)); T2T = VFNMS(LDK(KP382683432), T2v, VMUL(LDK(KP923879532), T2y)); T2U = VSUB(T2S, T2T); T32 = VADD(T2S, T2T); } { V T2M, T2V, T34, T35; T2M = VBYI(VSUB(T2A, T2L)); T2V = VSUB(T2R, T2U); ST(&(x[WS(ios, 10)]), VADD(T2M, T2V), dist, &(x[0])); ST(&(x[WS(ios, 22)]), VSUB(T2V, T2M), dist, &(x[0])); T34 = VSUB(T2Y, T2Z); T35 = VBYI(VSUB(T32, T31)); ST(&(x[WS(ios, 18)]), VSUB(T34, T35), dist, &(x[0])); ST(&(x[WS(ios, 14)]), VADD(T34, T35), dist, &(x[0])); } { V T2W, T2X, T30, T33; T2W = VBYI(VADD(T2L, T2A)); T2X = VADD(T2R, T2U); ST(&(x[WS(ios, 6)]), VADD(T2W, T2X), dist, &(x[0])); ST(&(x[WS(ios, 26)]), VSUB(T2X, T2W), dist, &(x[0])); T30 = VADD(T2Y, T2Z); T33 = VBYI(VADD(T31, T32)); ST(&(x[WS(ios, 30)]), VSUB(T30, T33), dist, &(x[0])); ST(&(x[WS(ios, 2)]), VADD(T30, T33), dist, &(x[0])); } } { V TE, T1P, T1I, T1Q, T1t, T1M, T1F, T1N; { V Tg, TD, T1G, T1H; Tg = VSUB(T4, Tf); TD = VSUB(Tr, TC); TE = VSUB(Tg, TD); T1P = 
VADD(Tg, TD); T1G = VFNMS(LDK(KP555570233), TV, VMUL(LDK(KP831469612), T12)); T1H = VFMA(LDK(KP555570233), T1k, VMUL(LDK(KP831469612), T1r)); T1I = VSUB(T1G, T1H); T1Q = VADD(T1G, T1H); } { V T13, T1s, T1x, T1E; T13 = VFMA(LDK(KP831469612), TV, VMUL(LDK(KP555570233), T12)); T1s = VFNMS(LDK(KP555570233), T1r, VMUL(LDK(KP831469612), T1k)); T1t = VSUB(T13, T1s); T1M = VADD(T13, T1s); T1x = VSUB(T1v, T1w); T1E = VSUB(T1y, T1D); T1F = VSUB(T1x, T1E); T1N = VADD(T1E, T1x); } { V T1u, T1J, T1S, T1T; T1u = VADD(TE, T1t); T1J = VBYI(VADD(T1F, T1I)); ST(&(x[WS(ios, 27)]), VSUB(T1u, T1J), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 5)]), VADD(T1u, T1J), dist, &(x[WS(ios, 1)])); T1S = VBYI(VADD(T1N, T1M)); T1T = VADD(T1P, T1Q); ST(&(x[WS(ios, 3)]), VADD(T1S, T1T), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 29)]), VSUB(T1T, T1S), dist, &(x[WS(ios, 1)])); } { V T1K, T1L, T1O, T1R; T1K = VSUB(TE, T1t); T1L = VBYI(VSUB(T1I, T1F)); ST(&(x[WS(ios, 21)]), VSUB(T1K, T1L), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 11)]), VADD(T1K, T1L), dist, &(x[WS(ios, 1)])); T1O = VBYI(VSUB(T1M, T1N)); T1R = VSUB(T1P, T1Q); ST(&(x[WS(ios, 13)]), VADD(T1O, T1R), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 19)]), VSUB(T1R, T1O), dist, &(x[WS(ios, 1)])); } } { V T1W, T2h, T2a, T2i, T23, T2e, T27, T2f; { V T1U, T1V, T28, T29; T1U = VADD(T4, Tf); T1V = VADD(T1v, T1w); T1W = VSUB(T1U, T1V); T2h = VADD(T1U, T1V); T28 = VFNMS(LDK(KP195090322), T1X, VMUL(LDK(KP980785280), T1Y)); T29 = VFMA(LDK(KP195090322), T20, VMUL(LDK(KP980785280), T21)); T2a = VSUB(T28, T29); T2i = VADD(T28, T29); } { V T1Z, T22, T25, T26; T1Z = VFMA(LDK(KP980785280), T1X, VMUL(LDK(KP195090322), T1Y)); T22 = VFNMS(LDK(KP195090322), T21, VMUL(LDK(KP980785280), T20)); T23 = VSUB(T1Z, T22); T2e = VADD(T1Z, T22); T25 = VADD(Tr, TC); T26 = VADD(T1D, T1y); T27 = VSUB(T25, T26); T2f = VADD(T26, T25); } { V T24, T2b, T2k, T2l; T24 = VADD(T1W, T23); T2b = VBYI(VADD(T27, T2a)); ST(&(x[WS(ios, 25)]), VSUB(T24, T2b), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 7)]), 
VADD(T24, T2b), dist, &(x[WS(ios, 1)])); T2k = VBYI(VADD(T2f, T2e)); T2l = VADD(T2h, T2i); ST(&(x[WS(ios, 1)]), VADD(T2k, T2l), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 31)]), VSUB(T2l, T2k), dist, &(x[WS(ios, 1)])); } { V T2c, T2d, T2g, T2j; T2c = VSUB(T1W, T23); T2d = VBYI(VSUB(T2a, T27)); ST(&(x[WS(ios, 23)]), VSUB(T2c, T2d), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 9)]), VADD(T2c, T2d), dist, &(x[WS(ios, 1)])); T2g = VBYI(VSUB(T2e, T2f)); T2j = VSUB(T2h, T2i); ST(&(x[WS(ios, 15)]), VADD(T2g, T2j), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 17)]), VSUB(T2j, T2g), dist, &(x[WS(ios, 1)])); } } } END_SIMD(); return W; }
static void n1fv_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs) { DVK(KP923879532, +0.923879532511286756128183189396788286822416626); DVK(KP382683432, +0.382683432365089771728459984030398866761344562); DVK(KP707106781, +0.707106781186547524400844362104849039284835938); int i; const R *xi; R *xo; xi = ri; xo = ro; BEGIN_SIMD(); for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) { V Tp, T13, Tu, TN, Tm, T14, Tv, TY, T7, T17, Ty, TT, Te, T16, Tx; V TQ; { V Tn, To, TM, Ts, Tt, TL; Tn = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); To = LD(&(xi[WS(is, 12)]), ivs, &(xi[0])); TM = VADD(Tn, To); Ts = LD(&(xi[0]), ivs, &(xi[0])); Tt = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); TL = VADD(Ts, Tt); Tp = VSUB(Tn, To); T13 = VADD(TL, TM); Tu = VSUB(Ts, Tt); TN = VSUB(TL, TM); } { V Ti, TW, Tl, TX; { V Tg, Th, Tj, Tk; Tg = LD(&(xi[WS(is, 14)]), ivs, &(xi[0])); Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); Ti = VSUB(Tg, Th); TW = VADD(Tg, Th); Tj = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); Tk = LD(&(xi[WS(is, 10)]), ivs, &(xi[0])); Tl = VSUB(Tj, Tk); TX = VADD(Tj, Tk); } Tm = VMUL(LDK(KP707106781), VSUB(Ti, Tl)); T14 = VADD(TX, TW); Tv = VMUL(LDK(KP707106781), VADD(Tl, Ti)); TY = VSUB(TW, TX); } { V T3, TR, T6, TS; { V T1, T2, T4, T5; T1 = LD(&(xi[WS(is, 15)]), ivs, &(xi[WS(is, 1)])); T2 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); T3 = VSUB(T1, T2); TR = VADD(T1, T2); T4 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); T5 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)])); T6 = VSUB(T4, T5); TS = VADD(T4, T5); } T7 = VFNMS(LDK(KP923879532), T6, VMUL(LDK(KP382683432), T3)); T17 = VADD(TR, TS); Ty = VFMA(LDK(KP923879532), T3, VMUL(LDK(KP382683432), T6)); TT = VSUB(TR, TS); } { V Ta, TO, Td, TP; { V T8, T9, Tb, Tc; T8 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); T9 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); Ta = VSUB(T8, T9); TO = VADD(T8, T9); Tb = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); Tc = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 
1)])); Td = VSUB(Tb, Tc); TP = VADD(Tb, Tc); } Te = VFMA(LDK(KP382683432), Ta, VMUL(LDK(KP923879532), Td)); T16 = VADD(TO, TP); Tx = VFNMS(LDK(KP382683432), Td, VMUL(LDK(KP923879532), Ta)); TQ = VSUB(TO, TP); } { V T15, T18, T19, T1a; T15 = VADD(T13, T14); T18 = VADD(T16, T17); ST(&(xo[WS(os, 8)]), VSUB(T15, T18), ovs, &(xo[0])); ST(&(xo[0]), VADD(T15, T18), ovs, &(xo[0])); T19 = VSUB(T13, T14); T1a = VBYI(VSUB(T17, T16)); ST(&(xo[WS(os, 12)]), VSUB(T19, T1a), ovs, &(xo[0])); ST(&(xo[WS(os, 4)]), VADD(T19, T1a), ovs, &(xo[0])); } { V TV, T11, T10, T12, TU, TZ; TU = VMUL(LDK(KP707106781), VADD(TQ, TT)); TV = VADD(TN, TU); T11 = VSUB(TN, TU); TZ = VMUL(LDK(KP707106781), VSUB(TT, TQ)); T10 = VBYI(VADD(TY, TZ)); T12 = VBYI(VSUB(TZ, TY)); ST(&(xo[WS(os, 14)]), VSUB(TV, T10), ovs, &(xo[0])); ST(&(xo[WS(os, 6)]), VADD(T11, T12), ovs, &(xo[0])); ST(&(xo[WS(os, 2)]), VADD(TV, T10), ovs, &(xo[0])); ST(&(xo[WS(os, 10)]), VSUB(T11, T12), ovs, &(xo[0])); } { V Tr, TB, TA, TC; { V Tf, Tq, Tw, Tz; Tf = VSUB(T7, Te); Tq = VSUB(Tm, Tp); Tr = VBYI(VSUB(Tf, Tq)); TB = VBYI(VADD(Tq, Tf)); Tw = VADD(Tu, Tv); Tz = VADD(Tx, Ty); TA = VSUB(Tw, Tz); TC = VADD(Tw, Tz); } ST(&(xo[WS(os, 7)]), VADD(Tr, TA), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 15)]), VSUB(TC, TB), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 9)]), VSUB(TA, Tr), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 1)]), VADD(TB, TC), ovs, &(xo[WS(os, 1)])); } { V TF, TJ, TI, TK; { V TD, TE, TG, TH; TD = VSUB(Tu, Tv); TE = VADD(Te, T7); TF = VADD(TD, TE); TJ = VSUB(TD, TE); TG = VADD(Tp, Tm); TH = VSUB(Ty, Tx); TI = VBYI(VADD(TG, TH)); TK = VBYI(VSUB(TH, TG)); } ST(&(xo[WS(os, 13)]), VSUB(TF, TI), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 5)]), VADD(TJ, TK), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 3)]), VADD(TF, TI), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 11)]), VSUB(TJ, TK), ovs, &(xo[WS(os, 1)])); } } END_SIMD(); }
static void n1bv_12(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs) { DVK(KP866025403, +0.866025403784438646763723170752936183471402627); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); int i; const R *xi; R *xo; xi = ii; xo = io; BEGIN_SIMD(); for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) { V T5, Ta, TG, TF, Ty, Tm, Ti, Tp, TJ, TI, Tx, Ts; { V T1, T6, T4, Tk, T9, Tl; T1 = LD(&(xi[0]), ivs, &(xi[0])); T6 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); { V T2, T3, T7, T8; T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); T3 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); T4 = VADD(T2, T3); Tk = VSUB(T2, T3); T7 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0])); T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); T9 = VADD(T7, T8); Tl = VSUB(T7, T8); } T5 = VFNMS(LDK(KP500000000), T4, T1); Ta = VFNMS(LDK(KP500000000), T9, T6); TG = VADD(T6, T9); TF = VADD(T1, T4); Ty = VADD(Tk, Tl); Tm = VMUL(LDK(KP866025403), VSUB(Tk, Tl)); } { V Tn, Tq, Te, To, Th, Tr; Tn = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); Tq = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); { V Tc, Td, Tf, Tg; Tc = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); Td = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)])); Te = VSUB(Tc, Td); To = VADD(Tc, Td); Tf = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); Tg = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); Th = VSUB(Tf, Tg); Tr = VADD(Tf, Tg); } Ti = VMUL(LDK(KP866025403), VSUB(Te, Th)); Tp = VFNMS(LDK(KP500000000), To, Tn); TJ = VADD(Tq, Tr); TI = VADD(Tn, To); Tx = VADD(Te, Th); Ts = VFNMS(LDK(KP500000000), Tr, Tq); } { V TH, TK, TL, TM; TH = VSUB(TF, TG); TK = VBYI(VSUB(TI, TJ)); ST(&(xo[WS(os, 3)]), VSUB(TH, TK), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 9)]), VADD(TH, TK), ovs, &(xo[WS(os, 1)])); TL = VADD(TF, TG); TM = VADD(TI, TJ); ST(&(xo[WS(os, 6)]), VSUB(TL, TM), ovs, &(xo[0])); ST(&(xo[0]), VADD(TL, TM), ovs, &(xo[0])); } { V Tj, Tv, Tu, Tw, Tb, Tt; Tb = VSUB(T5, Ta); Tj = VSUB(Tb, Ti); Tv = VADD(Tb, Ti); Tt = VSUB(Tp, 
Ts); Tu = VBYI(VADD(Tm, Tt)); Tw = VBYI(VSUB(Tt, Tm)); ST(&(xo[WS(os, 11)]), VSUB(Tj, Tu), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 5)]), VADD(Tv, Tw), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 1)]), VADD(Tj, Tu), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 7)]), VSUB(Tv, Tw), ovs, &(xo[WS(os, 1)])); } { V Tz, TD, TC, TE, TA, TB; Tz = VBYI(VMUL(LDK(KP866025403), VSUB(Tx, Ty))); TD = VBYI(VMUL(LDK(KP866025403), VADD(Ty, Tx))); TA = VADD(T5, Ta); TB = VADD(Tp, Ts); TC = VSUB(TA, TB); TE = VADD(TA, TB); ST(&(xo[WS(os, 2)]), VADD(Tz, TC), ovs, &(xo[0])); ST(&(xo[WS(os, 8)]), VSUB(TE, TD), ovs, &(xo[0])); ST(&(xo[WS(os, 10)]), VSUB(TC, Tz), ovs, &(xo[0])); ST(&(xo[WS(os, 4)]), VADD(TD, TE), ovs, &(xo[0])); } } END_SIMD(); }
/*
 * t1bv_12 -- in-place 12-point vector butterfly with twiddle multiplication.
 * NOTE(review): the "b" presumably marks the backward (inverse) direction
 * and "t1" a twiddle (decimation-in-time) kernel; confirm against the
 * generator before relying on it.
 *
 * Parameters
 *   ri        accepted for signature compatibility; never referenced here.
 *   ii        base of the data, transformed in place (x = ii).
 *   W         twiddle table; consumed TWVL * 22 entries per pass.
 *   ios       stride passed to WS() to reach element k (k = 0..11).
 *   m         loop count; each pass retires VL of them.
 *   dist      per-pass advance of the data pointer (times VL).
 * Returns the twiddle pointer as advanced by the loop.
 *
 * One pass of the loop:
 *   - loads the 12 elements; element 0 is used as is, every other element
 *     k is first passed through BYTW() with W[TWVL * 2 * (k - 1)];
 *   - reduces them in four groups of three -- (0; 4,8), (9; 1,5),
 *     (6; 10,2), (3; 7,11) -- each giving a pair sum, a pair difference
 *     and "lead minus half the pair sum";
 *   - combines the groups into the 12 results and stores them back to the
 *     same 12 positions.
 * Because input and output share storage, every LD() must stay ahead of
 * the first ST(); the statement order below guarantees that.
 */
static const R *t1bv_12(R *ri, R *ii, const R *W, stride ios, int m, int dist) { DVK(KP866025403, +0.866025403784438646763723170752936183471402627); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); int i; R *x; x = ii; BEGIN_SIMD(); for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 22)) { V T1, Tt, T6, T7, TB, Tq, TC, TD, T9, Tu, Te, Tf, Tx, Tl, Ty; V Tz; { V T5, T3, T4, T2; T1 = LD(&(x[0]), dist, &(x[0])); T4 = LD(&(x[WS(ios, 8)]), dist, &(x[0])); T5 = BYTW(&(W[TWVL * 14]), T4); T2 = LD(&(x[WS(ios, 4)]), dist, &(x[0])); T3 = BYTW(&(W[TWVL * 6]), T2); Tt = VSUB(T3, T5); T6 = VADD(T3, T5); T7 = VFNMS(LDK(KP500000000), T6, T1); } { V Tn, Tp, Tm, TA, To; Tm = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)])); Tn = BYTW(&(W[0]), Tm); TA = LD(&(x[WS(ios, 9)]), dist, &(x[WS(ios, 1)])); TB = BYTW(&(W[TWVL * 16]), TA); To = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)])); Tp = BYTW(&(W[TWVL * 8]), To); Tq = VSUB(Tn, Tp); TC = VADD(Tn, Tp); TD = VFNMS(LDK(KP500000000), TC, TB); } { V Td, Tb, T8, Tc, Ta; T8 = LD(&(x[WS(ios, 6)]), dist, &(x[0])); T9 = BYTW(&(W[TWVL * 10]), T8); Tc = LD(&(x[WS(ios, 2)]), dist, &(x[0])); Td = BYTW(&(W[TWVL * 2]), Tc); Ta = LD(&(x[WS(ios, 10)]), dist, &(x[0])); Tb = BYTW(&(W[TWVL * 18]), Ta); Tu = VSUB(Tb, Td); Te = VADD(Tb, Td); Tf = VFNMS(LDK(KP500000000), Te, T9); } { V Ti, Tk, Th, Tw, Tj; Th = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)])); Ti = BYTW(&(W[TWVL * 12]), Th); Tw = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)])); Tx = BYTW(&(W[TWVL * 4]), Tw); Tj = LD(&(x[WS(ios, 11)]), dist, &(x[WS(ios, 1)])); Tk = BYTW(&(W[TWVL * 20]), Tj); Tl = VSUB(Ti, Tk); Ty = VADD(Ti, Tk); Tz = VFNMS(LDK(KP500000000), Ty, Tx); } { V Ts, TG, TF, TH; { V Tg, Tr, Tv, TE; Tg = VSUB(T7, Tf); Tr = VMUL(LDK(KP866025403), VSUB(Tl, Tq)); Ts = VSUB(Tg, Tr); TG = VADD(Tg, Tr); Tv = VMUL(LDK(KP866025403), VSUB(Tt, Tu)); TE = VSUB(Tz, TD); TF = VBYI(VADD(Tv, TE)); TH = VBYI(VSUB(TE, Tv)); } ST(&(x[WS(ios, 11)]), VSUB(Ts, TF), dist, &(x[WS(ios, 
1)])); ST(&(x[WS(ios, 5)]), VADD(TG, TH), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 1)]), VADD(Ts, TF), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 7)]), VSUB(TG, TH), dist, &(x[WS(ios, 1)])); } { V TS, TW, TV, TX; { V TQ, TR, TT, TU; TQ = VADD(T1, T6); TR = VADD(T9, Te); TS = VSUB(TQ, TR); TW = VADD(TQ, TR); TT = VADD(Tx, Ty); TU = VADD(TB, TC); TV = VBYI(VSUB(TT, TU)); TX = VADD(TT, TU); } ST(&(x[WS(ios, 3)]), VSUB(TS, TV), dist, &(x[WS(ios, 1)])); ST(&(x[0]), VADD(TW, TX), dist, &(x[0])); ST(&(x[WS(ios, 9)]), VADD(TS, TV), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 6)]), VSUB(TW, TX), dist, &(x[0])); } { V TK, TO, TN, TP; { V TI, TJ, TL, TM; TI = VADD(Tl, Tq); TJ = VADD(Tt, Tu); TK = VBYI(VMUL(LDK(KP866025403), VSUB(TI, TJ))); TO = VBYI(VMUL(LDK(KP866025403), VADD(TJ, TI))); TL = VADD(T7, Tf); TM = VADD(Tz, TD); TN = VSUB(TL, TM); TP = VADD(TL, TM); } ST(&(x[WS(ios, 2)]), VADD(TK, TN), dist, &(x[0])); ST(&(x[WS(ios, 8)]), VSUB(TP, TO), dist, &(x[0])); ST(&(x[WS(ios, 10)]), VSUB(TN, TK), dist, &(x[0])); ST(&(x[WS(ios, 4)]), VADD(TO, TP), dist, &(x[0])); } } END_SIMD(); return W; }
/*
 * q1fv_4 -- in-place 4x4 vector kernel: four 4-point butterflies whose
 * results are written back transposed and multiplied by twiddle factors.
 * NOTE(review): the "f" presumably marks the forward direction; confirm
 * against the generator before relying on it.
 *
 * Parameters
 *   ri        base of the data, transformed in place (x = ri).
 *   ii        accepted for signature compatibility; never referenced here.
 *   W         twiddle table; consumed TWVL * 6 entries per pass.
 *   is, vs    the two strides of the 4x4 grid; element (r, c) lives at
 *             x[WS(vs, r) + WS(is, c)].
 *   m         loop bound; unlike its siblings this loop counts i upward
 *             from 0 to m in steps of VL.
 *   dist      per-pass advance of the data pointer (times VL).
 * Returns the twiddle pointer as advanced by the loop.
 *
 * One pass of the loop:
 *   - loads all 16 grid elements;
 *   - for every row r forms the 4-point butterfly over c = 0..3 (sum and
 *     difference of elements 0/2, sum and VBYI()-wrapped difference of
 *     elements 1/3);
 *   - writes result k of row r to the transposed position
 *     x[WS(vs, k) + WS(is, r)].  Result 0 is stored as is; results 1, 2
 *     and 3 first go through BYTWJ() with W[0], W[TWVL * 2] and
 *     W[TWVL * 4] respectively.
 * Because the stores land on positions other rows still need to read,
 * every LD() must stay ahead of the first ST(); the statement order below
 * guarantees that.
 */
static const R *q1fv_4(R *ri, R *ii, const R *W, stride is, stride vs, int m, int dist) { int i; R *x; x = ri; BEGIN_SIMD(); for (i = 0; i < m; i = i + VL, x = x + (VL * dist), W = W + (TWVL * 6)) { V T3, T9, TA, TG, TD, TH, T6, Ta, Te, Tk, Tp, Tv, Ts, Tw, Th; V Tl; { V T1, T2, Ty, Tz; T1 = LD(&(x[0]), dist, &(x[0])); T2 = LD(&(x[WS(is, 2)]), dist, &(x[0])); T3 = VSUB(T1, T2); T9 = VADD(T1, T2); Ty = LD(&(x[WS(vs, 3)]), dist, &(x[WS(vs, 3)])); Tz = LD(&(x[WS(vs, 3) + WS(is, 2)]), dist, &(x[WS(vs, 3)])); TA = VSUB(Ty, Tz); TG = VADD(Ty, Tz); } { V TB, TC, T4, T5; TB = LD(&(x[WS(vs, 3) + WS(is, 1)]), dist, &(x[WS(vs, 3) + WS(is, 1)])); TC = LD(&(x[WS(vs, 3) + WS(is, 3)]), dist, &(x[WS(vs, 3) + WS(is, 1)])); TD = VBYI(VSUB(TB, TC)); TH = VADD(TB, TC); T4 = LD(&(x[WS(is, 1)]), dist, &(x[WS(is, 1)])); T5 = LD(&(x[WS(is, 3)]), dist, &(x[WS(is, 1)])); T6 = VBYI(VSUB(T4, T5)); Ta = VADD(T4, T5); } { V Tc, Td, Tn, To; Tc = LD(&(x[WS(vs, 1)]), dist, &(x[WS(vs, 1)])); Td = LD(&(x[WS(vs, 1) + WS(is, 2)]), dist, &(x[WS(vs, 1)])); Te = VSUB(Tc, Td); Tk = VADD(Tc, Td); Tn = LD(&(x[WS(vs, 2)]), dist, &(x[WS(vs, 2)])); To = LD(&(x[WS(vs, 2) + WS(is, 2)]), dist, &(x[WS(vs, 2)])); Tp = VSUB(Tn, To); Tv = VADD(Tn, To); } { V Tq, Tr, Tf, Tg; Tq = LD(&(x[WS(vs, 2) + WS(is, 1)]), dist, &(x[WS(vs, 2) + WS(is, 1)])); Tr = LD(&(x[WS(vs, 2) + WS(is, 3)]), dist, &(x[WS(vs, 2) + WS(is, 1)])); Ts = VBYI(VSUB(Tq, Tr)); Tw = VADD(Tq, Tr); Tf = LD(&(x[WS(vs, 1) + WS(is, 1)]), dist, &(x[WS(vs, 1) + WS(is, 1)])); Tg = LD(&(x[WS(vs, 1) + WS(is, 3)]), dist, &(x[WS(vs, 1) + WS(is, 1)])); Th = VBYI(VSUB(Tf, Tg)); Tl = VADD(Tf, Tg); } ST(&(x[0]), VADD(T9, Ta), dist, &(x[0])); ST(&(x[WS(is, 1)]), VADD(Tk, Tl), dist, &(x[WS(is, 1)])); ST(&(x[WS(is, 2)]), VADD(Tv, Tw), dist, &(x[0])); ST(&(x[WS(is, 3)]), VADD(TG, TH), dist, &(x[WS(is, 1)])); { V T7, Ti, Tt, TE; T7 = BYTWJ(&(W[0]), VSUB(T3, T6)); ST(&(x[WS(vs, 1)]), T7, dist, &(x[WS(vs, 1)])); Ti = BYTWJ(&(W[0]), VSUB(Te, Th)); ST(&(x[WS(vs, 1) + WS(is, 
1)]), Ti, dist, &(x[WS(vs, 1) + WS(is, 1)])); Tt = BYTWJ(&(W[0]), VSUB(Tp, Ts)); ST(&(x[WS(vs, 1) + WS(is, 2)]), Tt, dist, &(x[WS(vs, 1)])); TE = BYTWJ(&(W[0]), VSUB(TA, TD)); ST(&(x[WS(vs, 1) + WS(is, 3)]), TE, dist, &(x[WS(vs, 1) + WS(is, 1)])); } { V T8, Tj, Tu, TF; T8 = BYTWJ(&(W[TWVL * 4]), VADD(T3, T6)); ST(&(x[WS(vs, 3)]), T8, dist, &(x[WS(vs, 3)])); Tj = BYTWJ(&(W[TWVL * 4]), VADD(Te, Th)); ST(&(x[WS(vs, 3) + WS(is, 1)]), Tj, dist, &(x[WS(vs, 3) + WS(is, 1)])); Tu = BYTWJ(&(W[TWVL * 4]), VADD(Tp, Ts)); ST(&(x[WS(vs, 3) + WS(is, 2)]), Tu, dist, &(x[WS(vs, 3)])); TF = BYTWJ(&(W[TWVL * 4]), VADD(TA, TD)); ST(&(x[WS(vs, 3) + WS(is, 3)]), TF, dist, &(x[WS(vs, 3) + WS(is, 1)])); } { V Tb, Tm, Tx, TI; Tb = BYTWJ(&(W[TWVL * 2]), VSUB(T9, Ta)); ST(&(x[WS(vs, 2)]), Tb, dist, &(x[WS(vs, 2)])); Tm = BYTWJ(&(W[TWVL * 2]), VSUB(Tk, Tl)); ST(&(x[WS(vs, 2) + WS(is, 1)]), Tm, dist, &(x[WS(vs, 2) + WS(is, 1)])); Tx = BYTWJ(&(W[TWVL * 2]), VSUB(Tv, Tw)); ST(&(x[WS(vs, 2) + WS(is, 2)]), Tx, dist, &(x[WS(vs, 2)])); TI = BYTWJ(&(W[TWVL * 2]), VSUB(TG, TH)); ST(&(x[WS(vs, 2) + WS(is, 3)]), TI, dist, &(x[WS(vs, 2) + WS(is, 1)])); } } END_SIMD(); return W; }
/*
 * n2bv_14 -- 14-point vector butterfly without twiddle multiplication.
 * NOTE(review): the "b" presumably marks the backward (inverse) direction
 * and "n2" the variant with a fixed output layout; confirm against the
 * generator before relying on it.
 *
 * Parameters
 *   ri, ro    accepted for signature compatibility; never referenced here.
 *   ii, io    base pointers actually used for input and output.
 *   is        stride passed to WS() to reach input element k (k = 0..13).
 *   os        never referenced: output k goes to the hard-coded slot
 *             xo[2 * k] instead of xo[WS(os, k)].
 *   v         loop count; each pass retires VL of them.
 *   ivs, ovs  per-pass advance of the input / output pointer (times VL).
 *
 * One pass of the loop:
 *   - loads the 14 inputs as seven pairs whose indices differ by 7
 *     (mod 14): (0,7) (4,11) (10,3) (2,9) (12,5) (6,13) (8,1); each pair
 *     yields a difference and a sum;
 *   - output 7 (slot 14) is the total of the seven differences and
 *     output 0 (slot 0) the total of the seven sums;
 *   - the remaining odd-numbered outputs are built only from the pair
 *     differences, the remaining even-numbered outputs only from the pair
 *     sums; each mirrored pair of outputs k / 14-k shares one "plain" term
 *     and one VBYI()-wrapped term and differs in the sign between them.
 * The six KP* constants are numerically |cos(2*pi*j/7)| and
 * sin(2*pi*j/7) for j = 1..3.
 * Every LD() precedes the first ST().
 */
static void n2bv_14(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, int v, int ivs, int ovs) { DVK(KP900968867, +0.900968867902419126236102319507445051165919162); DVK(KP222520933, +0.222520933956314404288902564496794759466355569); DVK(KP623489801, +0.623489801858733530525004884004239810632274731); DVK(KP781831482, +0.781831482468029808708444526674057750232334519); DVK(KP974927912, +0.974927912181823607018131682993931217232785801); DVK(KP433883739, +0.433883739117558120475768332848358754609990728); int i; const R *xi; R *xo; xi = ii; xo = io; BEGIN_SIMD(); for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs)) { V Tp, Ty, Tl, TL, Tq, TE, T7, TJ, Ts, TB, Te, TK, Tr, TH, Tn; V To; Tn = LD(&(xi[0]), ivs, &(xi[0])); To = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); Tp = VSUB(Tn, To); Ty = VADD(Tn, To); { V Th, TC, Tk, TD; { V Tf, Tg, Ti, Tj; Tf = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); Tg = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)])); Th = VSUB(Tf, Tg); TC = VADD(Tf, Tg); Ti = LD(&(xi[WS(is, 10)]), ivs, &(xi[0])); Tj = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); Tk = VSUB(Ti, Tj); TD = VADD(Ti, Tj); } Tl = VSUB(Th, Tk); TL = VSUB(TD, TC); Tq = VADD(Th, Tk); TE = VADD(TC, TD); } { V T3, Tz, T6, TA; { V T1, T2, T4, T5; T1 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); T2 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); T3 = VSUB(T1, T2); Tz = VADD(T1, T2); T4 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0])); T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); T6 = VSUB(T4, T5); TA = VADD(T4, T5); } T7 = VSUB(T3, T6); TJ = VSUB(Tz, TA); Ts = VADD(T3, T6); TB = VADD(Tz, TA); } { V Ta, TF, Td, TG; { V T8, T9, Tb, Tc; T8 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); T9 = LD(&(xi[WS(is, 13)]), ivs, &(xi[WS(is, 1)])); Ta = VSUB(T8, T9); TF = VADD(T8, T9); Tb = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); Td = VSUB(Tb, Tc); TG = VADD(Tb, Tc); } Te = VSUB(Ta, Td); TK = VSUB(TG, TF); Tr = VADD(Ta, Td); TH = VADD(TF, TG); } 
ST(&(xo[14]), VADD(Tp, VADD(Ts, VADD(Tq, Tr))), ovs, &(xo[2])); ST(&(xo[0]), VADD(Ty, VADD(TB, VADD(TE, TH))), ovs, &(xo[0])); { V Tm, Tt, TQ, TP; Tm = VBYI(VFMA(LDK(KP433883739), T7, VFNMS(LDK(KP781831482), Tl, VMUL(LDK(KP974927912), Te)))); Tt = VFMA(LDK(KP623489801), Tq, VFNMS(LDK(KP222520933), Tr, VFNMS(LDK(KP900968867), Ts, Tp))); ST(&(xo[6]), VADD(Tm, Tt), ovs, &(xo[2])); ST(&(xo[22]), VSUB(Tt, Tm), ovs, &(xo[2])); TQ = VBYI(VFMA(LDK(KP974927912), TJ, VFMA(LDK(KP433883739), TL, VMUL(LDK(KP781831482), TK)))); TP = VFMA(LDK(KP623489801), TH, VFNMS(LDK(KP900968867), TE, VFNMS(LDK(KP222520933), TB, Ty))); ST(&(xo[24]), VSUB(TP, TQ), ovs, &(xo[0])); ST(&(xo[4]), VADD(TP, TQ), ovs, &(xo[0])); } { V Tu, Tv, TM, TI; Tu = VBYI(VFMA(LDK(KP781831482), T7, VFMA(LDK(KP974927912), Tl, VMUL(LDK(KP433883739), Te)))); Tv = VFMA(LDK(KP623489801), Ts, VFNMS(LDK(KP900968867), Tr, VFNMS(LDK(KP222520933), Tq, Tp))); ST(&(xo[2]), VADD(Tu, Tv), ovs, &(xo[2])); ST(&(xo[26]), VSUB(Tv, Tu), ovs, &(xo[2])); TM = VBYI(VFNMS(LDK(KP433883739), TK, VFNMS(LDK(KP974927912), TL, VMUL(LDK(KP781831482), TJ)))); TI = VFMA(LDK(KP623489801), TB, VFNMS(LDK(KP900968867), TH, VFNMS(LDK(KP222520933), TE, Ty))); ST(&(xo[12]), VSUB(TI, TM), ovs, &(xo[0])); ST(&(xo[16]), VADD(TI, TM), ovs, &(xo[0])); } { V TO, TN, Tx, Tw; TO = VBYI(VFMA(LDK(KP433883739), TJ, VFNMS(LDK(KP974927912), TK, VMUL(LDK(KP781831482), TL)))); TN = VFMA(LDK(KP623489801), TE, VFNMS(LDK(KP222520933), TH, VFNMS(LDK(KP900968867), TB, Ty))); ST(&(xo[8]), VSUB(TN, TO), ovs, &(xo[0])); ST(&(xo[20]), VADD(TN, TO), ovs, &(xo[0])); Tx = VBYI(VFNMS(LDK(KP781831482), Te, VFNMS(LDK(KP433883739), Tl, VMUL(LDK(KP974927912), T7)))); Tw = VFMA(LDK(KP623489801), Tr, VFNMS(LDK(KP900968867), Tq, VFNMS(LDK(KP222520933), Ts, Tp))); ST(&(xo[10]), VSUB(Tw, Tx), ovs, &(xo[2])); ST(&(xo[18]), VADD(Tx, Tw), ovs, &(xo[2])); } } END_SIMD(); }