/*
 * e10_8 -- 8-point real-to-real kernel applied to v independent vectors.
 *
 * Per vector it reads I[0] and I[WS(is, k)] for k = 1..7 and writes O[0]
 * and O[WS(os, k)] for k = 1..7.  Between vectors I advances by ivs and
 * O by ovs.  Every input is read before the first output is stored.
 *
 * NOTE(review): name and constants look like a size-8 REDFT10 (DCT-II)
 * codelet -- confirm against the generator.
 */
static void e10_8(const R *I, R *O, stride is, stride os, INT v, INT ivs, INT ovs)
{
    DK(KP765366864, +0.765366864730179543456919968060797733522689125);
    DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
    DK(KP390180644, +0.390180644032256535696569736954044481855383236);
    DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
    DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
    DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
    DK(KP1_111140466, +1.111140466039204449485661627897065748749874382);
    DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
    DK(KP707106781, +0.707106781186547524400844362104849039284835938);
    INT i;

    for (i = v; i > 0; i = i - 1, I = I + ivs, O = O + ovs, MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) {
        /* Sums and differences of the mirrored input pairs (k, 7 - k). */
        const E in0 = I[0];
        const E in7 = I[WS(is, 7)];
        const E dif07 = in0 - in7;
        const E sum07 = in0 + in7;
        const E in4 = I[WS(is, 4)];
        const E in3 = I[WS(is, 3)];
        const E dif43 = in4 - in3;
        const E sum43 = in4 + in3;
        const E in2 = I[WS(is, 2)];
        const E in5 = I[WS(is, 5)];
        const E dif25 = in2 - in5;
        const E in1 = I[WS(is, 1)];
        const E in6 = I[WS(is, 6)];
        const E dif16 = in1 - in6;
        const E scaledSum = KP707106781 * (dif25 + dif16);
        const E sum16 = in1 + in6;
        const E scaledDif = KP707106781 * (dif25 - dif16);
        const E sum25 = in2 + in5;

        /* Operands for outputs 3, 5, 4, 0. */
        const E p = dif07 - scaledSum;
        const E q = scaledDif - dif43;
        const E total0 = sum07 + sum43;
        const E total1 = sum25 + sum16;

        /* Operands for outputs 1, 7, 2, 6. */
        const E r = dif07 + scaledSum;
        const E s = dif43 + scaledDif;
        const E delta0 = sum07 - sum43;
        const E delta1 = sum25 - sum16;

        O[WS(os, 3)] = FNMS(KP1_111140466, q, KP1_662939224 * p);
        O[WS(os, 5)] = FMA(KP1_662939224, q, KP1_111140466 * p);
        O[WS(os, 4)] = KP1_414213562 * (total0 - total1);
        O[0] = KP2_000000000 * (total0 + total1);

        O[WS(os, 1)] = FNMS(KP390180644, s, KP1_961570560 * r);
        O[WS(os, 7)] = FMA(KP1_961570560, s, KP390180644 * r);
        O[WS(os, 2)] = FNMS(KP765366864, delta1, KP1_847759065 * delta0);
        O[WS(os, 6)] = FMA(KP765366864, delta0, KP1_847759065 * delta1);
    }
}
/*
 * hc2cfdftv_12 -- vectorised 12-point kernel operating in place on Rp/Rm.
 *
 * Loop: m runs from mb while m < me in steps of VL.  Per iteration Rp and
 * Ip advance by VL*ms, Rm and Im retreat by VL*ms, and W advances by
 * TWVL*22.  Before the first iteration W is offset by
 * (mb - 1) * ((TWVL / VL) * 22).
 *
 * Per iteration: six vectors are loaded from Rp[0], Rp[WS(rs,1..5)] and
 * six from Rm[0], Rm[WS(rs,1..5)]; each Rm vector is conjugated (VCONJ)
 * and combined with its Rp partner by VADD/VSUB; the results are
 * multiplied by eleven twiddles loaded from W[0], W[TWVL*2], ...,
 * W[TWVL*20] (VZMULJ for the sums, VZMULIJ for the differences; row 0's
 * sum is used without a twiddle).  Twelve result vectors are stored back
 * to Rp[0..5] and Rm[0..5] (the Rm results pass through VCONJ first).
 *
 * Ip and Im are advanced but never dereferenced in this function.
 *
 * NOTE(review): name suggests an FFTW half-complex -> complex forward
 * DFT codelet (SIMD variant) -- confirm against the generator.
 */
static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) { DVK(KP433012701, +0.433012701892219323381861585376468091735701313); DVK(KP866025403, +0.866025403784438646763723170752936183471402627); DVK(KP250000000, +0.250000000000000000000000000000000000000000000); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); INT m; for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(rs)) { V TX, T13, T4, Tf, TZ, TD, TF, T17, TW, T14, Tw, Tl, T10, TL, TN; V T16; { V T1, T3, TA, Tb, Td, Te, T9, TC, T2, Tz, Tc, Ta, T6, T8, T7; V T5, TB, TE, Ti, Tk, TI, Ts, Tu, Tv, Tq, TK, Tj, TH, Tt, Tr; V Tn, Tp, To, Tm, TJ, Th, TM; T1 = LD(&(Rp[0]), ms, &(Rp[0])); T2 = LD(&(Rm[0]), -ms, &(Rm[0])); T3 = VCONJ(T2); Tz = LDW(&(W[0])); TA = VZMULIJ(Tz, VSUB(T3, T1)); Tb = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0])); Tc = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0])); Td = VCONJ(Tc); Ta = LDW(&(W[TWVL * 14])); Te = VZMULJ(Ta, VADD(Tb, Td)); T6 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0])); T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0])); T8 = VCONJ(T7); T5 = LDW(&(W[TWVL * 6])); T9 = VZMULJ(T5, VADD(T6, T8)); TB = LDW(&(W[TWVL * 8])); TC = VZMULIJ(TB, VSUB(T8, T6)); TX = VSUB(TC, TA); T13 = VSUB(Te, T9); T4 = VADD(T1, T3); Tf = VADD(T9, Te); TZ = VFNMS(LDK(KP250000000), Tf, VMUL(LDK(KP500000000), T4)); TD = VADD(TA, TC); TE = LDW(&(W[TWVL * 16])); TF = VZMULIJ(TE, VSUB(Td, Tb)); T17 = VFNMS(LDK(KP500000000), TD, TF); Ti = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)])); Tj = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)])); Tk = VCONJ(Tj); TH = LDW(&(W[TWVL * 12])); TI = VZMULIJ(TH, VSUB(Tk, Ti)); Ts = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)])); Tt = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)])); Tu = VCONJ(Tt); Tr = LDW(&(W[TWVL * 2])); Tv = VZMULJ(Tr, VADD(Ts, Tu)); Tn = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)])); To = LD(&(Rm[WS(rs, 5)]), 
-ms, &(Rm[WS(rs, 1)])); Tp = VCONJ(To); Tm = LDW(&(W[TWVL * 18])); Tq = VZMULJ(Tm, VADD(Tn, Tp)); TJ = LDW(&(W[TWVL * 20])); TK = VZMULIJ(TJ, VSUB(Tp, Tn)); TW = VSUB(TK, TI); T14 = VSUB(Tv, Tq); Tw = VADD(Tq, Tv); Th = LDW(&(W[TWVL * 10])); Tl = VZMULJ(Th, VADD(Ti, Tk)); T10 = VFNMS(LDK(KP250000000), Tw, VMUL(LDK(KP500000000), Tl)); TL = VADD(TI, TK); TM = LDW(&(W[TWVL * 4])); TN = VZMULIJ(TM, VSUB(Tu, Ts)); T16 = VFNMS(LDK(KP500000000), TL, TN); } { V Ty, TS, TP, TT, Tg, Tx, TG, TO, TQ, TV, TR, TU, T1i, T1o, T1l; V T1p, T1g, T1h, T1j, T1k, T1m, T1r, T1n, T1q, T12, T1c, T19, T1d, TY, T11; V T15, T18, T1a, T1f, T1b, T1e; Tg = VADD(T4, Tf); Tx = VADD(Tl, Tw); Ty = VADD(Tg, Tx); TS = VSUB(Tg, Tx); TG = VADD(TD, TF); TO = VADD(TL, TN); TP = VADD(TG, TO); TT = VBYI(VSUB(TO, TG)); TQ = VCONJ(VMUL(LDK(KP500000000), VSUB(Ty, TP))); ST(&(Rm[WS(rs, 5)]), TQ, -ms, &(Rm[WS(rs, 1)])); TV = VMUL(LDK(KP500000000), VADD(TS, TT)); ST(&(Rp[WS(rs, 3)]), TV, ms, &(Rp[WS(rs, 1)])); TR = VMUL(LDK(KP500000000), VADD(Ty, TP)); ST(&(Rp[0]), TR, ms, &(Rp[0])); TU = VCONJ(VMUL(LDK(KP500000000), VSUB(TS, TT))); ST(&(Rm[WS(rs, 2)]), TU, -ms, &(Rm[0])); T1g = VADD(TX, TW); T1h = VADD(T13, T14); T1i = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VSUB(T1g, T1h)))); T1o = VMUL(LDK(KP500000000), VBYI(VMUL(LDK(KP866025403), VADD(T1g, T1h)))); T1j = VADD(TZ, T10); T1k = VMUL(LDK(KP500000000), VADD(T17, T16)); T1l = VSUB(T1j, T1k); T1p = VADD(T1j, T1k); T1m = VADD(T1i, T1l); ST(&(Rp[WS(rs, 2)]), T1m, ms, &(Rp[0])); T1r = VCONJ(VSUB(T1p, T1o)); ST(&(Rm[WS(rs, 3)]), T1r, -ms, &(Rm[WS(rs, 1)])); T1n = VCONJ(VSUB(T1l, T1i)); ST(&(Rm[WS(rs, 1)]), T1n, -ms, &(Rm[WS(rs, 1)])); T1q = VADD(T1o, T1p); ST(&(Rp[WS(rs, 4)]), T1q, ms, &(Rp[0])); TY = VMUL(LDK(KP433012701), VSUB(TW, TX)); T11 = VSUB(TZ, T10); T12 = VADD(TY, T11); T1c = VSUB(T11, TY); T15 = VMUL(LDK(KP866025403), VSUB(T13, T14)); T18 = VSUB(T16, T17); T19 = VMUL(LDK(KP500000000), VBYI(VSUB(T15, T18))); T1d = VMUL(LDK(KP500000000), 
VBYI(VADD(T15, T18))); T1a = VCONJ(VSUB(T12, T19)); ST(&(Rm[0]), T1a, -ms, &(Rm[0])); T1f = VCONJ(VADD(T1c, T1d)); ST(&(Rm[WS(rs, 4)]), T1f, -ms, &(Rm[0])); T1b = VADD(T12, T19); ST(&(Rp[WS(rs, 1)]), T1b, ms, &(Rp[WS(rs, 1)])); T1e = VSUB(T1c, T1d); ST(&(Rp[WS(rs, 5)]), T1e, ms, &(Rp[WS(rs, 1)])); } } }
/*
 * t1_10 -- in-place 10-point twiddle kernel on split arrays ri / ii.
 *
 * Loop: runs m times; per iteration ri and ii advance by dist and W
 * advances by 18.  The advanced W is returned to the caller.
 *
 * Per iteration: element 0 (ri[0], ii[0]) is used unmodified.  Each
 * element k = 1..9 at ri[WS(ios, k)], ii[WS(ios, k)] is first combined
 * with the pair (W[2k-2], W[2k-1]) as
 *     FMA(Wa, r, Wb * i)   and   FNMS(Wb, r, Wa * i)
 * (assuming the usual FMA(a,b,c) = a*b + c, FNMS(a,b,c) = c - a*b macro
 * definitions -- they are not visible here).  The ten resulting pairs
 * are then combined using the four constants declared at the top, and
 * ten values are written back to each of ri and ii at the same
 * WS(ios, 0..9) positions.  All reads precede the first write.
 *
 * NOTE(review): name suggests an FFTW decimation-in-time radix-10
 * twiddle codelet -- confirm against the generator.
 */
static const R *t1_10(R *ri, R *ii, const R *W, stride ios, INT m, INT dist) { DK(KP587785252, +0.587785252292473129168705954639072768597652438); DK(KP951056516, +0.951056516295153572116439333379382143405698634); DK(KP250000000, +0.250000000000000000000000000000000000000000000); DK(KP559016994, +0.559016994374947424102293417182819058860154590); INT i; for (i = m; i > 0; i = i - 1, ri = ri + dist, ii = ii + dist, W = W + 18, MAKE_VOLATILE_STRIDE(ios)) { E T7, T1O, TT, T1C, TF, TQ, TR, T1o, T1p, T1y, TX, TY, TZ, T1d, T1g; E T1M, Ti, Tt, Tu, T1r, T1s, T1x, TU, TV, TW, T16, T19, T1L; { E T1, T1B, T6, T1A; T1 = ri[0]; T1B = ii[0]; { E T3, T5, T2, T4; T3 = ri[WS(ios, 5)]; T5 = ii[WS(ios, 5)]; T2 = W[8]; T4 = W[9]; T6 = FMA(T2, T3, T4 * T5); T1A = FNMS(T4, T3, T2 * T5); } T7 = T1 - T6; T1O = T1B - T1A; TT = T1 + T6; T1C = T1A + T1B; } { E Tz, T1b, TP, T1f, TE, T1c, TK, T1e; { E Tw, Ty, Tv, Tx; Tw = ri[WS(ios, 4)]; Ty = ii[WS(ios, 4)]; Tv = W[6]; Tx = W[7]; Tz = FMA(Tv, Tw, Tx * Ty); T1b = FNMS(Tx, Tw, Tv * Ty); } { E TM, TO, TL, TN; TM = ri[WS(ios, 1)]; TO = ii[WS(ios, 1)]; TL = W[0]; TN = W[1]; TP = FMA(TL, TM, TN * TO); T1f = FNMS(TN, TM, TL * TO); } { E TB, TD, TA, TC; TB = ri[WS(ios, 9)]; TD = ii[WS(ios, 9)]; TA = W[16]; TC = W[17]; TE = FMA(TA, TB, TC * TD); T1c = FNMS(TC, TB, TA * TD); } { E TH, TJ, TG, TI; TH = ri[WS(ios, 6)]; TJ = ii[WS(ios, 6)]; TG = W[10]; TI = W[11]; TK = FMA(TG, TH, TI * TJ); T1e = FNMS(TI, TH, TG * TJ); } TF = Tz - TE; TQ = TK - TP; TR = TF + TQ; T1o = T1b + T1c; T1p = T1e + T1f; T1y = T1o + T1p; TX = Tz + TE; TY = TK + TP; TZ = TX + TY; T1d = T1b - T1c; T1g = T1e - T1f; T1M = T1d + T1g; } { E Tc, T14, Ts, T18, Th, T15, Tn, T17; { E T9, Tb, T8, Ta; T9 = ri[WS(ios, 2)]; Tb = ii[WS(ios, 2)]; T8 = W[2]; Ta = W[3]; Tc = FMA(T8, T9, Ta * Tb); T14 = FNMS(Ta, T9, T8 * Tb); } { E Tp, Tr, To, Tq; Tp = ri[WS(ios, 3)]; Tr = ii[WS(ios, 3)]; To = W[4]; Tq = W[5]; Ts = FMA(To, Tp, Tq * Tr); T18 = FNMS(Tq, Tp, To * Tr); } { E Te, Tg, Td, Tf; Te = ri[WS(ios, 
7)]; Tg = ii[WS(ios, 7)]; Td = W[12]; Tf = W[13]; Th = FMA(Td, Te, Tf * Tg); T15 = FNMS(Tf, Te, Td * Tg); } { E Tk, Tm, Tj, Tl; Tk = ri[WS(ios, 8)]; Tm = ii[WS(ios, 8)]; Tj = W[14]; Tl = W[15]; Tn = FMA(Tj, Tk, Tl * Tm); T17 = FNMS(Tl, Tk, Tj * Tm); } Ti = Tc - Th; Tt = Tn - Ts; Tu = Ti + Tt; T1r = T14 + T15; T1s = T17 + T18; T1x = T1r + T1s; TU = Tc + Th; TV = Tn + Ts; TW = TU + TV; T16 = T14 - T15; T19 = T17 - T18; T1L = T16 + T19; } { E T11, TS, T12, T1i, T1k, T1a, T1h, T1j, T13; T11 = KP559016994 * (Tu - TR); TS = Tu + TR; T12 = FNMS(KP250000000, TS, T7); T1a = T16 - T19; T1h = T1d - T1g; T1i = FMA(KP951056516, T1a, KP587785252 * T1h); T1k = FNMS(KP587785252, T1a, KP951056516 * T1h); ri[WS(ios, 5)] = T7 + TS; T1j = T12 - T11; ri[WS(ios, 7)] = T1j - T1k; ri[WS(ios, 3)] = T1j + T1k; T13 = T11 + T12; ri[WS(ios, 9)] = T13 - T1i; ri[WS(ios, 1)] = T13 + T1i; } { E T1N, T1P, T1Q, T1U, T1W, T1S, T1T, T1V, T1R; T1N = KP559016994 * (T1L - T1M); T1P = T1L + T1M; T1Q = FNMS(KP250000000, T1P, T1O); T1S = Ti - Tt; T1T = TF - TQ; T1U = FMA(KP951056516, T1S, KP587785252 * T1T); T1W = FNMS(KP587785252, T1S, KP951056516 * T1T); ii[WS(ios, 5)] = T1P + T1O; T1V = T1Q - T1N; ii[WS(ios, 3)] = T1V - T1W; ii[WS(ios, 7)] = T1W + T1V; T1R = T1N + T1Q; ii[WS(ios, 1)] = T1R - T1U; ii[WS(ios, 9)] = T1U + T1R; } { E T1m, T10, T1l, T1u, T1w, T1q, T1t, T1v, T1n; T1m = KP559016994 * (TW - TZ); T10 = TW + TZ; T1l = FNMS(KP250000000, T10, TT); T1q = T1o - T1p; T1t = T1r - T1s; T1u = FNMS(KP587785252, T1t, KP951056516 * T1q); T1w = FMA(KP951056516, T1t, KP587785252 * T1q); ri[0] = TT + T10; T1v = T1m + T1l; ri[WS(ios, 4)] = T1v - T1w; ri[WS(ios, 6)] = T1v + T1w; T1n = T1l - T1m; ri[WS(ios, 2)] = T1n - T1u; ri[WS(ios, 8)] = T1n + T1u; } { E T1H, T1z, T1G, T1F, T1J, T1D, T1E, T1K, T1I; T1H = KP559016994 * (T1x - T1y); T1z = T1x + T1y; T1G = FNMS(KP250000000, T1z, T1C); T1D = TX - TY; T1E = TU - TV; T1F = FNMS(KP587785252, T1E, KP951056516 * T1D); T1J = FMA(KP951056516, T1E, KP587785252 * T1D); 
ii[0] = T1z + T1C; T1K = T1H + T1G; ii[WS(ios, 4)] = T1J + T1K; ii[WS(ios, 6)] = T1K - T1J; T1I = T1G - T1H; ii[WS(ios, 2)] = T1F + T1I; ii[WS(ios, 8)] = T1I - T1F; } } return W; }
static void hc2cfdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) { DK(KP500000000, +0.500000000000000000000000000000000000000000000); { INT m; for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(rs)) { E T1, T3, T2, T4, T5, T9; T1 = W[0]; T3 = W[1]; T2 = W[2]; T4 = W[3]; T5 = FMA(T1, T2, T3 * T4); T9 = FNMS(T3, T2, T1 * T4); { E Tg, Tr, Tm, Tx, Td, Tw, Tp, Ts; { E Te, Tf, Tl, Ti, Tj, Tk; Te = Ip[0]; Tf = Im[0]; Tl = Te + Tf; Ti = Rm[0]; Tj = Rp[0]; Tk = Ti - Tj; Tg = Te - Tf; Tr = Tj + Ti; Tm = FNMS(T3, Tl, T1 * Tk); Tx = FMA(T3, Tk, T1 * Tl); } { E T8, To, Tc, Tn; { E T6, T7, Ta, Tb; T6 = Ip[WS(rs, 1)]; T7 = Im[WS(rs, 1)]; T8 = T6 - T7; To = T6 + T7; Ta = Rp[WS(rs, 1)]; Tb = Rm[WS(rs, 1)]; Tc = Ta + Tb; Tn = Ta - Tb; } Td = FNMS(T9, Tc, T5 * T8); Tw = FNMS(T4, Tn, T2 * To); Tp = FMA(T2, Tn, T4 * To); Ts = FMA(T5, Tc, T9 * T8); } { E Th, Tq, Tz, TA; Th = Td + Tg; Tq = Tm - Tp; Ip[0] = KP500000000 * (Th + Tq); Im[WS(rs, 1)] = KP500000000 * (Tq - Th); Tz = Tr + Ts; TA = Tw + Tx; Rm[WS(rs, 1)] = KP500000000 * (Tz - TA); Rp[0] = KP500000000 * (Tz + TA); } { E Tt, Tu, Tv, Ty; Tt = Tr - Ts; Tu = Tp + Tm; Rm[0] = KP500000000 * (Tt - Tu); Rp[WS(rs, 1)] = KP500000000 * (Tt + Tu); Tv = Tg - Td; Ty = Tw - Tx; Ip[WS(rs, 1)] = KP500000000 * (Tv + Ty); Im[0] = KP500000000 * (Ty - Tv); } } } } }
/*
 * r2cbIII_4 -- scalar kernel turning two (Cr, Ci) pairs into four reals.
 *
 * Per batch it reads Cr[0], Cr[WS(csr, 1)], Ci[0], Ci[WS(csi, 1)] and
 * writes R0[0], R0[WS(rs, 1)], R1[0], R1[WS(rs, 1)].  It runs v batches;
 * between batches R0/R1 advance by ovs and Cr/Ci by ivs.
 *
 * NOTE(review): name suggests an FFTW complex -> real backward codelet
 * of the "type III" (shifted) kind, size 4 -- confirm.
 */
static void r2cbIII_4(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
    DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
    DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
    INT i;

    for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(16, rs), MAKE_VOLATILE_STRIDE(16, csr), MAKE_VOLATILE_STRIDE(16, csi)) {
        const E re0 = Cr[0];
        const E re1 = Cr[WS(csr, 1)];
        const E reDif = re0 - re1;
        const E im0 = Ci[0];
        const E im1 = Ci[WS(csi, 1)];
        const E imSum = im0 + im1;

        R0[0] = KP2_000000000 * (re0 + re1);
        R0[WS(rs, 1)] = KP2_000000000 * (im1 - im0);
        R1[0] = KP1_414213562 * (reDif - imSum);
        R1[WS(rs, 1)] = -(KP1_414213562 * (reDif + imSum));
    }
}
/*
 * hf_4 -- scalar in-place 4-point twiddle kernel on cr / ci.
 *
 * Loop: m runs from mb while m < me; per iteration cr advances by ms,
 * ci retreats by ms and W advances by 6.  W is offset by (mb - 1) * 6
 * before the first iteration.
 *
 * Element 0 is used as-is.  Elements 1..3 are combined with the pairs
 * (W[0],W[1]), (W[2],W[3]), (W[4],W[5]); each combination is written as
 * a plain product followed by one FMA / FNMS, which is the only
 * difference from a textbook two-product form.  All inputs are read
 * before the first output is stored.
 *
 * NOTE(review): this file contains a second definition named hf_4;
 * presumably the two come from separately compiled variants -- confirm.
 */
static void hf_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;

    for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 6, MAKE_VOLATILE_STRIDE(8, rs)) {
        const E r0 = cr[0];
        const E i0 = ci[0];

        /* Element 2 with (W[2], W[3]). */
        const E r2 = cr[WS(rs, 2)];
        const E i2 = ci[WS(rs, 2)];
        const E w2a = W[2];
        const E w2b = W[3];
        const E prod2i = w2a * i2;
        const E prod2r = w2a * r2;
        const E v2 = FNMS(w2b, r2, prod2i);
        const E u2 = FMA(w2b, i2, prod2r);

        /* Element 1 with (W[0], W[1]). */
        const E r1 = cr[WS(rs, 1)];
        const E i1 = ci[WS(rs, 1)];
        const E w1a = W[0];
        const E w1b = W[1];
        const E prod1i = w1a * i1;
        const E prod1r = w1a * r1;
        const E v1 = FNMS(w1b, r1, prod1i);
        const E u1 = FMA(w1b, i1, prod1r);

        /* Element 3 with (W[4], W[5]). */
        const E r3 = cr[WS(rs, 3)];
        const E i3 = ci[WS(rs, 3)];
        const E w3a = W[4];
        const E w3b = W[5];
        const E prod3i = w3a * i3;
        const E prod3r = w3a * r3;
        const E v3 = FNMS(w3b, r3, prod3i);
        const E u3 = FMA(w3b, i3, prod3r);

        /* Butterflies. */
        const E dif02 = r0 - u2;
        const E sum02 = r0 + u2;
        const E vDif0 = i0 - v2;
        const E vSum0 = v2 + i0;
        const E vDif13 = v1 - v3;
        const E vSum13 = v1 + v3;
        const E uSum13 = u1 + u3;
        const E uDif31 = u3 - u1;

        ci[WS(rs, 3)] = vSum13 + vSum0;
        cr[WS(rs, 2)] = vSum13 - vSum0;
        cr[WS(rs, 1)] = dif02 + vDif13;
        ci[0] = dif02 - vDif13;
        ci[WS(rs, 2)] = uDif31 + vDif0;
        cr[WS(rs, 3)] = uDif31 - vDif0;
        cr[0] = sum02 + uSum13;
        ci[WS(rs, 1)] = sum02 - uSum13;
    }
}
static const R *t2fv_8(R *ri, R *ii, const R *W, stride ios, INT m, INT dist) { DVK(KP707106781, +0.707106781186547524400844362104849039284835938); INT i; R *x; x = ri; for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(ios)) { V T4, Tq, Tm, Tr, T9, Tt, Te, Tu, T1, T3, T2; T1 = LD(&(x[0]), dist, &(x[0])); T2 = LD(&(x[WS(ios, 4)]), dist, &(x[0])); T3 = BYTWJ(&(W[TWVL * 6]), T2); T4 = VSUB(T1, T3); Tq = VADD(T1, T3); { V Tj, Tl, Ti, Tk; Ti = LD(&(x[WS(ios, 2)]), dist, &(x[0])); Tj = BYTWJ(&(W[TWVL * 2]), Ti); Tk = LD(&(x[WS(ios, 6)]), dist, &(x[0])); Tl = BYTWJ(&(W[TWVL * 10]), Tk); Tm = VSUB(Tj, Tl); Tr = VADD(Tj, Tl); } { V T6, T8, T5, T7; T5 = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)])); T6 = BYTWJ(&(W[0]), T5); T7 = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)])); T8 = BYTWJ(&(W[TWVL * 8]), T7); T9 = VSUB(T6, T8); Tt = VADD(T6, T8); } { V Tb, Td, Ta, Tc; Ta = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)])); Tb = BYTWJ(&(W[TWVL * 12]), Ta); Tc = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)])); Td = BYTWJ(&(W[TWVL * 4]), Tc); Te = VSUB(Tb, Td); Tu = VADD(Tb, Td); } { V Ts, Tv, Tw, Tx; Ts = VADD(Tq, Tr); Tv = VADD(Tt, Tu); ST(&(x[WS(ios, 4)]), VSUB(Ts, Tv), dist, &(x[0])); ST(&(x[0]), VADD(Ts, Tv), dist, &(x[0])); Tw = VSUB(Tq, Tr); Tx = VBYI(VSUB(Tu, Tt)); ST(&(x[WS(ios, 6)]), VSUB(Tw, Tx), dist, &(x[0])); ST(&(x[WS(ios, 2)]), VADD(Tw, Tx), dist, &(x[0])); { V Tg, To, Tn, Tp, Tf, Th; Tf = VMUL(LDK(KP707106781), VADD(T9, Te)); Tg = VADD(T4, Tf); To = VSUB(T4, Tf); Th = VMUL(LDK(KP707106781), VSUB(Te, T9)); Tn = VBYI(VSUB(Th, Tm)); Tp = VBYI(VADD(Tm, Th)); ST(&(x[WS(ios, 7)]), VSUB(Tg, Tn), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 3)]), VADD(To, Tp), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 1)]), VADD(Tg, Tn), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 5)]), VSUB(To, Tp), dist, &(x[WS(ios, 1)])); } } } return W; }
/*
 * hc2cbdftv_8 -- vectorised 8-point kernel operating in place on Rp/Rm
 * (variant built on the fused VFMACONJ / VFNMSCONJ / VFMSCONJ and
 * VFMAI / VFNMSI primitives).
 *
 * Loop: m runs from mb while m < me in steps of VL.  Per iteration Rp
 * and Ip advance by VL*ms, Rm and Im retreat by VL*ms, and W advances by
 * TWVL*14.  W is offset by (mb - 1) * ((TWVL / VL) * 14) before the
 * first iteration.  VLEAVE() is invoked once after the loop.
 *
 * Per iteration: four vectors are loaded from Rp[0], Rp[WS(rs,1..3)] and
 * four from Rm[0], Rm[WS(rs,1..3)] (Rp[k] is paired with Rm[3 - k]);
 * seven twiddles are loaded from W[0], W[TWVL*2], ..., W[TWVL*12] and
 * applied with VZMUL / VZMULI; eight vectors are stored back to Rp[0..3]
 * and Rm[0..3], the Rm ones after VCONJ.  All loads precede all stores.
 *
 * Ip and Im are advanced but never dereferenced in this function.
 *
 * NOTE(review): this file contains a second definition named
 * hc2cbdftv_8 (written with explicit VCONJ/VBYI); presumably the two
 * come from separately compiled variants -- confirm.
 */
static void hc2cbdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) { DVK(KP707106781, +0.707106781186547524400844362104849039284835938); { INT m; for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) { V TJ, T4, Tf, TB, TD, TE, Tm, T1, Tj, TF, Tp, Tb, Tg, Tt, Tx; V T2, T3, Td, Te, T5, T6, T8, T9, Tn, T7, To, Ta, Tk, Tl, TG; V TL, Tq, Tc, Tu, Th, Tv, Ty, Tw, TC, Ti, TK, TA, Tz, TI, TH; V Ts, Tr, TN, TM; T2 = LD(&(Rp[0]), ms, &(Rp[0])); T3 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)])); Td = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0])); Te = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)])); T5 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)])); T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0])); T8 = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)])); T9 = LD(&(Rm[0]), -ms, &(Rm[0])); TJ = LDW(&(W[0])); Tk = VFMACONJ(T3, T2); T4 = VFNMSCONJ(T3, T2); Tl = VFMACONJ(Te, Td); Tf = VFNMSCONJ(Te, Td); Tn = VFMACONJ(T6, T5); T7 = VFNMSCONJ(T6, T5); To = VFMACONJ(T9, T8); Ta = VFMSCONJ(T9, T8); TB = LDW(&(W[TWVL * 8])); TD = LDW(&(W[TWVL * 6])); TE = VADD(Tk, Tl); Tm = VSUB(Tk, Tl); T1 = LDW(&(W[TWVL * 12])); Tj = LDW(&(W[TWVL * 10])); TF = VADD(Tn, To); Tp = VSUB(Tn, To); Tb = VADD(T7, Ta); Tg = VSUB(T7, Ta); Tt = LDW(&(W[TWVL * 4])); Tx = LDW(&(W[TWVL * 2])); TG = VZMUL(TD, VSUB(TE, TF)); TL = VADD(TE, TF); Tq = VZMUL(Tj, VFNMSI(Tp, Tm)); Tc = VFMA(LDK(KP707106781), Tb, T4); Tu = VFNMS(LDK(KP707106781), Tb, T4); Th = VFMA(LDK(KP707106781), Tg, Tf); Tv = VFNMS(LDK(KP707106781), Tg, Tf); Ty = VZMUL(Tx, VFMAI(Tp, Tm)); Tw = VZMULI(Tt, VFNMSI(Tv, Tu)); TC = VZMULI(TB, VFMAI(Tv, Tu)); Ti = VZMULI(T1, VFNMSI(Th, Tc)); TK = VZMULI(TJ, VFMAI(Th, Tc)); TA = VCONJ(VSUB(Ty, Tw)); Tz = VADD(Tw, Ty); TI = VCONJ(VSUB(TG, TC)); TH = VADD(TC, TG); Ts = VCONJ(VSUB(Tq, Ti)); Tr = VADD(Ti, Tq); TN = VCONJ(VSUB(TL, TK)); TM = VADD(TK, TL); 
ST(&(Rm[WS(rs, 1)]), TA, -ms, &(Rm[WS(rs, 1)])); ST(&(Rp[WS(rs, 1)]), Tz, ms, &(Rp[WS(rs, 1)])); ST(&(Rm[WS(rs, 2)]), TI, -ms, &(Rm[0])); ST(&(Rp[WS(rs, 2)]), TH, ms, &(Rp[0])); ST(&(Rm[WS(rs, 3)]), Ts, -ms, &(Rm[WS(rs, 1)])); ST(&(Rp[WS(rs, 3)]), Tr, ms, &(Rp[WS(rs, 1)])); ST(&(Rm[0]), TN, -ms, &(Rm[0])); ST(&(Rp[0]), TM, ms, &(Rp[0])); } } VLEAVE(); }
/*
 * r2cb_8 -- scalar kernel producing eight reals (R0[0..3], R1[0..3])
 * from Cr[0..4] and Ci[1..3].
 *
 * Runs v batches; between batches R0/R1 advance by ovs and Cr/Ci by
 * ivs.  Ci[0] and Ci[WS(csi, 4)] are not read.  All inputs are read
 * before the first output is stored.
 *
 * NOTE(review): name suggests an FFTW complex -> real backward codelet
 * of size 8 -- confirm.  This file also contains a second definition
 * named r2cb_8 (FMA form); presumably separately compiled variants.
 */
static void r2cb_8(float *R0, float *R1, float *Cr, float *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
    DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
    DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
    INT i;

    for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) {
        /* Doubled middle terms. */
        const E re2 = Cr[WS(csr, 2)];
        const E re2x2 = KP2_000000000 * re2;
        const E im2 = Ci[WS(csi, 2)];
        const E im2x2 = KP2_000000000 * im2;

        /* End terms. */
        const E re0 = Cr[0];
        const E re4 = Cr[WS(csr, 4)];
        const E reSum04 = re0 + re4;
        const E reDif04 = re0 - re4;

        /* Terms 1 and 3. */
        const E re1 = Cr[WS(csr, 1)];
        const E re3 = Cr[WS(csr, 3)];
        const E reSum13x2 = KP2_000000000 * (re1 + re3);
        const E reDif13 = re1 - re3;
        const E im1 = Ci[WS(csi, 1)];
        const E im3 = Ci[WS(csi, 3)];
        const E imDif13x2 = KP2_000000000 * (im1 - im3);
        const E imSum13 = im1 + im3;

        /* Operands for R0. */
        const E evenHi = reSum04 + re2x2;
        const E evenLo = reSum04 - re2x2;

        /* Operands for R1. */
        const E oddLo = reDif04 - im2x2;
        const E oddLoRot = KP1_414213562 * (reDif13 - imSum13);
        const E oddHi = reDif04 + im2x2;
        const E oddHiRot = KP1_414213562 * (reDif13 + imSum13);

        R0[WS(rs, 2)] = evenHi - reSum13x2;
        R0[0] = evenHi + reSum13x2;
        R0[WS(rs, 1)] = evenLo - imDif13x2;
        R0[WS(rs, 3)] = evenLo + imDif13x2;
        R1[WS(rs, 2)] = oddLo - oddLoRot;
        R1[0] = oddLo + oddLoRot;
        R1[WS(rs, 1)] = oddHi - oddHiRot;
        R1[WS(rs, 3)] = oddHi + oddHiRot;
    }
}
/*
 * t1buv_4 -- vectorised in-place 4-point twiddle kernel.
 *
 * Works only through x = ii; the ri argument is not referenced.  m runs
 * from mb while m < me in steps of VL; per iteration x advances by
 * VL * ms and W by TWVL * 6.  W is offset by mb * ((TWVL / VL) * 6)
 * before the first iteration.  VLEAVE() is invoked once after the loop.
 *
 * Per iteration: four vectors are loaded from x[0], x[WS(rs, 1..3)];
 * elements 1, 2, 3 go through BYTW with the twiddles at W[0],
 * W[TWVL * 2], W[TWVL * 4]; four vectors are stored back to the same
 * slots.  All loads precede all stores.
 *
 * NOTE(review): name suggests an FFTW backward SIMD twiddle codelet --
 * confirm against the generator.
 */
static void t1buv_4(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
    INT m;
    R *x;

    x = ii;
    for (m = mb, W = W + (mb * ((TWVL / VL) * 6)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 6), MAKE_VOLATILE_STRIDE(4, rs)) {
        V in0 = LD(&(x[0]), ms, &(x[0]));
        V raw3 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
        V tw3 = BYTW(&(W[TWVL * 4]), raw3);
        V raw2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
        V tw2 = BYTW(&(W[TWVL * 2]), raw2);
        V raw1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
        V tw1 = BYTW(&(W[0]), raw1);

        V dif02 = VSUB(in0, tw2);
        V rot13 = VBYI(VSUB(tw1, tw3));
        V sum02 = VADD(in0, tw2);
        V sum13 = VADD(tw1, tw3);

        ST(&(x[WS(rs, 3)]), VSUB(dif02, rot13), ms, &(x[WS(rs, 1)]));
        ST(&(x[WS(rs, 1)]), VADD(dif02, rot13), ms, &(x[WS(rs, 1)]));
        ST(&(x[WS(rs, 2)]), VSUB(sum02, sum13), ms, &(x[0]));
        ST(&(x[0]), VADD(sum02, sum13), ms, &(x[0]));
    }
    VLEAVE();
}
/*
 * hc2cbdftv_8 -- vectorised 8-point kernel operating in place on Rp/Rm
 * (variant written with explicit VCONJ, VBYI, VADD, VSUB and VMUL).
 *
 * Loop: m runs from mb while m < me in steps of VL.  Per iteration Rp
 * and Ip advance by VL*ms, Rm and Im retreat by VL*ms, and W advances by
 * TWVL*14.  W is offset by (mb - 1) * ((TWVL / VL) * 14) before the
 * first iteration.  VLEAVE() is invoked once after the loop.
 *
 * Per iteration: four vectors are loaded from Rp[0], Rp[WS(rs,1..3)] and
 * four from Rm[0], Rm[WS(rs,1..3)]; each Rm vector is conjugated and
 * paired with Rp[3 - k]; seven twiddles from W[0], W[TWVL*2], ...,
 * W[TWVL*12] are applied with VZMUL / VZMULI; eight vectors are stored
 * back to Rp[0..3] and Rm[0..3], the Rm ones after VCONJ.  Stores are
 * interleaved with the remaining arithmetic, but every LD from Rp/Rm is
 * issued before the first ST.
 *
 * Ip and Im are advanced but never dereferenced in this function.
 *
 * NOTE(review): this file contains a second definition named
 * hc2cbdftv_8 (written with VFMACONJ-style fused primitives);
 * presumably the two come from separately compiled variants -- confirm.
 */
static void hc2cbdftv_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) { DVK(KP707106781, +0.707106781186547524400844362104849039284835938); { INT m; for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 14)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(32, rs)) { V T5, Tj, Tq, TI, Te, Tk, Tt, TJ, T2, Tg, T4, Ti, T3, Th, To; V Tp, T6, Tc, T8, Tb, T7, Ta, T9, Td, Tr, Ts, TP, Tu, Tm, TO; V Tn, Tf, Tl, T1, TN, Tv, TR, Tw, TQ, TC, TK, TA, TG, TB, TH; V Ty, Tz, Tx, TF, TD, TM, TE, TL; T2 = LD(&(Rp[0]), ms, &(Rp[0])); Tg = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0])); T3 = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)])); T4 = VCONJ(T3); Th = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)])); Ti = VCONJ(Th); T5 = VSUB(T2, T4); Tj = VSUB(Tg, Ti); To = VADD(T2, T4); Tp = VADD(Tg, Ti); Tq = VSUB(To, Tp); TI = VADD(To, Tp); T6 = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)])); Tc = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)])); T7 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0])); T8 = VCONJ(T7); Ta = LD(&(Rm[0]), -ms, &(Rm[0])); Tb = VCONJ(Ta); T9 = VSUB(T6, T8); Td = VSUB(Tb, Tc); Te = VMUL(LDK(KP707106781), VADD(T9, Td)); Tk = VMUL(LDK(KP707106781), VSUB(T9, Td)); Tr = VADD(T6, T8); Ts = VADD(Tb, Tc); Tt = VBYI(VSUB(Tr, Ts)); TJ = VADD(Tr, Ts); TP = VADD(TI, TJ); Tn = LDW(&(W[TWVL * 10])); Tu = VZMUL(Tn, VSUB(Tq, Tt)); Tf = VADD(T5, Te); Tl = VBYI(VADD(Tj, Tk)); T1 = LDW(&(W[TWVL * 12])); Tm = VZMULI(T1, VSUB(Tf, Tl)); TN = LDW(&(W[0])); TO = VZMULI(TN, VADD(Tl, Tf)); Tv = VADD(Tm, Tu); ST(&(Rp[WS(rs, 3)]), Tv, ms, &(Rp[WS(rs, 1)])); TR = VCONJ(VSUB(TP, TO)); ST(&(Rm[0]), TR, -ms, &(Rm[0])); Tw = VCONJ(VSUB(Tu, Tm)); ST(&(Rm[WS(rs, 3)]), Tw, -ms, &(Rm[WS(rs, 1)])); TQ = VADD(TO, TP); ST(&(Rp[0]), TQ, ms, &(Rp[0])); TB = LDW(&(W[TWVL * 2])); TC = VZMUL(TB, VADD(Tq, Tt)); TH = LDW(&(W[TWVL * 6])); TK = VZMUL(TH, VSUB(TI, TJ)); Ty = VBYI(VSUB(Tk, Tj)); Tz = VSUB(T5, Te); Tx = LDW(&(W[TWVL * 
4])); TA = VZMULI(Tx, VADD(Ty, Tz)); TF = LDW(&(W[TWVL * 8])); TG = VZMULI(TF, VSUB(Tz, Ty)); TD = VADD(TA, TC); ST(&(Rp[WS(rs, 1)]), TD, ms, &(Rp[WS(rs, 1)])); TM = VCONJ(VSUB(TK, TG)); ST(&(Rm[WS(rs, 2)]), TM, -ms, &(Rm[0])); TE = VCONJ(VSUB(TC, TA)); ST(&(Rm[WS(rs, 1)]), TE, -ms, &(Rm[WS(rs, 1)])); TL = VADD(TG, TK); ST(&(Rp[WS(rs, 2)]), TL, ms, &(Rp[0])); } } VLEAVE(); }
/*
 * r2cbIII_7 -- scalar kernel producing seven reals from Cr[0..3] and
 * Ci[0..2], written as nested FMA / FNMS chains.
 *
 * Outputs: R0[0], R0[WS(rs, 1..3)], R1[0], R1[WS(rs, 1..2)].  Runs v
 * batches; between batches R0/R1 advance by ovs and Cr/Ci by ivs.  All
 * inputs are read before the first output is stored; R0[0] is stored
 * first, as in the original ordering.
 *
 * NOTE(review): name suggests an FFTW complex -> real backward codelet
 * of the "type III" kind, size 7 -- confirm.  This file also contains a
 * second definition named r2cbIII_7 (direct form); presumably
 * separately compiled variants.
 */
static void r2cbIII_7(float *R0, float *R1, float *Cr, float *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
    DK(KP1_949855824, +1.949855824363647214036263365987862434465571601);
    DK(KP1_801937735, +1.801937735804838252472204639014890102331838324);
    DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
    DK(KP692021471, +0.692021471630095869627814897002069140197260599);
    DK(KP801937735, +0.801937735804838252472204639014890102331838324);
    DK(KP356895867, +0.356895867892209443894399510021300583399127187);
    DK(KP554958132, +0.554958132087371191422194871006410481067288862);
    INT i;

    for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) {
        const E re3 = Cr[WS(csr, 3)];
        const E im1 = Ci[WS(csi, 1)];
        const E im0 = Ci[0];
        const E im2 = Ci[WS(csi, 2)];
        const E re2 = Cr[WS(csr, 2)];
        const E re0 = Cr[0];
        const E re1 = Cr[WS(csr, 1)];

        /* Two-level chains over the Ci inputs. */
        const E imInnerA = FMA(KP554958132, im1, im0);
        const E imInnerB = FNMS(KP554958132, im2, im1);
        const E imInnerC = FMA(KP554958132, im0, im2);
        const E imOuterA = FNMS(KP801937735, imInnerA, im2);
        const E imOuterB = FNMS(KP801937735, imInnerB, im0);
        const E imOuterC = FMA(KP801937735, imInnerC, im1);

        /* Three-level chains over the Cr inputs. */
        const E reTotal = re2 + re1 + re0;
        const E reL1A = FNMS(KP356895867, re2, re0);
        const E reL1B = FNMS(KP356895867, re0, re1);
        const E reL1C = FNMS(KP356895867, re1, re2);
        const E reL2A = FNMS(KP692021471, reL1A, re1);
        const E reL2B = FNMS(KP692021471, reL1B, re2);
        const E reL2C = FNMS(KP692021471, reL1C, re0);
        const E reOuterA = FNMS(KP1_801937735, reL2A, re3);
        const E reOuterB = FNMS(KP1_801937735, reL2B, re3);
        const E reOuterC = FNMS(KP1_801937735, reL2C, re3);

        R0[0] = FMA(KP2_000000000, reTotal, re3);
        R1[WS(rs, 2)] = FMS(KP1_949855824, imOuterA, reOuterA);
        R0[WS(rs, 1)] = FMA(KP1_949855824, imOuterA, reOuterA);
        R0[WS(rs, 2)] = FNMS(KP1_949855824, imOuterB, reOuterB);
        R1[WS(rs, 1)] = -(FMA(KP1_949855824, imOuterB, reOuterB));
        R0[WS(rs, 3)] = FNMS(KP1_949855824, imOuterC, reOuterC);
        R1[0] = -(FMA(KP1_949855824, imOuterC, reOuterC));
    }
}
/*
 * r2cbIII_7 -- scalar kernel producing seven reals from Cr[0..3] and
 * Ci[0..2], written as direct three-term linear combinations.
 *
 * Outputs: R0[0], R0[WS(rs, 1..3)], R1[0], R1[WS(rs, 1..2)].  Runs v
 * batches; between batches R0/R1 advance by ovs and Cr/Ci by ivs.  All
 * inputs are read before the first output is stored.
 *
 * NOTE(review): name suggests an FFTW complex -> real backward codelet
 * of the "type III" kind, size 7 -- confirm.  This file also contains a
 * second definition named r2cbIII_7 (nested-FMA form); presumably
 * separately compiled variants.
 */
static void r2cbIII_7(float *R0, float *R1, float *Cr, float *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
    DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
    DK(KP1_246979603, +1.246979603717467061050009768008479621264549462);
    DK(KP1_801937735, +1.801937735804838252472204639014890102331838324);
    DK(KP445041867, +0.445041867912628808577805128993589518932711138);
    DK(KP867767478, +0.867767478235116240951536665696717509219981456);
    DK(KP1_949855824, +1.949855824363647214036263365987862434465571601);
    DK(KP1_563662964, +1.563662964936059617416889053348115500464669037);
    INT i;

    for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) {
        /* Combinations of the three Ci inputs. */
        const E im2 = Ci[WS(csi, 2)];
        const E im0 = Ci[0];
        const E im1 = Ci[WS(csi, 1)];
        const E imMixA = FMA(KP1_563662964, im2, KP1_949855824 * im1) + (KP867767478 * im0);
        const E imMixB = FNMS(KP1_949855824, im0, KP1_563662964 * im1) - (KP867767478 * im2);
        const E imMixC = FNMS(KP1_563662964, im0, KP1_949855824 * im2) - (KP867767478 * im1);

        /* Combinations of the four Cr inputs. */
        const E re3 = Cr[WS(csr, 3)];
        const E re0 = Cr[0];
        const E re2 = Cr[WS(csr, 2)];
        const E re1 = Cr[WS(csr, 1)];
        const E reMixA = FMA(KP445041867, re1, KP1_801937735 * re0) + FNMA(KP1_246979603, re2, re3);
        const E reMixB = FMA(KP1_801937735, re2, KP445041867 * re0) + FNMA(KP1_246979603, re1, re3);
        const E reMixC = FMA(KP1_246979603, re0, re3) + FNMA(KP1_801937735, re1, KP445041867 * re2);

        R1[0] = reMixA - imMixA;
        R0[WS(rs, 3)] = -(reMixA + imMixA);
        R0[WS(rs, 2)] = imMixB - reMixB;
        R1[WS(rs, 1)] = reMixB + imMixB;
        R1[WS(rs, 2)] = imMixC - reMixC;
        R0[WS(rs, 1)] = reMixC + imMixC;
        R0[0] = FMA(KP2_000000000, re2 + re1 + re0, re3);
    }
}
/*
 * e10_8 -- 8-point real-to-real kernel applied to v independent vectors
 * (form in which each output is one constant times a nested FMA/FNMS).
 *
 * Per vector it reads I[0] and I[WS(is, k)] for k = 1..7 and writes O[0]
 * and O[WS(os, k)] for k = 1..7.  Between vectors I advances by ivs and
 * O by ovs.  Every input is read before the first output is stored.
 *
 * NOTE(review): name and constants look like a size-8 REDFT10 (DCT-II)
 * codelet -- confirm.  This file also contains a second definition
 * named e10_8 (plain-multiply form); presumably separately compiled
 * variants.
 */
static void e10_8(const R *I, R *O, stride is, stride os, INT v, INT ivs, INT ovs)
{
    DK(KP668178637, +0.668178637919298919997757686523080761552472251);
    DK(KP1_662939224, +1.662939224605090474157576755235811513477121624);
    DK(KP198912367, +0.198912367379658006911597622644676228597850501);
    DK(KP1_961570560, +1.961570560806460898252364472268478073947867462);
    DK(KP707106781, +0.707106781186547524400844362104849039284835938);
    DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
    DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
    DK(KP414213562, +0.414213562373095048801688724209698078569671875);
    DK(KP1_847759065, +1.847759065022573512256366378793576573644833252);
    INT i;

    for (i = v; i > 0; i = i - 1, I = I + ivs, O = O + ovs, MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) {
        /* Sums and differences of the mirrored input pairs (k, 7 - k). */
        const E in0 = I[0];
        const E in7 = I[WS(is, 7)];
        const E in4 = I[WS(is, 4)];
        const E in3 = I[WS(is, 3)];
        const E in2 = I[WS(is, 2)];
        const E sum07 = in0 + in7;
        const E dif07 = in0 - in7;
        const E sum43 = in4 + in3;
        const E dif43 = in4 - in3;
        const E in5 = I[WS(is, 5)];
        const E in1 = I[WS(is, 1)];
        const E in6 = I[WS(is, 6)];
        const E delta0 = sum07 - sum43;
        const E total0 = sum07 + sum43;
        const E sum25 = in2 + in5;
        const E dif25 = in2 - in5;
        const E sum16 = in1 + in6;
        const E dif16 = in1 - in6;

        /* Second-level combinations. */
        const E total1 = sum25 + sum16;
        const E delta1 = sum25 - sum16;
        const E difSum = dif25 + dif16;
        const E difDif = dif25 - dif16;
        const E g = FNMS(KP707106781, difDif, dif43);
        const E h = FMA(KP707106781, difDif, dif43);
        const E b = FNMS(KP707106781, difSum, dif07);
        const E a = FMA(KP707106781, difSum, dif07);

        O[WS(os, 6)] = KP1_847759065 * (FMA(KP414213562, delta0, delta1));
        O[WS(os, 2)] = KP1_847759065 * (FNMS(KP414213562, delta1, delta0));
        O[0] = KP2_000000000 * (total0 + total1);
        O[WS(os, 4)] = KP1_414213562 * (total0 - total1);
        O[WS(os, 7)] = KP1_961570560 * (FMA(KP198912367, a, h));
        O[WS(os, 1)] = KP1_961570560 * (FNMS(KP198912367, h, a));
        O[WS(os, 5)] = -(KP1_662939224 * (FNMS(KP668178637, b, g)));
        O[WS(os, 3)] = KP1_662939224 * (FMA(KP668178637, g, b));
    }
}
/*
 * t1buv_9 -- vectorised in-place 9-point twiddle kernel.
 *
 * Works only through x = ii; the ri argument is not referenced.  m runs
 * from mb while m < me in steps of VL; per iteration x advances by
 * VL * ms and W by TWVL * 16.  W is offset by mb * ((TWVL / VL) * 16)
 * before the first iteration.  VLEAVE() is invoked once after the loop.
 *
 * Per iteration: nine vectors are loaded from x[0], x[WS(rs, 1..8)].
 * Element k (k = 1..8) is passed through BYTW with the twiddle at
 * W[TWVL * 2 * (k - 1)]; element 0 is used as loaded.  The twiddled
 * values are grouped as (3,6), (5,8) and (4,7) sums/differences and run
 * through a chain of VFMA / VFNMS steps using the constants declared at
 * the top.  Nine vectors are stored back to x[0], x[WS(rs, 1..8)]; each
 * store other than x[0] is a VFMAI / VFNMSI pair on shared operands.
 * Every LD is issued before the first ST.
 *
 * NOTE(review): name suggests an FFTW backward SIMD twiddle codelet of
 * radix 9 -- confirm against the generator.
 */
static void t1buv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) { DVK(KP939692620, +0.939692620785908384054109277324731469936208134); DVK(KP907603734, +0.907603734547952313649323976213898122064543220); DVK(KP666666666, +0.666666666666666666666666666666666666666666667); DVK(KP852868531, +0.852868531952443209628250963940074071936020296); DVK(KP879385241, +0.879385241571816768108218554649462939872416269); DVK(KP984807753, +0.984807753012208059366743024589523013670643252); DVK(KP826351822, +0.826351822333069651148283373230685203999624323); DVK(KP347296355, +0.347296355333860697703433253538629592000751354); DVK(KP898197570, +0.898197570222573798468955502359086394667167570); DVK(KP673648177, +0.673648177666930348851716626769314796000375677); DVK(KP420276625, +0.420276625461206169731530603237061658838781920); DVK(KP866025403, +0.866025403784438646763723170752936183471402627); DVK(KP586256827, +0.586256827714544512072145703099641959914944179); DVK(KP968908795, +0.968908795874236621082202410917456709164223497); DVK(KP726681596, +0.726681596905677465811651808188092531873167623); DVK(KP439692620, +0.439692620785908384054109277324731469936208134); DVK(KP203604859, +0.203604859554852403062088995281827210665664861); DVK(KP152703644, +0.152703644666139302296566746461370407999248646); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); { INT m; R *x; x = ii; for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) { V T1, T3, T5, T9, Tn, Tb, Td, Th, Tj, Tx, T6; T1 = LD(&(x[0]), ms, &(x[0])); { V T2, T4, T8, Tm; T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)])); T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0])); T8 = LD(&(x[WS(rs, 2)]), ms, &(x[0])); Tm = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)])); { V Ta, Tc, Tg, Ti; Ta = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)])); Tc = LD(&(x[WS(rs, 8)]), ms, &(x[0])); Tg = LD(&(x[WS(rs, 4)]), ms, &(x[0])); Ti = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)])); 
T3 = BYTW(&(W[TWVL * 4]), T2); T5 = BYTW(&(W[TWVL * 10]), T4); T9 = BYTW(&(W[TWVL * 2]), T8); Tn = BYTW(&(W[0]), Tm); Tb = BYTW(&(W[TWVL * 8]), Ta); Td = BYTW(&(W[TWVL * 14]), Tc); Th = BYTW(&(W[TWVL * 6]), Tg); Tj = BYTW(&(W[TWVL * 12]), Ti); } } Tx = VSUB(T3, T5); T6 = VADD(T3, T5); { V Tl, Te, Tk, To, T7, TN; Tl = VSUB(Td, Tb); Te = VADD(Tb, Td); Tk = VSUB(Th, Tj); To = VADD(Th, Tj); T7 = VFNMS(LDK(KP500000000), T6, T1); TN = VADD(T1, T6); { V Tf, TP, Tp, TO; Tf = VFNMS(LDK(KP500000000), Te, T9); TP = VADD(T9, Te); Tp = VFNMS(LDK(KP500000000), To, Tn); TO = VADD(Tn, To); { V Tz, TC, Tu, TD, TA, Tq, TQ, TS; Tz = VFNMS(LDK(KP152703644), Tl, Tf); TC = VFMA(LDK(KP203604859), Tf, Tl); Tu = VFNMS(LDK(KP439692620), Tk, Tf); TD = VFNMS(LDK(KP726681596), Tk, Tp); TA = VFMA(LDK(KP968908795), Tp, Tk); Tq = VFNMS(LDK(KP586256827), Tp, Tl); TQ = VADD(TO, TP); TS = VMUL(LDK(KP866025403), VSUB(TO, TP)); { V TI, TB, TH, TE, Tr, TR, Tw, Tv; Tv = VFNMS(LDK(KP420276625), Tu, Tl); TI = VFMA(LDK(KP673648177), TA, Tz); TB = VFNMS(LDK(KP673648177), TA, Tz); TH = VFNMS(LDK(KP898197570), TD, TC); TE = VFMA(LDK(KP898197570), TD, TC); Tr = VFNMS(LDK(KP347296355), Tq, Tk); ST(&(x[0]), VADD(TQ, TN), ms, &(x[0])); TR = VFNMS(LDK(KP500000000), TQ, TN); Tw = VFNMS(LDK(KP826351822), Tv, Tp); { V TM, TL, TF, TJ, Ts, Ty, TG, TK, Tt; TM = VMUL(LDK(KP984807753), VFMA(LDK(KP879385241), Tx, TI)); TL = VFMA(LDK(KP852868531), TE, T7); TF = VFNMS(LDK(KP500000000), TE, TB); TJ = VFMA(LDK(KP666666666), TI, TH); Ts = VFNMS(LDK(KP907603734), Tr, Tf); ST(&(x[WS(rs, 6)]), VFNMSI(TS, TR), ms, &(x[0])); ST(&(x[WS(rs, 3)]), VFMAI(TS, TR), ms, &(x[WS(rs, 1)])); Ty = VMUL(LDK(KP984807753), VFNMS(LDK(KP879385241), Tx, Tw)); ST(&(x[WS(rs, 8)]), VFNMSI(TM, TL), ms, &(x[0])); ST(&(x[WS(rs, 1)]), VFMAI(TM, TL), ms, &(x[WS(rs, 1)])); TG = VFMA(LDK(KP852868531), TF, T7); TK = VMUL(LDK(KP866025403), VFNMS(LDK(KP852868531), TJ, Tx)); Tt = VFNMS(LDK(KP939692620), Ts, T7); ST(&(x[WS(rs, 5)]), VFNMSI(TK, TG), ms, &(x[WS(rs, 
1)])); ST(&(x[WS(rs, 4)]), VFMAI(TK, TG), ms, &(x[0])); ST(&(x[WS(rs, 2)]), VFMAI(Ty, Tt), ms, &(x[0])); ST(&(x[WS(rs, 7)]), VFNMSI(Ty, Tt), ms, &(x[WS(rs, 1)])); } } } } } } } VLEAVE(); }
/*
 * r2cb_8 -- scalar kernel producing eight reals (R0[0..3], R1[0..3])
 * from Cr[0..4] and Ci[1..3], with every scaling folded into an
 * FMA / FNMS.
 *
 * Runs v batches; between batches R0/R1 advance by ovs and Cr/Ci by
 * ivs.  Ci[0] and Ci[WS(csi, 4)] are not read.  All inputs are read
 * before the first output is stored.
 *
 * NOTE(review): name suggests an FFTW complex -> real backward codelet
 * of size 8 -- confirm.  This file also contains a second definition
 * named r2cb_8 (plain-multiply form); presumably separately compiled
 * variants.
 */
static void r2cb_8(float *R0, float *R1, float *Cr, float *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
    DK(KP1_414213562, +1.414213562373095048801688724209698078569671875);
    DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
    INT i;

    for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) {
        const E re2 = Cr[WS(csr, 2)];
        const E im2 = Ci[WS(csi, 2)];
        const E re0 = Cr[0];
        const E re4 = Cr[WS(csr, 4)];
        const E re1 = Cr[WS(csr, 1)];
        const E re3 = Cr[WS(csr, 3)];
        const E im1 = Ci[WS(csi, 1)];
        const E reDif04 = re0 - re4;
        const E reSum04 = re0 + re4;
        const E reDif13 = re1 - re3;
        const E reSum13 = re1 + re3;
        const E im3 = Ci[WS(csi, 3)];

        /* Bases for the R0 outputs. */
        const E evenLo = FNMS(KP2_000000000, re2, reSum04);
        const E evenHi = FMA(KP2_000000000, re2, reSum04);

        /* Bases for the R1 outputs. */
        const E oddHi = FMA(KP2_000000000, im2, reDif04);
        const E oddLo = FNMS(KP2_000000000, im2, reDif04);

        const E imDif13 = im1 - im3;
        const E imSum13 = im1 + im3;
        const E mixLo = reDif13 - imSum13;
        const E mixHi = reDif13 + imSum13;

        R0[0] = FMA(KP2_000000000, reSum13, evenHi);
        R0[WS(rs, 2)] = FNMS(KP2_000000000, reSum13, evenHi);
        R0[WS(rs, 3)] = FMA(KP2_000000000, imDif13, evenLo);
        R0[WS(rs, 1)] = FNMS(KP2_000000000, imDif13, evenLo);
        R1[0] = FMA(KP1_414213562, mixLo, oddLo);
        R1[WS(rs, 2)] = FNMS(KP1_414213562, mixLo, oddLo);
        R1[WS(rs, 3)] = FMA(KP1_414213562, mixHi, oddHi);
        R1[WS(rs, 1)] = FNMS(KP1_414213562, mixHi, oddHi);
    }
}
/*
 * hf_4: in-place 4-point kernel with twiddle factors.
 *
 * For each m in [mb, me) the routine reads cr[] and ci[] at offsets 0 and
 * WS(rs, 1..3), rotates elements 1..3 by the pairs (W[0], W[1]),
 * (W[2], W[3]) and (W[4], W[5]), and writes the eight combined results back
 * into cr[] and ci[].  Between iterations cr moves forward by ms, ci moves
 * backward by ms and W advances by 6; W is pre-offset by (mb - 1) * 6.
 *
 * Every input is loaded before the first output is stored.
 */
static void hf_4(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     for (m = mb, W = W + ((mb - 1) * 6); m < me; ++m, cr += ms, ci -= ms, W += 6, MAKE_VOLATILE_STRIDE(8, rs)) {
          /* Element 0 is used as-is. */
          E re0 = cr[0];
          E im0 = ci[0];

          /* Elements 1..3 and their twiddle pairs. */
          E re1 = cr[WS(rs, 1)];
          E im1 = ci[WS(rs, 1)];
          E w1a = W[0];
          E w1b = W[1];
          E re2 = cr[WS(rs, 2)];
          E im2 = ci[WS(rs, 2)];
          E w2a = W[2];
          E w2b = W[3];
          E re3 = cr[WS(rs, 3)];
          E im3 = ci[WS(rs, 3)];
          E w3a = W[4];
          E w3b = W[5];

          /* Twiddled elements. */
          E tr2 = FMA(w2a, re2, w2b * im2);
          E ti2 = FNMS(w2b, re2, w2a * im2);
          E tr1 = FMA(w1a, re1, w1b * im1);
          E ti1 = FNMS(w1b, re1, w1a * im1);
          E tr3 = FMA(w3a, re3, w3b * im3);
          E ti3 = FNMS(w3b, re3, w3a * im3);

          /* Butterfly terms. */
          E re_even_sum = re0 + tr2;
          E re_odd_sum = tr1 + tr3;
          E re_even_dif = re0 - tr2;
          E im_odd_dif = ti1 - ti3;
          E im_odd_sum = ti1 + ti3;
          E im_even_sum = ti2 + im0;
          E re_odd_dif = tr3 - tr1;
          E im_even_dif = im0 - ti2;

          ci[WS(rs, 1)] = re_even_sum - re_odd_sum;
          cr[0] = re_even_sum + re_odd_sum;
          ci[0] = re_even_dif - im_odd_dif;
          cr[WS(rs, 1)] = re_even_dif + im_odd_dif;
          cr[WS(rs, 2)] = im_odd_sum - im_even_sum;
          ci[WS(rs, 3)] = im_odd_sum + im_even_sum;
          cr[WS(rs, 3)] = re_odd_dif - im_even_dif;
          ci[WS(rs, 2)] = re_odd_dif + im_even_dif;
     }
}
/*
 * hc2cbdft2_8: in-place kernel over four strided arrays with twiddle factors.
 *
 * For each m in [mb, me) it reads Rp, Ip, Rm and Im at offsets 0 and
 * WS(rs, 1..3) (16 values in total), combines them using the constant
 * KP707106781 and the 14 twiddle values W[0..13], and writes 16 results back
 * to the same locations.  Between iterations Rp/Ip move forward by ms,
 * Rm/Im move backward by ms and W advances by 14; W is pre-offset by
 * (mb - 1) * 14 before the first iteration.
 *
 * All 16 loads are performed before the first store.  This variant applies
 * KP707106781 with explicit multiplications and uses FMA/FNMS only for the
 * twiddle rotations.
 *
 * NOTE(review): the name suggests a size-8 half-complex backward DFT step in
 * the FFTW codelet style -- confirm against the caller.  A second definition
 * with this same name appears later in this file.
 */
static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) { DK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(rs)) { E T7, T1d, T1h, Tl, TG, T14, T19, TO, Te, TL, T18, T15, TB, T1e, Tw; E T1i; { E T3, TC, Tk, TM, T6, Th, TF, TN; { E T1, T2, Ti, Tj; T1 = Rp[0]; T2 = Rm[WS(rs, 3)]; T3 = T1 + T2; TC = T1 - T2; Ti = Ip[0]; Tj = Im[WS(rs, 3)]; Tk = Ti + Tj; TM = Ti - Tj; } { E T4, T5, TD, TE; T4 = Rp[WS(rs, 2)]; T5 = Rm[WS(rs, 1)]; T6 = T4 + T5; Th = T4 - T5; TD = Ip[WS(rs, 2)]; TE = Im[WS(rs, 1)]; TF = TD + TE; TN = TD - TE; } T7 = T3 + T6; T1d = Tk - Th; T1h = TC + TF; Tl = Th + Tk; TG = TC - TF; T14 = T3 - T6; T19 = TM - TN; TO = TM + TN; } { E Ta, Tm, Tp, TJ, Td, Tr, Tu, TK; { E T8, T9, Tn, To; T8 = Rp[WS(rs, 1)]; T9 = Rm[WS(rs, 2)]; Ta = T8 + T9; Tm = T8 - T9; Tn = Ip[WS(rs, 1)]; To = Im[WS(rs, 2)]; Tp = Tn + To; TJ = Tn - To; } { E Tb, Tc, Ts, Tt; Tb = Rm[0]; Tc = Rp[WS(rs, 3)]; Td = Tb + Tc; Tr = Tb - Tc; Ts = Im[0]; Tt = Ip[WS(rs, 3)]; Tu = Ts + Tt; TK = Tt - Ts; } Te = Ta + Td; TL = TJ + TK; T18 = Ta - Td; T15 = TK - TJ; { E Tz, TA, Tq, Tv; Tz = Tm - Tp; TA = Tr - Tu; TB = KP707106781 * (Tz + TA); T1e = KP707106781 * (Tz - TA); Tq = Tm + Tp; Tv = Tr + Tu; Tw = KP707106781 * (Tq - Tv); T1i = KP707106781 * (Tq + Tv); } } { E Tf, TP, TI, TQ; Tf = T7 + Te; TP = TL + TO; { E Tx, TH, Tg, Ty; Tx = Tl + Tw; TH = TB + TG; Tg = W[0]; Ty = W[1]; TI = FMA(Tg, Tx, Ty * TH); TQ = FNMS(Ty, Tx, Tg * TH); } Rp[0] = Tf - TI; Ip[0] = TP + TQ; Rm[0] = Tf + TI; Im[0] = TQ - TP; } { E T1r, T1x, T1w, T1y; { E T1o, T1q, T1n, T1p; T1o = T14 - T15; T1q = T19 - T18; T1n = W[10]; T1p = W[11]; T1r = FNMS(T1p, T1q, T1n * T1o); T1x = FMA(T1p, T1o, T1n * T1q); } { E T1t, T1v, T1s, T1u; T1t = T1d - T1e; T1v = T1i + T1h; T1s = W[12]; T1u = W[13]; T1w = FMA(T1s, T1t, T1u * T1v); T1y = 
FNMS(T1u, T1t, T1s * T1v); } Rp[WS(rs, 3)] = T1r - T1w; Ip[WS(rs, 3)] = T1x + T1y; Rm[WS(rs, 3)] = T1r + T1w; Im[WS(rs, 3)] = T1y - T1x; } { E TV, T11, T10, T12; { E TS, TU, TR, TT; TS = T7 - Te; TU = TO - TL; TR = W[6]; TT = W[7]; TV = FNMS(TT, TU, TR * TS); T11 = FMA(TT, TS, TR * TU); } { E TX, TZ, TW, TY; TX = Tl - Tw; TZ = TG - TB; TW = W[8]; TY = W[9]; T10 = FMA(TW, TX, TY * TZ); T12 = FNMS(TY, TX, TW * TZ); } Rp[WS(rs, 2)] = TV - T10; Ip[WS(rs, 2)] = T11 + T12; Rm[WS(rs, 2)] = TV + T10; Im[WS(rs, 2)] = T12 - T11; } { E T1b, T1l, T1k, T1m; { E T16, T1a, T13, T17; T16 = T14 + T15; T1a = T18 + T19; T13 = W[2]; T17 = W[3]; T1b = FNMS(T17, T1a, T13 * T16); T1l = FMA(T17, T16, T13 * T1a); } { E T1f, T1j, T1c, T1g; T1f = T1d + T1e; T1j = T1h - T1i; T1c = W[4]; T1g = W[5]; T1k = FMA(T1c, T1f, T1g * T1j); T1m = FNMS(T1g, T1f, T1c * T1j); } Rp[WS(rs, 1)] = T1b - T1k; Ip[WS(rs, 1)] = T1l + T1m; Rm[WS(rs, 1)] = T1b + T1k; Im[WS(rs, 1)] = T1m - T1l; } } }
/*
 * r2cb_6: 6-point kernel applied v times.
 *
 * Each pass reads Cr[0], Cr[WS(csr, 1..3)] and Ci[WS(csi, 1..2)], and writes
 * three values to R0 and three to R1 at offsets 0 and WS(rs, 1..2).  After
 * each pass R0/R1 advance by ovs and Cr/Ci advance by ivs.
 *
 * Every input is loaded before the first output is stored.
 *
 * NOTE(review): the name suggests the size-6 half-complex -> real (backward)
 * transform; confirm against the caller.
 */
static void r2cb_6(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
     DK(KP1_732050807, +1.732050807568877293527446341505872366942805254);
     {
          INT i;

          for (i = v; i > 0; --i, R0 += ovs, R1 += ovs, Cr += ivs, Ci += ivs, MAKE_VOLATILE_STRIDE(24, rs), MAKE_VOLATILE_STRIDE(24, csr), MAKE_VOLATILE_STRIDE(24, csi)) {
               /* Inputs. */
               E re0 = Cr[0];
               E re3 = Cr[WS(csr, 3)];
               E im2 = Ci[WS(csi, 2)];
               E im1 = Ci[WS(csi, 1)];
               E re2 = Cr[WS(csr, 2)];
               E re1 = Cr[WS(csr, 1)];

               /* First and last real bins. */
               E edge_dif = re0 - re3;
               E edge_sum = re0 + re3;

               /* Imaginary contributions, scaled by sqrt(3). */
               E im_dif = KP1_732050807 * (im2 - im1);
               E im_sum = KP1_732050807 * (im2 + im1);

               /* Middle real bins. */
               E mid_dif = re2 - re1;
               E mid_sum = re2 + re1;

               E even_base = edge_sum - mid_sum;
               E odd_base = edge_dif - mid_dif;

               R1[WS(rs, 1)] = FMA(KP2_000000000, mid_dif, edge_dif);
               R0[0] = FMA(KP2_000000000, mid_sum, edge_sum);
               R0[WS(rs, 2)] = even_base - im_dif;
               R0[WS(rs, 1)] = even_base + im_dif;
               R1[0] = odd_base - im_sum;
               R1[WS(rs, 2)] = odd_base + im_sum;
          }
     }
}
/*
 * hc2cbdft2_8: in-place kernel over four strided arrays with twiddle factors
 * (FMA-scheduled variant).
 *
 * For each m in [mb, me) it reads Rp, Ip, Rm and Im at offsets 0 and
 * WS(rs, 1..3) (16 values in total), combines them using the constant
 * KP707106781 and the 14 twiddle values W[0..13], and writes 16 results back
 * to the same locations.  Between iterations Rp/Ip move forward by ms,
 * Rm/Im move backward by ms and W advances by 14; W is pre-offset by
 * (mb - 1) * 14 before the first iteration.
 *
 * All 16 loads are performed before the first store.  Here KP707106781 is
 * applied through FMA/FNMS calls, and each twiddle rotation is split into a
 * plain product followed by an FMA/FNMS.
 *
 * NOTE(review): the name suggests a size-8 half-complex backward DFT step in
 * the FFTW codelet style -- confirm against the caller.  An earlier
 * definition with this same name also appears in this file.
 */
static void hc2cbdft2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) { DK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(rs)) { E T1m, T1r, T1i, T1u, T1o, T1v, T1n, T1w, T1s; { E T1k, Tl, T1p, TE, TP, T1g, TM, T1b, T1f, T1a, TU, Tf, T1l, TH, Tw; E T1q; { E TA, T3, TN, Tk, Th, T6, TO, TD, Tb, Tm, Ta, TK, Tp, Tc, Ts; E Tt; { E T4, T5, TB, TC; { E T1, T2, Ti, Tj; T1 = Rp[0]; T2 = Rm[WS(rs, 3)]; Ti = Ip[0]; Tj = Im[WS(rs, 3)]; T4 = Rp[WS(rs, 2)]; TA = T1 - T2; T3 = T1 + T2; TN = Ti - Tj; Tk = Ti + Tj; T5 = Rm[WS(rs, 1)]; TB = Ip[WS(rs, 2)]; TC = Im[WS(rs, 1)]; } { E T8, T9, Tn, To; T8 = Rp[WS(rs, 1)]; Th = T4 - T5; T6 = T4 + T5; TO = TB - TC; TD = TB + TC; T9 = Rm[WS(rs, 2)]; Tn = Ip[WS(rs, 1)]; To = Im[WS(rs, 2)]; Tb = Rm[0]; Tm = T8 - T9; Ta = T8 + T9; TK = Tn - To; Tp = Tn + To; Tc = Rp[WS(rs, 3)]; Ts = Im[0]; Tt = Ip[WS(rs, 3)]; } } { E Tr, Td, Tu, TL, Te, T7; T1k = Tk - Th; Tl = Th + Tk; Tr = Tb - Tc; Td = Tb + Tc; TL = Tt - Ts; Tu = Ts + Tt; T1p = TA + TD; TE = TA - TD; TP = TN + TO; T1g = TN - TO; TM = TK + TL; T1b = TL - TK; T1f = Ta - Td; Te = Ta + Td; T1a = T3 - T6; T7 = T3 + T6; { E Tq, TF, TG, Tv; Tq = Tm + Tp; TF = Tm - Tp; TG = Tr - Tu; Tv = Tr + Tu; TU = T7 - Te; Tf = T7 + Te; T1l = TF - TG; TH = TF + TG; Tw = Tq - Tv; T1q = Tq + Tv; } } } { E TX, T10, T1c, T13, T1h, T1E, T1H, T1C, T1K, T1G, T1L, T1F; { E TQ, Tx, T1y, TI, Tg, Tz; TX = TP - TM; TQ = TM + TP; Tx = FMA(KP707106781, Tw, Tl); T10 = FNMS(KP707106781, Tw, Tl); T1c = T1a + T1b; T1y = T1a - T1b; T13 = FNMS(KP707106781, TH, TE); TI = FMA(KP707106781, TH, TE); Tg = W[0]; Tz = W[1]; { E T1B, T1A, T1x, T1J, T1z, T1D; { E TR, Ty, TS, TJ; T1B = T1g - T1f; T1h = T1f + T1g; T1A = W[11]; TR = Tg * TI; Ty = Tg * Tx; T1x = W[10]; T1J = T1A * T1y; TS = FNMS(Tz, Tx, TR); TJ = FMA(Tz, TI, Ty); T1z = T1x * T1y; T1m 
= FMA(KP707106781, T1l, T1k); T1E = FNMS(KP707106781, T1l, T1k); Im[0] = TS - TQ; Ip[0] = TQ + TS; Rm[0] = Tf + TJ; Rp[0] = Tf - TJ; T1H = FMA(KP707106781, T1q, T1p); T1r = FNMS(KP707106781, T1q, T1p); T1D = W[12]; } T1C = FNMS(T1A, T1B, T1z); T1K = FMA(T1x, T1B, T1J); T1G = W[13]; T1L = T1D * T1H; T1F = T1D * T1E; } } { E TY, T16, T12, T17, T11; { E TW, TT, T15, TV, TZ, T1M, T1I; TW = W[7]; T1M = FNMS(T1G, T1E, T1L); T1I = FMA(T1G, T1H, T1F); TT = W[6]; T15 = TW * TU; Im[WS(rs, 3)] = T1M - T1K; Ip[WS(rs, 3)] = T1K + T1M; Rm[WS(rs, 3)] = T1C + T1I; Rp[WS(rs, 3)] = T1C - T1I; TV = TT * TU; TZ = W[8]; TY = FNMS(TW, TX, TV); T16 = FMA(TT, TX, T15); T12 = W[9]; T17 = TZ * T13; T11 = TZ * T10; } { E T1e, T19, T1t, T1d, T1j, T18, T14; T1e = W[3]; T18 = FNMS(T12, T10, T17); T14 = FMA(T12, T13, T11); T19 = W[2]; T1t = T1e * T1c; Im[WS(rs, 2)] = T18 - T16; Ip[WS(rs, 2)] = T16 + T18; Rm[WS(rs, 2)] = TY + T14; Rp[WS(rs, 2)] = TY - T14; T1d = T19 * T1c; T1j = W[4]; T1i = FNMS(T1e, T1h, T1d); T1u = FMA(T19, T1h, T1t); T1o = W[5]; T1v = T1j * T1r; T1n = T1j * T1m; } } } } T1w = FNMS(T1o, T1m, T1v); T1s = FMA(T1o, T1r, T1n); Im[WS(rs, 1)] = T1w - T1u; Ip[WS(rs, 1)] = T1u + T1w; Rm[WS(rs, 1)] = T1i + T1s; Rp[WS(rs, 1)] = T1i - T1s; } }
static const R *t2fv_8(R *ri, R *ii, const R *W, stride ios, INT m, INT dist) { DVK(KP707106781, +0.707106781186547524400844362104849039284835938); INT i; R *x; x = ri; for (i = m; i > 0; i = i - VL, x = x + (VL * dist), W = W + (TWVL * 14), MAKE_VOLATILE_STRIDE(ios)) { V T1, T2, Th, Tj, T5, T7, Ta, Tc; T1 = LD(&(x[0]), dist, &(x[0])); T2 = LD(&(x[WS(ios, 4)]), dist, &(x[0])); Th = LD(&(x[WS(ios, 2)]), dist, &(x[0])); Tj = LD(&(x[WS(ios, 6)]), dist, &(x[0])); T5 = LD(&(x[WS(ios, 1)]), dist, &(x[WS(ios, 1)])); T7 = LD(&(x[WS(ios, 5)]), dist, &(x[WS(ios, 1)])); Ta = LD(&(x[WS(ios, 7)]), dist, &(x[WS(ios, 1)])); Tc = LD(&(x[WS(ios, 3)]), dist, &(x[WS(ios, 1)])); { V T3, Ti, Tk, T6, T8, Tb, Td; T3 = BYTWJ(&(W[TWVL * 6]), T2); Ti = BYTWJ(&(W[TWVL * 2]), Th); Tk = BYTWJ(&(W[TWVL * 10]), Tj); T6 = BYTWJ(&(W[0]), T5); T8 = BYTWJ(&(W[TWVL * 8]), T7); Tb = BYTWJ(&(W[TWVL * 12]), Ta); Td = BYTWJ(&(W[TWVL * 4]), Tc); { V Tq, T4, Tr, Tl, Tt, T9, Tu, Te, Tw, Ts; Tq = VADD(T1, T3); T4 = VSUB(T1, T3); Tr = VADD(Ti, Tk); Tl = VSUB(Ti, Tk); Tt = VADD(T6, T8); T9 = VSUB(T6, T8); Tu = VADD(Tb, Td); Te = VSUB(Tb, Td); Tw = VSUB(Tq, Tr); Ts = VADD(Tq, Tr); { V Tx, Tv, Tm, Tf; Tx = VSUB(Tu, Tt); Tv = VADD(Tt, Tu); Tm = VSUB(Te, T9); Tf = VADD(T9, Te); { V Tp, Tn, To, Tg; ST(&(x[WS(ios, 2)]), VFMAI(Tx, Tw), dist, &(x[0])); ST(&(x[WS(ios, 6)]), VFNMSI(Tx, Tw), dist, &(x[0])); ST(&(x[0]), VADD(Ts, Tv), dist, &(x[0])); ST(&(x[WS(ios, 4)]), VSUB(Ts, Tv), dist, &(x[0])); Tp = VFMA(LDK(KP707106781), Tm, Tl); Tn = VFNMS(LDK(KP707106781), Tm, Tl); To = VFNMS(LDK(KP707106781), Tf, T4); Tg = VFMA(LDK(KP707106781), Tf, T4); ST(&(x[WS(ios, 5)]), VFNMSI(Tp, To), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 3)]), VFMAI(Tp, To), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 7)]), VFMAI(Tn, Tg), dist, &(x[WS(ios, 1)])); ST(&(x[WS(ios, 1)]), VFNMSI(Tn, Tg), dist, &(x[WS(ios, 1)])); } } } } } return W; }
/*
 * t1bv_3: in-place 3-point SIMD kernel with twiddle factors.
 *
 * Works on the array starting at ii (the ri argument is not referenced).
 * For m stepping from mb to me in increments of VL, it loads x[0],
 * x[WS(rs, 1)] and x[WS(rs, 2)], multiplies the last two through BYTW by the
 * twiddles at W[0] and W[TWVL * 2], and stores the three combined results
 * back to the same locations.  x advances by VL * ms and W by TWVL * 4 per
 * iteration; W is pre-offset by mb * ((TWVL / VL) * 4).
 *
 * VLEAVE() is invoked once after the loop.
 */
static void t1bv_3(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP500000000, +0.500000000000000000000000000000000000000000000);
     DVK(KP866025403, +0.866025403784438646763723170752936183471402627);
     {
          INT m;
          R *x = ii;

          for (m = mb, W = W + (mb * ((TWVL / VL) * 4)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 4), MAKE_VOLATILE_STRIDE(3, rs)) {
               V first = LD(&(x[0]), ms, &(x[0]));
               V raw1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
               V second = BYTW(&(W[0]), raw1);
               V raw2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
               V third = BYTW(&(W[TWVL * 2]), raw2);
               V tail_sum = VADD(second, third);
               V rotated;
               V centre;

               /* Output 0 is stored before the remaining terms are formed. */
               ST(&(x[0]), VADD(first, tail_sum), ms, &(x[0]));

               rotated = VBYI(VMUL(LDK(KP866025403), VSUB(second, third)));
               centre = VFNMS(LDK(KP500000000), tail_sum, first);

               ST(&(x[WS(rs, 1)]), VADD(rotated, centre), ms, &(x[WS(rs, 1)]));
               ST(&(x[WS(rs, 2)]), VSUB(centre, rotated), ms, &(x[0]));
          }
     }
     VLEAVE();
}
static void hc2cfdft2_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) { DK(KP500000000, +0.500000000000000000000000000000000000000000000); { INT m; for (m = mb, W = W + ((mb - 1) * 4); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 4, MAKE_VOLATILE_STRIDE(rs)) { E T1, T5, T2, T4; T1 = W[0]; T5 = W[3]; T2 = W[2]; T4 = W[1]; { E Tc, T6, Tp, Tj, Tw, Tt, T9, TE, To, TC, Ta, Tr, Tf, Tl, Tm; { E Th, Tb, T3, Ti; Th = Ip[0]; Tb = T1 * T5; T3 = T1 * T2; Ti = Im[0]; Tl = Rm[0]; Tc = FNMS(T4, T2, Tb); T6 = FMA(T4, T5, T3); Tp = Th + Ti; Tj = Th - Ti; Tm = Rp[0]; } { E T7, T8, Td, Tn, Te; T7 = Ip[WS(rs, 1)]; T8 = Im[WS(rs, 1)]; Td = Rp[WS(rs, 1)]; Tw = Tm + Tl; Tn = Tl - Tm; Tt = T7 + T8; T9 = T7 - T8; Te = Rm[WS(rs, 1)]; TE = T4 * Tn; To = T1 * Tn; TC = T2 * Tt; Ta = T6 * T9; Tr = Td - Te; Tf = Td + Te; } { E Tq, Tk, TB, Ty, Tu, TI, TG, TF; Tq = FNMS(T4, Tp, To); TF = FMA(T1, Tp, TE); { E Tg, Tx, TD, Ts; Tg = FNMS(Tc, Tf, Ta); Tx = T6 * Tf; TD = FNMS(T5, Tr, TC); Ts = T2 * Tr; Tk = Tg + Tj; TB = Tj - Tg; Ty = FMA(Tc, T9, Tx); Tu = FMA(T5, Tt, Ts); TI = TD + TF; TG = TD - TF; } { E Tz, TH, Tv, TA; Tz = Tw - Ty; TH = Tw + Ty; Tv = Tq - Tu; TA = Tu + Tq; Rp[0] = KP500000000 * (TH + TI); Rm[WS(rs, 1)] = KP500000000 * (TH - TI); Rm[0] = KP500000000 * (Tz - TA); Im[WS(rs, 1)] = KP500000000 * (Tv - Tk); Ip[0] = KP500000000 * (Tk + Tv); Im[0] = KP500000000 * (TG - TB); Rp[WS(rs, 1)] = KP500000000 * (Tz + TA); Ip[WS(rs, 1)] = KP500000000 * (TB + TG); } } } } } }
/*
 * hc2cfdft_2: in-place kernel over four strided arrays with one twiddle pair.
 *
 * For each m in [mb, me) it reads Ip[0], Im[0], Rm[0], Rp[0] and the twiddle
 * pair (W[0], W[1]), and overwrites those four array elements with combined
 * values scaled by KP500000000.  Between iterations Rp/Ip move forward by
 * ms, Rm/Im move backward by ms and W advances by 2; W is pre-offset by
 * (mb - 1) * 2.  The rs argument is only passed to MAKE_VOLATILE_STRIDE.
 *
 * This variant forms the two plain products first and then feeds them to
 * FMA/FNMS.
 */
static void hc2cfdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          for (m = mb, W = W + ((mb - 1) * 2); m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
               /* Inputs. */
               E ip = Ip[0];
               E im = Im[0];
               E rm = Rm[0];
               E rp = Rp[0];
               E w_im = W[1];
               E w_re = W[0];

               /* Sums and differences of the mirrored inputs. */
               E imag_sum = ip + im;
               E imag_dif = ip - im;
               E real_sum = rp + rm;
               E real_dif = rm - rp;

               /* Twiddle rotation of (real_dif, imag_sum). */
               E prod_im = w_im * real_dif;
               E prod_re = w_re * real_dif;
               E rot_real = FMA(w_re, imag_sum, prod_im);
               E rot_imag = FNMS(w_im, imag_sum, prod_re);

               Rp[0] = KP500000000 * (real_sum + rot_real);
               Rm[0] = KP500000000 * (real_sum - rot_real);
               Im[0] = KP500000000 * (rot_imag - imag_dif);
               Ip[0] = KP500000000 * (imag_dif + rot_imag);
          }
     }
}
/*
 * r2cf_3: 3-point kernel applied v times.
 *
 * Each pass reads R0[0], R1[0] and R0[WS(rs, 1)], and writes Cr[0],
 * Cr[WS(csr, 1)] and Ci[WS(csi, 1)]:
 *
 *   Cr[0]           = a + (b + c)
 *   Cr[WS(csr, 1)]  = FNMS(KP500000000, b + c, a)
 *   Ci[WS(csi, 1)]  = KP866025403 * (c - b)
 *
 * with a = R0[0], b = R1[0], c = R0[WS(rs, 1)].  After each pass R0/R1
 * advance by ivs and Cr/Ci advance by ovs.
 *
 * NOTE(review): the name suggests the size-3 real -> half-complex (forward)
 * transform; confirm against the caller.
 */
static void r2cf_3(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs)
{
     DK(KP866025403, +0.866025403784438646763723170752936183471402627);
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT i;

          for (i = v; i > 0; --i, R0 += ivs, R1 += ivs, Cr += ovs, Ci += ovs, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) {
               E a = R0[0];
               E b = R1[0];
               E c = R0[WS(rs, 1)];
               E tail_sum = b + c;

               Cr[WS(csr, 1)] = FNMS(KP500000000, tail_sum, a);
               Ci[WS(csi, 1)] = KP866025403 * (c - b);
               Cr[0] = a + tail_sum;
          }
     }
}
/*
 * hc2cfdft_2: in-place kernel over four strided arrays with one twiddle pair.
 *
 * For each m in [mb, me) it reads Ip[0], Im[0], Rm[0], Rp[0] and the twiddle
 * pair (W[0], W[1]), and overwrites those four array elements with combined
 * values scaled by KP500000000.  Between iterations Rp/Ip move forward by
 * ms, Rm/Im move backward by ms and W advances by 2; W is pre-offset by
 * (mb - 1) * 2.  The rs argument is only passed to MAKE_VOLATILE_STRIDE.
 *
 * This variant passes the plain products directly as the third argument of
 * FMA/FNMS.
 */
static void hc2cfdft_2(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DK(KP500000000, +0.500000000000000000000000000000000000000000000);
     {
          INT m;

          for (m = mb, W = W + ((mb - 1) * 2); m < me; ++m, Rp += ms, Ip += ms, Rm -= ms, Im -= ms, W += 2, MAKE_VOLATILE_STRIDE(8, rs)) {
               /* Imaginary-side inputs. */
               E upper_i = Ip[0];
               E lower_i = Im[0];
               E i_minus = upper_i - lower_i;
               E i_plus = upper_i + lower_i;

               /* Real-side inputs. */
               E lower_r = Rm[0];
               E upper_r = Rp[0];
               E r_minus = lower_r - upper_r;
               E r_plus = upper_r + lower_r;

               /* Twiddle rotation of (r_minus, i_plus). */
               E tw_re = W[0];
               E tw_im = W[1];
               E spun_i = FNMS(tw_im, i_plus, tw_re * r_minus);
               E spun_r = FMA(tw_im, r_minus, tw_re * i_plus);

               Ip[0] = KP500000000 * (i_minus + spun_i);
               Rp[0] = KP500000000 * (r_plus + spun_r);
               Im[0] = KP500000000 * (spun_i - i_minus);
               Rm[0] = KP500000000 * (r_plus - spun_r);
          }
     }
}
/*
 * hc2cfdftv_12: in-place SIMD kernel over the Rp and Rm arrays with twiddle
 * factors.
 *
 * For m stepping from mb to me in increments of VL, it loads Rp and Rm at
 * offsets 0 and WS(rs, 1..5) through LD (12 vector loads), loads eleven
 * twiddle vectors through LDW at W[TWVL * 0], W[TWVL * 2], ...,
 * W[TWVL * 20], combines them using the constants KP866025403 and
 * KP500000000, and stores six vectors to Rp and six to Rm at the same
 * offsets.  Values written to Rm are passed through VCONJ first.
 *
 * Ip and Im are advanced in the loop header but never dereferenced in the
 * body.  Per iteration Rp/Ip move forward by VL * ms, Rm/Im move backward by
 * VL * ms and W advances by TWVL * 22; W is pre-offset by
 * (mb - 1) * ((TWVL / VL) * 22).
 *
 * All loads are issued before the first store.
 *
 * NOTE(review): LD/ST receive +ms for Rp accesses and -ms for Rm accesses;
 * their exact semantics are defined by the SIMD header, not visible here.
 */
static void hc2cfdftv_12(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) { DVK(KP866025403, +0.866025403784438646763723170752936183471402627); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); INT m; for (m = mb, W = W + ((mb - 1) * ((TWVL / VL) * 22)); m < me; m = m + VL, Rp = Rp + (VL * ms), Ip = Ip + (VL * ms), Rm = Rm - (VL * ms), Im = Im - (VL * ms), W = W + (TWVL * 22), MAKE_VOLATILE_STRIDE(rs)) { V T3, T7, TH, TE, Th, TC, Tq, T11, TU, Tx, Tb, Tz, Tu, Tw, Tp; V Tl, T9, Ta, T8, Ty, Tn, To, Tm, TG, T1, T2, Tt, T5, T6, T4; V Tv, Tj, Tk, Ti, TD, Tf, Tg, Te, TB, TT, TF, TR, Tr; T1 = LD(&(Rp[0]), ms, &(Rp[0])); T2 = LD(&(Rm[0]), -ms, &(Rm[0])); Tt = LDW(&(W[0])); T5 = LD(&(Rp[WS(rs, 2)]), ms, &(Rp[0])); T6 = LD(&(Rm[WS(rs, 2)]), -ms, &(Rm[0])); T4 = LDW(&(W[TWVL * 6])); Tv = LDW(&(W[TWVL * 8])); Tn = LD(&(Rp[WS(rs, 1)]), ms, &(Rp[WS(rs, 1)])); To = LD(&(Rm[WS(rs, 1)]), -ms, &(Rm[WS(rs, 1)])); T3 = VFMACONJ(T2, T1); Tu = VZMULIJ(Tt, VFNMSCONJ(T2, T1)); Tm = LDW(&(W[TWVL * 2])); TG = LDW(&(W[TWVL * 4])); T7 = VZMULJ(T4, VFMACONJ(T6, T5)); Tw = VZMULIJ(Tv, VFNMSCONJ(T6, T5)); Tj = LD(&(Rp[WS(rs, 5)]), ms, &(Rp[WS(rs, 1)])); Tk = LD(&(Rm[WS(rs, 5)]), -ms, &(Rm[WS(rs, 1)])); Ti = LDW(&(W[TWVL * 18])); TD = LDW(&(W[TWVL * 20])); Tp = VZMULJ(Tm, VFMACONJ(To, Tn)); TH = VZMULIJ(TG, VFNMSCONJ(To, Tn)); Tf = LD(&(Rp[WS(rs, 3)]), ms, &(Rp[WS(rs, 1)])); Tg = LD(&(Rm[WS(rs, 3)]), -ms, &(Rm[WS(rs, 1)])); Te = LDW(&(W[TWVL * 10])); TB = LDW(&(W[TWVL * 12])); Tl = VZMULJ(Ti, VFMACONJ(Tk, Tj)); TE = VZMULIJ(TD, VFNMSCONJ(Tk, Tj)); T9 = LD(&(Rp[WS(rs, 4)]), ms, &(Rp[0])); Ta = LD(&(Rm[WS(rs, 4)]), -ms, &(Rm[0])); T8 = LDW(&(W[TWVL * 14])); Ty = LDW(&(W[TWVL * 16])); Th = VZMULJ(Te, VFMACONJ(Tg, Tf)); TC = VZMULIJ(TB, VFNMSCONJ(Tg, Tf)); Tq = VADD(Tl, Tp); T11 = VSUB(Tp, Tl); TU = VSUB(Tu, Tw); Tx = VADD(Tu, Tw); Tb = VZMULJ(T8, VFMACONJ(Ta, T9)); Tz = VZMULIJ(Ty, VFNMSCONJ(Ta, T9)); TT = VSUB(TC, TE); TF = VADD(TC, TE); TR = 
VFNMS(LDK(KP500000000), Tq, Th); Tr = VADD(Th, Tq); { V TX, TA, T1d, TV, TY, TI, T1e, T12, TQ, Td, T10, Tc, T1a, TN, TJ; V T1j, T1f, T1b, TS, TM, Ts, T17, T13, TZ, T1i, T1c, T16, TW, TP, TO; V TL, TK, T1k, T1l, T1h, T1g, T18, T19, T15, T14; T10 = VSUB(Tb, T7); Tc = VADD(T7, Tb); TX = VFNMS(LDK(KP500000000), Tx, Tz); TA = VADD(Tx, Tz); T1d = VADD(TU, TT); TV = VSUB(TT, TU); TY = VFNMS(LDK(KP500000000), TF, TH); TI = VADD(TF, TH); T1e = VADD(T10, T11); T12 = VSUB(T10, T11); TQ = VFNMS(LDK(KP500000000), Tc, T3); Td = VADD(T3, Tc); T1a = VADD(TX, TY); TZ = VSUB(TX, TY); TN = VADD(TA, TI); TJ = VSUB(TA, TI); T1j = VMUL(LDK(KP866025403), VADD(T1d, T1e)); T1f = VMUL(LDK(KP866025403), VSUB(T1d, T1e)); T1b = VADD(TQ, TR); TS = VSUB(TQ, TR); TM = VADD(Td, Tr); Ts = VSUB(Td, Tr); T17 = VFMA(LDK(KP866025403), T12, TZ); T13 = VFNMS(LDK(KP866025403), T12, TZ); T1i = VSUB(T1b, T1a); T1c = VADD(T1a, T1b); T16 = VFNMS(LDK(KP866025403), TV, TS); TW = VFMA(LDK(KP866025403), TV, TS); TP = VCONJ(VMUL(LDK(KP500000000), VADD(TN, TM))); TO = VMUL(LDK(KP500000000), VSUB(TM, TN)); TL = VCONJ(VMUL(LDK(KP500000000), VFNMSI(TJ, Ts))); TK = VMUL(LDK(KP500000000), VFMAI(TJ, Ts)); T1k = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1j, T1i))); T1l = VMUL(LDK(KP500000000), VFMAI(T1j, T1i)); T1h = VMUL(LDK(KP500000000), VFMAI(T1f, T1c)); T1g = VCONJ(VMUL(LDK(KP500000000), VFNMSI(T1f, T1c))); T18 = VMUL(LDK(KP500000000), VFNMSI(T17, T16)); T19 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T17, T16))); T15 = VCONJ(VMUL(LDK(KP500000000), VFMAI(T13, TW))); T14 = VMUL(LDK(KP500000000), VFNMSI(T13, TW)); ST(&(Rm[WS(rs, 5)]), TP, -ms, &(Rm[WS(rs, 1)])); ST(&(Rp[0]), TO, ms, &(Rp[0])); ST(&(Rm[WS(rs, 2)]), TL, -ms, &(Rm[0])); ST(&(Rp[WS(rs, 3)]), TK, ms, &(Rp[WS(rs, 1)])); ST(&(Rm[WS(rs, 3)]), T1k, -ms, &(Rm[WS(rs, 1)])); ST(&(Rp[WS(rs, 4)]), T1l, ms, &(Rp[0])); ST(&(Rp[WS(rs, 2)]), T1h, ms, &(Rp[0])); ST(&(Rm[WS(rs, 1)]), T1g, -ms, &(Rm[WS(rs, 1)])); ST(&(Rp[WS(rs, 5)]), T18, ms, &(Rp[WS(rs, 1)])); ST(&(Rm[WS(rs, 4)]), 
T19, -ms, &(Rm[0])); ST(&(Rm[0]), T15, -ms, &(Rm[0])); ST(&(Rp[WS(rs, 1)]), T14, ms, &(Rp[WS(rs, 1)])); } } }
/*
 * t1buv_9: in-place 9-point SIMD kernel with twiddle factors.
 *
 * Works on the array starting at ii (the ri argument is not referenced).
 * For m stepping from mb to me in increments of VL, it loads x[0] and
 * x[WS(rs, 1..8)], multiplies elements 1..8 through BYTW by the twiddles at
 * W[TWVL * 0], W[TWVL * 2], ..., W[TWVL * 14] (element 0 is used as loaded),
 * combines them with the trigonometric constants declared at the top, and
 * stores nine results back to the same locations.  x advances by VL * ms and
 * W by TWVL * 16 per iteration; W is pre-offset by mb * ((TWVL / VL) * 16).
 *
 * All loads are issued before the first store.  Outputs 0, 3 and 6 are
 * written first, followed by the pairs (7, 2), (8, 1) and (4, 5).
 * VLEAVE() is invoked once after the loop.
 */
static void t1buv_9(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) { DVK(KP939692620, +0.939692620785908384054109277324731469936208134); DVK(KP296198132, +0.296198132726023843175338011893050938967728390); DVK(KP852868531, +0.852868531952443209628250963940074071936020296); DVK(KP173648177, +0.173648177666930348851716626769314796000375677); DVK(KP556670399, +0.556670399226419366452912952047023132968291906); DVK(KP766044443, +0.766044443118978035202392650555416673935832457); DVK(KP642787609, +0.642787609686539326322643409907263432907559884); DVK(KP663413948, +0.663413948168938396205421319635891297216863310); DVK(KP150383733, +0.150383733180435296639271897612501926072238258); DVK(KP342020143, +0.342020143325668733044099614682259580763083368); DVK(KP813797681, +0.813797681349373692844693217248393223289101568); DVK(KP984807753, +0.984807753012208059366743024589523013670643252); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); DVK(KP866025403, +0.866025403784438646763723170752936183471402627); { INT m; R *x; x = ii; for (m = mb, W = W + (mb * ((TWVL / VL) * 16)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 16), MAKE_VOLATILE_STRIDE(9, rs)) { V T1, T6, Tu, Tg, Tf, TD, Tq, Tp, TE; T1 = LD(&(x[0]), ms, &(x[0])); { V T3, T5, T2, T4; T2 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)])); T3 = BYTW(&(W[TWVL * 4]), T2); T4 = LD(&(x[WS(rs, 6)]), ms, &(x[0])); T5 = BYTW(&(W[TWVL * 10]), T4); T6 = VADD(T3, T5); Tu = VMUL(LDK(KP866025403), VSUB(T3, T5)); } { V T9, Td, Tb, T8, Tc, Ta, Te; T8 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)])); T9 = BYTW(&(W[0]), T8); Tc = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)])); Td = BYTW(&(W[TWVL * 12]), Tc); Ta = LD(&(x[WS(rs, 4)]), ms, &(x[0])); Tb = BYTW(&(W[TWVL * 6]), Ta); Tg = VSUB(Tb, Td); Te = VADD(Tb, Td); Tf = VFNMS(LDK(KP500000000), Te, T9); TD = VADD(T9, Te); } { V Tj, Tn, Tl, Ti, Tm, Tk, To; Ti = LD(&(x[WS(rs, 2)]), ms, &(x[0])); Tj = BYTW(&(W[TWVL * 2]), Ti); Tm = LD(&(x[WS(rs, 8)]), ms, &(x[0])); Tn = 
BYTW(&(W[TWVL * 14]), Tm); Tk = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)])); Tl = BYTW(&(W[TWVL * 8]), Tk); Tq = VSUB(Tl, Tn); To = VADD(Tl, Tn); Tp = VFNMS(LDK(KP500000000), To, Tj); TE = VADD(Tj, To); } { V TF, TG, TH, TI; TF = VBYI(VMUL(LDK(KP866025403), VSUB(TD, TE))); TG = VADD(T1, T6); TH = VADD(TD, TE); TI = VFNMS(LDK(KP500000000), TH, TG); ST(&(x[WS(rs, 3)]), VADD(TF, TI), ms, &(x[WS(rs, 1)])); ST(&(x[0]), VADD(TG, TH), ms, &(x[0])); ST(&(x[WS(rs, 6)]), VSUB(TI, TF), ms, &(x[0])); } { V TC, Tv, Tw, Tx, Th, Tr, Ts, T7, TB; TC = VBYI(VSUB(VFMA(LDK(KP984807753), Tf, VFMA(LDK(KP813797681), Tq, VFNMS(LDK(KP150383733), Tg, VMUL(LDK(KP342020143), Tp)))), Tu)); Tv = VFMA(LDK(KP663413948), Tg, VMUL(LDK(KP642787609), Tf)); Tw = VFMA(LDK(KP150383733), Tq, VMUL(LDK(KP984807753), Tp)); Tx = VADD(Tv, Tw); Th = VFNMS(LDK(KP556670399), Tg, VMUL(LDK(KP766044443), Tf)); Tr = VFNMS(LDK(KP852868531), Tq, VMUL(LDK(KP173648177), Tp)); Ts = VADD(Th, Tr); T7 = VFNMS(LDK(KP500000000), T6, T1); TB = VFMA(LDK(KP852868531), Tg, VFMA(LDK(KP173648177), Tf, VFMA(LDK(KP296198132), Tq, VFNMS(LDK(KP939692620), Tp, T7)))); ST(&(x[WS(rs, 7)]), VSUB(TB, TC), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 2)]), VADD(TB, TC), ms, &(x[0])); { V Tt, Ty, Tz, TA; Tt = VADD(T7, Ts); Ty = VBYI(VADD(Tu, Tx)); ST(&(x[WS(rs, 8)]), VSUB(Tt, Ty), ms, &(x[0])); ST(&(x[WS(rs, 1)]), VADD(Tt, Ty), ms, &(x[WS(rs, 1)])); Tz = VBYI(VADD(Tu, VFNMS(LDK(KP500000000), Tx, VMUL(LDK(KP866025403), VSUB(Th, Tr))))); TA = VFMA(LDK(KP866025403), VSUB(Tw, Tv), VFNMS(LDK(KP500000000), Ts, T7)); ST(&(x[WS(rs, 4)]), VADD(Tz, TA), ms, &(x[0])); ST(&(x[WS(rs, 5)]), VSUB(TA, Tz), ms, &(x[WS(rs, 1)])); } } } } VLEAVE(); }
/*
 * t1_10: in-place 10-point scalar kernel with twiddle factors.
 *
 * Runs m iterations.  Each iteration reads ri[] and ii[] at offsets 0 and
 * WS(ios, 1..9), rotates elements 1..9 by the twiddle pairs
 * (W[2k - 2], W[2k - 1]) for k = 1..9 (element 0 is used as loaded), combines
 * them with the constants KP951056516, KP559016994, KP250000000 and
 * KP618033988, and writes the 20 results back to the same ri[]/ii[]
 * locations.  After each iteration ri and ii advance by dist and W by 18.
 *
 * All loads are performed before the first store.
 *
 * Returns the advanced W pointer.
 */
static const R *t1_10(R *ri, R *ii, const R *W, stride ios, INT m, INT dist) { DK(KP951056516, +0.951056516295153572116439333379382143405698634); DK(KP559016994, +0.559016994374947424102293417182819058860154590); DK(KP250000000, +0.250000000000000000000000000000000000000000000); DK(KP618033988, +0.618033988749894848204586834365638117720309180); INT i; for (i = m; i > 0; i = i - 1, ri = ri + dist, ii = ii + dist, W = W + 18, MAKE_VOLATILE_STRIDE(ios)) { E T1X, T21, T20, T22; { E T23, T1U, T8, T12, T1y, T25, T1P, T1H, T1Y, T18, T10, T2b, T1K, T1O, T15; E T1Z, T2a, Tz, T24, T1n; { E T1, T1T, T3, T6, T2, T5; T1 = ri[0]; T1T = ii[0]; T3 = ri[WS(ios, 5)]; T6 = ii[WS(ios, 5)]; T2 = W[8]; T5 = W[9]; { E T1w, TY, T1s, T1F, TM, T16, T1u, TS; { E TF, T1p, TO, TR, T1r, TL, TN, TQ, T1t, TP; { E TU, TX, TT, TW; { E TB, TE, T1R, T4, TA, TD; TB = ri[WS(ios, 4)]; TE = ii[WS(ios, 4)]; T1R = T2 * T6; T4 = T2 * T3; TA = W[6]; TD = W[7]; { E T1S, T7, T1o, TC; T1S = FNMS(T5, T3, T1R); T7 = FMA(T5, T6, T4); T1o = TA * TE; TC = TA * TB; T23 = T1T - T1S; T1U = T1S + T1T; T8 = T1 - T7; T12 = T1 + T7; TF = FMA(TD, TE, TC); T1p = FNMS(TD, TB, T1o); } } TU = ri[WS(ios, 1)]; TX = ii[WS(ios, 1)]; TT = W[0]; TW = W[1]; { E TH, TK, TJ, T1q, TI, T1v, TV, TG; TH = ri[WS(ios, 9)]; TK = ii[WS(ios, 9)]; T1v = TT * TX; TV = TT * TU; TG = W[16]; TJ = W[17]; T1w = FNMS(TW, TU, T1v); TY = FMA(TW, TX, TV); T1q = TG * TK; TI = TG * TH; TO = ri[WS(ios, 6)]; TR = ii[WS(ios, 6)]; T1r = FNMS(TJ, TH, T1q); TL = FMA(TJ, TK, TI); TN = W[10]; TQ = W[11]; } } T1s = T1p - T1r; T1F = T1p + T1r; TM = TF - TL; T16 = TF + TL; T1t = TN * TR; TP = TN * TO; T1u = FNMS(TQ, TO, T1t); TS = FMA(TQ, TR, TP); } { E T1e, Te, T1l, Tx, Tn, Tq, Tp, T1g, Tk, T1i, To; { E Tt, Tw, Tv, T1k, Tu; { E Ta, Td, T9, Tc, T1d, Tb, Ts; Ta = ri[WS(ios, 2)]; Td = ii[WS(ios, 2)]; { E T1G, T1x, TZ, T17; T1G = T1u + T1w; T1x = T1u - T1w; TZ = TS - TY; T17 = TS + TY; T1y = T1s - T1x; T25 = T1s + T1x; T1P = T1F + T1G; T1H = T1F - T1G; T1Y = T16 - T17; 
T18 = T16 + T17; T10 = TM + TZ; T2b = TM - TZ; T9 = W[2]; } Tc = W[3]; Tt = ri[WS(ios, 3)]; Tw = ii[WS(ios, 3)]; T1d = T9 * Td; Tb = T9 * Ta; Ts = W[4]; Tv = W[5]; T1e = FNMS(Tc, Ta, T1d); Te = FMA(Tc, Td, Tb); T1k = Ts * Tw; Tu = Ts * Tt; } { E Tg, Tj, Tf, Ti, T1f, Th, Tm; Tg = ri[WS(ios, 7)]; Tj = ii[WS(ios, 7)]; T1l = FNMS(Tv, Tt, T1k); Tx = FMA(Tv, Tw, Tu); Tf = W[12]; Ti = W[13]; Tn = ri[WS(ios, 8)]; Tq = ii[WS(ios, 8)]; T1f = Tf * Tj; Th = Tf * Tg; Tm = W[14]; Tp = W[15]; T1g = FNMS(Ti, Tg, T1f); Tk = FMA(Ti, Tj, Th); T1i = Tm * Tq; To = Tm * Tn; } } { E T1h, T1I, Tl, T13, T1j, Tr; T1h = T1e - T1g; T1I = T1e + T1g; Tl = Te - Tk; T13 = Te + Tk; T1j = FNMS(Tp, Tn, T1i); Tr = FMA(Tp, Tq, To); { E T1m, T1J, T14, Ty; T1m = T1j - T1l; T1J = T1j + T1l; T14 = Tr + Tx; Ty = Tr - Tx; T1K = T1I - T1J; T1O = T1I + T1J; T15 = T13 + T14; T1Z = T13 - T14; T2a = Tl - Ty; Tz = Tl + Ty; T24 = T1h + T1m; T1n = T1h - T1m; } } } } } { E T2c, T2e, T29, T2d; { E T1b, T11, T26, T28, T27; T1b = Tz - T10; T11 = Tz + T10; T26 = T24 + T25; T28 = T24 - T25; { E T1B, T1z, T1a, T1A, T1c; T1B = FNMS(KP618033988, T1n, T1y); T1z = FMA(KP618033988, T1y, T1n); ri[WS(ios, 5)] = T8 + T11; T1a = FNMS(KP250000000, T11, T8); T1A = FNMS(KP559016994, T1b, T1a); T1c = FMA(KP559016994, T1b, T1a); T27 = FNMS(KP250000000, T26, T23); T2c = FMA(KP618033988, T2b, T2a); T2e = FNMS(KP618033988, T2a, T2b); ri[WS(ios, 1)] = FMA(KP951056516, T1z, T1c); ri[WS(ios, 9)] = FNMS(KP951056516, T1z, T1c); ri[WS(ios, 3)] = FMA(KP951056516, T1B, T1A); ri[WS(ios, 7)] = FNMS(KP951056516, T1B, T1A); } ii[WS(ios, 5)] = T26 + T23; T29 = FMA(KP559016994, T28, T27); T2d = FNMS(KP559016994, T28, T27); } { E T1E, T1M, T1L, T1N, T19, T1D, T1C, T1Q, T1W, T1V; T19 = T15 + T18; T1D = T15 - T18; ii[WS(ios, 7)] = FMA(KP951056516, T2e, T2d); ii[WS(ios, 3)] = FNMS(KP951056516, T2e, T2d); ii[WS(ios, 9)] = FMA(KP951056516, T2c, T29); ii[WS(ios, 1)] = FNMS(KP951056516, T2c, T29); T1C = FNMS(KP250000000, T19, T12); ri[0] = T12 + T19; T1E = 
FNMS(KP559016994, T1D, T1C); T1M = FMA(KP559016994, T1D, T1C); T1L = FNMS(KP618033988, T1K, T1H); T1N = FMA(KP618033988, T1H, T1K); T1Q = T1O + T1P; T1W = T1O - T1P; ri[WS(ios, 6)] = FMA(KP951056516, T1N, T1M); ri[WS(ios, 4)] = FNMS(KP951056516, T1N, T1M); ri[WS(ios, 8)] = FMA(KP951056516, T1L, T1E); ri[WS(ios, 2)] = FNMS(KP951056516, T1L, T1E); T1V = FNMS(KP250000000, T1Q, T1U); ii[0] = T1Q + T1U; T1X = FNMS(KP559016994, T1W, T1V); T21 = FMA(KP559016994, T1W, T1V); T20 = FNMS(KP618033988, T1Z, T1Y); T22 = FMA(KP618033988, T1Y, T1Z); } } } ii[WS(ios, 6)] = FNMS(KP951056516, T22, T21); ii[WS(ios, 4)] = FMA(KP951056516, T22, T21); ii[WS(ios, 8)] = FNMS(KP951056516, T20, T1X); ii[WS(ios, 2)] = FMA(KP951056516, T20, T1X); } return W; }
/*
 * n1_16: out-of-place 16-point scalar kernel without twiddle factors.
 *
 * Runs v iterations.  Each iteration reads ri[] and ii[] at offsets 0 and
 * WS(is, 1..15) (32 values), combines them with the constants KP923879532,
 * KP414213562 and KP707106781, and writes 32 results to ro[] and io[] at
 * offsets 0 and WS(os, 1..15).  After each iteration ri/ii advance by ivs
 * and ro/io advance by ovs.
 *
 * All loads are performed before the first store.  The inputs are declared
 * const and are never written through ri/ii.
 *
 * NOTE(review): the name suggests a size-16 complex DFT in the FFTW "n1"
 * (no-twiddle) codelet style -- confirm against the caller.
 */
static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) { DK(KP923879532, +0.923879532511286756128183189396788286822416626); DK(KP414213562, +0.414213562373095048801688724209698078569671875); DK(KP707106781, +0.707106781186547524400844362104849039284835938); { INT i; for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) { E T1z, T1L, T1M, T1N, T1P, T1J, T1K, T1G, T1O, T1Q; { E T1l, T1H, T1R, T7, T1x, TN, TC, T25, T1E, T1b, T1Z, Tt, T2h, T22, T1D; E T1g, T1n, TQ, Te, T26, TT, T1m, TJ, T1S, Tj, T11, Ti, T1V, TZ, Tk; E T12, T13; { E Tq, T1c, Tp, T20, T1a, Tr, T1d, T1e; { E T4, TL, T3, T1k, Ty, T5, Tz, TA; { E T1, T2, Tw, Tx; T1 = ri[0]; T2 = ri[WS(is, 8)]; Tw = ii[0]; Tx = ii[WS(is, 8)]; T4 = ri[WS(is, 4)]; TL = T1 - T2; T3 = T1 + T2; T1k = Tw - Tx; Ty = Tw + Tx; T5 = ri[WS(is, 12)]; Tz = ii[WS(is, 4)]; TA = ii[WS(is, 12)]; } { E Tn, To, T18, T19; Tn = ri[WS(is, 15)]; { E T1j, T6, TM, TB; T1j = T4 - T5; T6 = T4 + T5; TM = Tz - TA; TB = Tz + TA; T1l = T1j + T1k; T1H = T1k - T1j; T1R = T3 - T6; T7 = T3 + T6; T1x = TL + TM; TN = TL - TM; TC = Ty + TB; T25 = Ty - TB; To = ri[WS(is, 7)]; } T18 = ii[WS(is, 15)]; T19 = ii[WS(is, 7)]; Tq = ri[WS(is, 3)]; T1c = Tn - To; Tp = Tn + To; T20 = T18 + T19; T1a = T18 - T19; Tr = ri[WS(is, 11)]; T1d = ii[WS(is, 3)]; T1e = ii[WS(is, 11)]; } } { E Tb, TP, Ta, TO, TF, Tc, TG, TH; { E T8, T9, TD, TE; T8 = ri[WS(is, 2)]; { E T17, Ts, T21, T1f; T17 = Tq - Tr; Ts = Tq + Tr; T21 = T1d + T1e; T1f = T1d - T1e; T1E = T1a - T17; T1b = T17 + T1a; T1Z = Tp - Ts; Tt = Tp + Ts; T2h = T20 + T21; T22 = T20 - T21; T1D = T1c + T1f; T1g = T1c - T1f; T9 = ri[WS(is, 10)]; } TD = ii[WS(is, 2)]; TE = ii[WS(is, 10)]; Tb = ri[WS(is, 14)]; TP = T8 - T9; Ta = T8 + T9; TO = TD - TE; TF = TD + TE; Tc = ri[WS(is, 6)]; TG = ii[WS(is, 14)]; TH = ii[WS(is, 6)]; } { E TR, Td, TS, TI; T1n = TP + TO; TQ = TO - TP; TR = Tb - Tc; 
Td = Tb + Tc; TS = TG - TH; TI = TG + TH; Te = Ta + Td; T26 = Td - Ta; TT = TR + TS; T1m = TR - TS; TJ = TF + TI; T1S = TF - TI; } } { E Tg, Th, TX, TY; Tg = ri[WS(is, 1)]; Th = ri[WS(is, 9)]; TX = ii[WS(is, 1)]; TY = ii[WS(is, 9)]; Tj = ri[WS(is, 5)]; T11 = Tg - Th; Ti = Tg + Th; T1V = TX + TY; TZ = TX - TY; Tk = ri[WS(is, 13)]; T12 = ii[WS(is, 5)]; T13 = ii[WS(is, 13)]; } } { E T2f, T1B, T10, T1U, T1X, T1A, T15, Tv, TK, T2i; { E Tf, Tu, T2j, T2k, T2g; T2f = T7 - Te; Tf = T7 + Te; { E TW, Tl, T1W, T14, Tm; TW = Tj - Tk; Tl = Tj + Tk; T1W = T12 + T13; T14 = T12 - T13; T1B = TZ - TW; T10 = TW + TZ; T1U = Ti - Tl; Tm = Ti + Tl; T2g = T1V + T1W; T1X = T1V - T1W; T1A = T11 + T14; T15 = T11 - T14; Tu = Tm + Tt; Tv = Tt - Tm; } TK = TC - TJ; T2j = TC + TJ; T2k = T2g + T2h; T2i = T2g - T2h; ro[0] = Tf + Tu; ro[WS(os, 8)] = Tf - Tu; io[0] = T2j + T2k; io[WS(os, 8)] = T2j - T2k; } { E T29, T1T, T27, T2d, T2a, T2b, T28, T24, T1Y, T23; T29 = T1R - T1S; T1T = T1R + T1S; io[WS(os, 12)] = TK - Tv; io[WS(os, 4)] = Tv + TK; ro[WS(os, 4)] = T2f + T2i; ro[WS(os, 12)] = T2f - T2i; T27 = T25 - T26; T2d = T26 + T25; T2a = T1X - T1U; T1Y = T1U + T1X; T23 = T1Z - T22; T2b = T1Z + T22; T28 = T23 - T1Y; T24 = T1Y + T23; { E T1I, TV, T1v, T1y, T1t, T1s, T1r, T1p, T1q, T1i; { E T1o, T2e, T2c, TU, T16, T1h; T1I = TQ + TT; TU = TQ - TT; io[WS(os, 14)] = FNMS(KP707106781, T28, T27); io[WS(os, 6)] = FMA(KP707106781, T28, T27); ro[WS(os, 2)] = FMA(KP707106781, T24, T1T); ro[WS(os, 10)] = FNMS(KP707106781, T24, T1T); T2e = T2a + T2b; T2c = T2a - T2b; TV = FMA(KP707106781, TU, TN); T1v = FNMS(KP707106781, TU, TN); io[WS(os, 10)] = FNMS(KP707106781, T2e, T2d); io[WS(os, 2)] = FMA(KP707106781, T2e, T2d); ro[WS(os, 6)] = FMA(KP707106781, T2c, T29); ro[WS(os, 14)] = FNMS(KP707106781, T2c, T29); T1o = T1m - T1n; T1y = T1n + T1m; T1t = FNMS(KP414213562, T10, T15); T16 = FMA(KP414213562, T15, T10); T1h = FNMS(KP414213562, T1g, T1b); T1s = FMA(KP414213562, T1b, T1g); T1r = FMA(KP707106781, T1o, T1l); T1p = 
FNMS(KP707106781, T1o, T1l); T1q = T16 + T1h; T1i = T16 - T1h; } { E T1w, T1u, T1C, T1F; io[WS(os, 15)] = FMA(KP923879532, T1q, T1p); io[WS(os, 7)] = FNMS(KP923879532, T1q, T1p); ro[WS(os, 3)] = FMA(KP923879532, T1i, TV); ro[WS(os, 11)] = FNMS(KP923879532, T1i, TV); T1w = T1t + T1s; T1u = T1s - T1t; T1z = FMA(KP707106781, T1y, T1x); T1L = FNMS(KP707106781, T1y, T1x); ro[WS(os, 15)] = FMA(KP923879532, T1w, T1v); ro[WS(os, 7)] = FNMS(KP923879532, T1w, T1v); io[WS(os, 3)] = FMA(KP923879532, T1u, T1r); io[WS(os, 11)] = FNMS(KP923879532, T1u, T1r); T1M = FNMS(KP414213562, T1A, T1B); T1C = FMA(KP414213562, T1B, T1A); T1F = FNMS(KP414213562, T1E, T1D); T1N = FMA(KP414213562, T1D, T1E); T1P = FMA(KP707106781, T1I, T1H); T1J = FNMS(KP707106781, T1I, T1H); T1K = T1F - T1C; T1G = T1C + T1F; } } } } } io[WS(os, 5)] = FMA(KP923879532, T1K, T1J); io[WS(os, 13)] = FNMS(KP923879532, T1K, T1J); ro[WS(os, 1)] = FMA(KP923879532, T1G, T1z); ro[WS(os, 9)] = FNMS(KP923879532, T1G, T1z); T1O = T1M - T1N; T1Q = T1M + T1N; io[WS(os, 1)] = FMA(KP923879532, T1Q, T1P); io[WS(os, 9)] = FNMS(KP923879532, T1Q, T1P); ro[WS(os, 5)] = FMA(KP923879532, T1O, T1L); ro[WS(os, 13)] = FNMS(KP923879532, T1O, T1L); } } }