/*
 * e10_8 -- unnormalized 8-point type-II discrete cosine transform.
 *
 * For each of the v vectors this evaluates
 *
 *     O[k] = 2 * sum_{j=0..7} I[j] * cos(pi * (2*j + 1) * k / 16),   k = 0..7
 *
 * reading input j at I[WS(is, j)] and writing output k at O[WS(os, k)].
 * After every vector, I is advanced by ivs and O by ovs.
 *
 * NOTE(review): the closed form above assumes the usual macro meanings
 * FMA(a, b, c) = a*b + c and FNMS(a, b, c) = c - a*b; both macros are defined
 * outside this block -- confirm.
 */
static void e10_8(const R *I, R *O, stride is, stride os, INT v, INT ivs, INT ovs)
{
     DK(KP707106781, +0.707106781186547524400844362104849039284835938);   /* cos(pi/4)      */
     DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); /* sqrt(2)        */
     DK(KP2_000000000, +2.000000000000000000000000000000000000000000000);
     DK(KP1_961570560, +1.961570560806460898252364472268478073947867462); /* 2*cos(pi/16)   */
     DK(KP390180644, +0.390180644032256535696569736954044481855383236);   /* 2*sin(pi/16)   */
     DK(KP1_662939224, +1.662939224605090474157576755235811513477121624); /* 2*cos(3*pi/16) */
     DK(KP1_111140466, +1.111140466039204449485661627897065748749874382); /* 2*sin(3*pi/16) */
     DK(KP1_847759065, +1.847759065022573512256366378793576573644833252); /* 2*cos(pi/8)    */
     DK(KP765366864, +0.765366864730179543456919968060797733522689125);   /* 2*sin(pi/8)    */
     INT i;

     for (i = v; i > 0; --i, I += ivs, O += ovs, MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) {
          /* Every input is read before the first output is written. */
          E x0 = I[0];
          E x7 = I[WS(is, 7)];
          E x4 = I[WS(is, 4)];
          E x3 = I[WS(is, 3)];
          E x2 = I[WS(is, 2)];
          E x5 = I[WS(is, 5)];
          E x1 = I[WS(is, 1)];
          E x6 = I[WS(is, 6)];

          /* Fold input j against input 7-j: sums feed even outputs, differences odd ones. */
          E dif07 = x0 - x7;
          E sum07 = x0 + x7;
          E dif43 = x4 - x3;
          E sum43 = x4 + x3;
          E dif25 = x2 - x5;
          E sum25 = x2 + x5;
          E dif16 = x1 - x6;
          E sum16 = x1 + x6;

          /* The two middle differences rotated by pi/4. */
          E rotSum = KP707106781 * (dif25 + dif16);
          E rotDif = KP707106781 * (dif25 - dif16);

          /* Operand pairs for the odd outputs (3, 5) and (1, 7). */
          E odd35a = dif07 - rotSum;
          E odd35b = rotDif - dif43;
          E odd17a = dif07 + rotSum;
          E odd17b = dif43 + rotDif;

          /* Operand pairs for the even outputs (0, 4) and (2, 6). */
          E outerSum = sum07 + sum43;
          E innerSum = sum25 + sum16;
          E outerDif = sum07 - sum43;
          E innerDif = sum25 - sum16;

          O[WS(os, 3)] = FNMS(KP1_111140466, odd35b, KP1_662939224 * odd35a);
          O[WS(os, 5)] = FMA(KP1_662939224, odd35b, KP1_111140466 * odd35a);
          O[WS(os, 4)] = KP1_414213562 * (outerSum - innerSum);
          O[0] = KP2_000000000 * (outerSum + innerSum);

          O[WS(os, 1)] = FNMS(KP390180644, odd17b, KP1_961570560 * odd17a);
          O[WS(os, 7)] = FMA(KP1_961570560, odd17b, KP390180644 * odd17a);
          O[WS(os, 2)] = FNMS(KP765366864, innerDif, KP1_847759065 * outerDif);
          O[WS(os, 6)] = FMA(KP765366864, outerDif, KP1_847759065 * innerDif);
     }
}
/*
 * n1_16 -- 16-point complex DFT on split real/imaginary arrays
 * (variant written around fused multiply-add style macros).
 *
 * Parameters:
 *   ri, ii   real and imaginary input parts; element j is read at index WS(is, j), j = 0..15
 *   ro, io   real and imaginary output parts; element k is written at index WS(os, k), k = 0..15
 *   is, os   input / output strides, only ever used through WS()
 *   v        number of transforms; the loop body runs v times
 *   ivs, ovs after each transform ri/ii advance by ivs and ro/io advance by ovs
 *
 * Result (no normalization):
 *   out[k] = sum_{j=0..15} in[j] * exp(-2*pi*i*j*k/16)
 * NOTE(review): this closed form assumes FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this block -- confirm.
 *
 * Constants: KP707106781 = cos(pi/4), KP923879532 = cos(pi/8),
 * KP414213562 = tan(pi/8).  The pi/8 rotations are formed as
 * (x +/- tan(pi/8)*y) and scaled by cos(pi/8) in the final FMA/FNMS.
 *
 * Within one pass all 32 input values are loaded into locals before the
 * first store to ro/io.
 *
 * NOTE(review): this file holds a second n1_16 definition with the same
 * signature; presumably a preprocessor conditional that is not visible here
 * selects exactly one of them -- confirm.
 */
static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) { DK(KP923879532, +0.923879532511286756128183189396788286822416626); DK(KP414213562, +0.414213562373095048801688724209698078569671875); DK(KP707106781, +0.707106781186547524400844362104849039284835938); { INT i; for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) { E T1z, T1L, T1M, T1N, T1P, T1J, T1K, T1G, T1O, T1Q; { E T1l, T1H, T1R, T7, T1x, TN, TC, T25, T1E, T1b, T1Z, Tt, T2h, T22, T1D; E T1g, T1n, TQ, Te, T26, TT, T1m, TJ, T1S, Tj, T11, Ti, T1V, TZ, Tk; E T12, T13; { E Tq, T1c, Tp, T20, T1a, Tr, T1d, T1e; { E T4, TL, T3, T1k, Ty, T5, Tz, TA; { E T1, T2, Tw, Tx; T1 = ri[0]; T2 = ri[WS(is, 8)]; Tw = ii[0]; Tx = ii[WS(is, 8)]; T4 = ri[WS(is, 4)]; TL = T1 - T2; T3 = T1 + T2; T1k = Tw - Tx; Ty = Tw + Tx; T5 = ri[WS(is, 12)]; Tz = ii[WS(is, 4)]; TA = ii[WS(is, 12)]; } { E Tn, To, T18, T19; Tn = ri[WS(is, 15)]; { E T1j, T6, TM, TB; T1j = T4 - T5; T6 = T4 + T5; TM = Tz - TA; TB = Tz + TA; T1l = T1j + T1k; T1H = T1k - T1j; T1R = T3 - T6; T7 = T3 + T6; T1x = TL + TM; TN = TL - TM; TC = Ty + TB; T25 = Ty - TB; To = ri[WS(is, 7)]; } T18 = ii[WS(is, 15)]; T19 = ii[WS(is, 7)]; Tq = ri[WS(is, 3)]; T1c = Tn - To; Tp = Tn + To; T20 = T18 + T19; T1a = T18 - T19; Tr = ri[WS(is, 11)]; T1d = ii[WS(is, 3)]; T1e = ii[WS(is, 11)]; } } { E Tb, TP, Ta, TO, TF, Tc, TG, TH; { E T8, T9, TD, TE; T8 = ri[WS(is, 2)]; { E T17, Ts, T21, T1f; T17 = Tq - Tr; Ts = Tq + Tr; T21 = T1d + T1e; T1f = T1d - T1e; T1E = T1a - T17; T1b = T17 + T1a; T1Z = Tp - Ts; Tt = Tp + Ts; T2h = T20 + T21; T22 = T20 - T21; T1D = T1c + T1f; T1g = T1c - T1f; T9 = ri[WS(is, 10)]; } TD = ii[WS(is, 2)]; TE = ii[WS(is, 10)]; Tb = ri[WS(is, 14)]; TP = T8 - T9; Ta = T8 + T9; TO = TD - TE; TF = TD + TE; Tc = ri[WS(is, 6)]; TG = ii[WS(is, 14)]; TH = ii[WS(is, 6)]; } { E TR, Td, TS, TI; T1n = TP + TO; TQ = TO - TP; TR = Tb - Tc; 
Td = Tb + Tc; TS = TG - TH; TI = TG + TH; Te = Ta + Td; T26 = Td - Ta; TT = TR + TS; T1m = TR - TS; TJ = TF + TI; T1S = TF - TI; } } { E Tg, Th, TX, TY; Tg = ri[WS(is, 1)]; Th = ri[WS(is, 9)]; TX = ii[WS(is, 1)]; TY = ii[WS(is, 9)]; Tj = ri[WS(is, 5)]; T11 = Tg - Th; Ti = Tg + Th; T1V = TX + TY; TZ = TX - TY; Tk = ri[WS(is, 13)]; T12 = ii[WS(is, 5)]; T13 = ii[WS(is, 13)]; } } { E T2f, T1B, T10, T1U, T1X, T1A, T15, Tv, TK, T2i; { E Tf, Tu, T2j, T2k, T2g; T2f = T7 - Te; Tf = T7 + Te; { E TW, Tl, T1W, T14, Tm; TW = Tj - Tk; Tl = Tj + Tk; T1W = T12 + T13; T14 = T12 - T13; T1B = TZ - TW; T10 = TW + TZ; T1U = Ti - Tl; Tm = Ti + Tl; T2g = T1V + T1W; T1X = T1V - T1W; T1A = T11 + T14; T15 = T11 - T14; Tu = Tm + Tt; Tv = Tt - Tm; } TK = TC - TJ; T2j = TC + TJ; T2k = T2g + T2h; T2i = T2g - T2h; ro[0] = Tf + Tu; ro[WS(os, 8)] = Tf - Tu; io[0] = T2j + T2k; io[WS(os, 8)] = T2j - T2k; } { E T29, T1T, T27, T2d, T2a, T2b, T28, T24, T1Y, T23; T29 = T1R - T1S; T1T = T1R + T1S; io[WS(os, 12)] = TK - Tv; io[WS(os, 4)] = Tv + TK; ro[WS(os, 4)] = T2f + T2i; ro[WS(os, 12)] = T2f - T2i; T27 = T25 - T26; T2d = T26 + T25; T2a = T1X - T1U; T1Y = T1U + T1X; T23 = T1Z - T22; T2b = T1Z + T22; T28 = T23 - T1Y; T24 = T1Y + T23; { E T1I, TV, T1v, T1y, T1t, T1s, T1r, T1p, T1q, T1i; { E T1o, T2e, T2c, TU, T16, T1h; T1I = TQ + TT; TU = TQ - TT; io[WS(os, 14)] = FNMS(KP707106781, T28, T27); io[WS(os, 6)] = FMA(KP707106781, T28, T27); ro[WS(os, 2)] = FMA(KP707106781, T24, T1T); ro[WS(os, 10)] = FNMS(KP707106781, T24, T1T); T2e = T2a + T2b; T2c = T2a - T2b; TV = FMA(KP707106781, TU, TN); T1v = FNMS(KP707106781, TU, TN); io[WS(os, 10)] = FNMS(KP707106781, T2e, T2d); io[WS(os, 2)] = FMA(KP707106781, T2e, T2d); ro[WS(os, 6)] = FMA(KP707106781, T2c, T29); ro[WS(os, 14)] = FNMS(KP707106781, T2c, T29); T1o = T1m - T1n; T1y = T1n + T1m; T1t = FNMS(KP414213562, T10, T15); T16 = FMA(KP414213562, T15, T10); T1h = FNMS(KP414213562, T1g, T1b); T1s = FMA(KP414213562, T1b, T1g); T1r = FMA(KP707106781, T1o, T1l); T1p = 
FNMS(KP707106781, T1o, T1l); T1q = T16 + T1h; T1i = T16 - T1h; } { E T1w, T1u, T1C, T1F; io[WS(os, 15)] = FMA(KP923879532, T1q, T1p); io[WS(os, 7)] = FNMS(KP923879532, T1q, T1p); ro[WS(os, 3)] = FMA(KP923879532, T1i, TV); ro[WS(os, 11)] = FNMS(KP923879532, T1i, TV); T1w = T1t + T1s; T1u = T1s - T1t; T1z = FMA(KP707106781, T1y, T1x); T1L = FNMS(KP707106781, T1y, T1x); ro[WS(os, 15)] = FMA(KP923879532, T1w, T1v); ro[WS(os, 7)] = FNMS(KP923879532, T1w, T1v); io[WS(os, 3)] = FMA(KP923879532, T1u, T1r); io[WS(os, 11)] = FNMS(KP923879532, T1u, T1r); T1M = FNMS(KP414213562, T1A, T1B); T1C = FMA(KP414213562, T1B, T1A); T1F = FNMS(KP414213562, T1E, T1D); T1N = FMA(KP414213562, T1D, T1E); T1P = FMA(KP707106781, T1I, T1H); T1J = FNMS(KP707106781, T1I, T1H); T1K = T1F - T1C; T1G = T1C + T1F; } } } } } io[WS(os, 5)] = FMA(KP923879532, T1K, T1J); io[WS(os, 13)] = FNMS(KP923879532, T1K, T1J); ro[WS(os, 1)] = FMA(KP923879532, T1G, T1z); ro[WS(os, 9)] = FNMS(KP923879532, T1G, T1z); T1O = T1M - T1N; T1Q = T1M + T1N; io[WS(os, 1)] = FMA(KP923879532, T1Q, T1P); io[WS(os, 9)] = FNMS(KP923879532, T1Q, T1P); ro[WS(os, 5)] = FMA(KP923879532, T1O, T1L); ro[WS(os, 13)] = FNMS(KP923879532, T1O, T1L); } } }
static const R *t1_7(R *ri, R *ii, const R *W, stride ios, int m, int dist) { DK(KP222520933, +0.222520933956314404288902564496794759466355569); DK(KP900968867, +0.900968867902419126236102319507445051165919162); DK(KP623489801, +0.623489801858733530525004884004239810632274731); DK(KP433883739, +0.433883739117558120475768332848358754609990728); DK(KP781831482, +0.781831482468029808708444526674057750232334519); DK(KP974927912, +0.974927912181823607018131682993931217232785801); int i; for (i = m; i > 0; i = i - 1, ri = ri + dist, ii = ii + dist, W = W + 12) { E T1, TR, Tc, TS, TC, TO, Tn, TT, TI, TP, Ty, TU, TF, TQ; T1 = ri[0]; TR = ii[0]; { E T6, TA, Tb, TB; { E T3, T5, T2, T4; T3 = ri[WS(ios, 1)]; T5 = ii[WS(ios, 1)]; T2 = W[0]; T4 = W[1]; T6 = FMA(T2, T3, T4 * T5); TA = FNMS(T4, T3, T2 * T5); } { E T8, Ta, T7, T9; T8 = ri[WS(ios, 6)]; Ta = ii[WS(ios, 6)]; T7 = W[10]; T9 = W[11]; Tb = FMA(T7, T8, T9 * Ta); TB = FNMS(T9, T8, T7 * Ta); } Tc = T6 + Tb; TS = Tb - T6; TC = TA - TB; TO = TA + TB; } { E Th, TG, Tm, TH; { E Te, Tg, Td, Tf; Te = ri[WS(ios, 2)]; Tg = ii[WS(ios, 2)]; Td = W[2]; Tf = W[3]; Th = FMA(Td, Te, Tf * Tg); TG = FNMS(Tf, Te, Td * Tg); } { E Tj, Tl, Ti, Tk; Tj = ri[WS(ios, 5)]; Tl = ii[WS(ios, 5)]; Ti = W[8]; Tk = W[9]; Tm = FMA(Ti, Tj, Tk * Tl); TH = FNMS(Tk, Tj, Ti * Tl); } Tn = Th + Tm; TT = Tm - Th; TI = TG - TH; TP = TG + TH; } { E Ts, TD, Tx, TE; { E Tp, Tr, To, Tq; Tp = ri[WS(ios, 3)]; Tr = ii[WS(ios, 3)]; To = W[4]; Tq = W[5]; Ts = FMA(To, Tp, Tq * Tr); TD = FNMS(Tq, Tp, To * Tr); } { E Tu, Tw, Tt, Tv; Tu = ri[WS(ios, 4)]; Tw = ii[WS(ios, 4)]; Tt = W[6]; Tv = W[7]; Tx = FMA(Tt, Tu, Tv * Tw); TE = FNMS(Tv, Tu, Tt * Tw); } Ty = Ts + Tx; TU = Tx - Ts; TF = TD - TE; TQ = TD + TE; } ri[0] = T1 + Tc + Tn + Ty; ii[0] = TO + TP + TQ + TR; { E TJ, Tz, TX, TY; TJ = FNMS(KP781831482, TF, KP974927912 * TC) - (KP433883739 * TI); Tz = FMA(KP623489801, Ty, T1) + FNMA(KP900968867, Tn, KP222520933 * Tc); ri[WS(ios, 5)] = Tz - TJ; ri[WS(ios, 2)] = Tz + TJ; TX = 
FNMS(KP781831482, TU, KP974927912 * TS) - (KP433883739 * TT); TY = FMA(KP623489801, TQ, TR) + FNMA(KP900968867, TP, KP222520933 * TO); ii[WS(ios, 2)] = TX + TY; ii[WS(ios, 5)] = TY - TX; } { E TL, TK, TV, TW; TL = FMA(KP781831482, TC, KP974927912 * TI) + (KP433883739 * TF); TK = FMA(KP623489801, Tc, T1) + FNMA(KP900968867, Ty, KP222520933 * Tn); ri[WS(ios, 6)] = TK - TL; ri[WS(ios, 1)] = TK + TL; TV = FMA(KP781831482, TS, KP974927912 * TT) + (KP433883739 * TU); TW = FMA(KP623489801, TO, TR) + FNMA(KP900968867, TQ, KP222520933 * TP); ii[WS(ios, 1)] = TV + TW; ii[WS(ios, 6)] = TW - TV; } { E TN, TM, TZ, T10; TN = FMA(KP433883739, TC, KP974927912 * TF) - (KP781831482 * TI); TM = FMA(KP623489801, Tn, T1) + FNMA(KP222520933, Ty, KP900968867 * Tc); ri[WS(ios, 4)] = TM - TN; ri[WS(ios, 3)] = TM + TN; TZ = FMA(KP433883739, TS, KP974927912 * TU) - (KP781831482 * TT); T10 = FMA(KP623489801, TP, TR) + FNMA(KP222520933, TQ, KP900968867 * TO); ii[WS(ios, 3)] = TZ + T10; ii[WS(ios, 4)] = T10 - TZ; } } return W; }
/*
 * n1_16 -- 16-point complex DFT on split real/imaginary arrays
 * (variant using explicit sin/cos(pi/8) products).
 *
 * Parameters:
 *   ri, ii   real and imaginary input parts; element j is read at index WS(is, j), j = 0..15
 *   ro, io   real and imaginary output parts; element k is written at index WS(os, k), k = 0..15
 *   is, os   input / output strides, only ever used through WS()
 *   v        number of transforms; the loop body runs v times
 *   ivs, ovs after each transform ri/ii advance by ivs and ro/io advance by ovs
 *
 * Result (no normalization):
 *   out[k] = sum_{j=0..15} in[j] * exp(-2*pi*i*j*k/16)
 * NOTE(review): this closed form assumes FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this block -- confirm.
 *
 * Constants: KP707106781 = cos(pi/4), KP923879532 = cos(pi/8),
 * KP382683432 = sin(pi/8).
 *
 * Layout of the body: four blocks load and combine the inputs
 * {0,4,8,12}, {3,7,11,15}, {2,6,10,14} and {1,5,9,13}; the remaining blocks
 * write outputs {0,8}, {4,12}, {2,6,10,14} and finally the odd-indexed ones.
 * All 32 input values are therefore loaded before the first store.
 *
 * NOTE(review): this file holds a second n1_16 definition with the same
 * signature; presumably a preprocessor conditional that is not visible here
 * selects exactly one of them -- confirm.
 */
static void n1_16(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) { DK(KP382683432, +0.382683432365089771728459984030398866761344562); DK(KP923879532, +0.923879532511286756128183189396788286822416626); DK(KP707106781, +0.707106781186547524400844362104849039284835938); { INT i; for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(64, is), MAKE_VOLATILE_STRIDE(64, os)) { E T7, T1R, T25, TC, TN, T1x, T1H, T1l, Tt, T22, T2h, T1b, T1g, T1E, T1Z; E T1D, Te, T1S, T26, TJ, TQ, T1m, T1n, TT, Tm, T1X, T2g, T10, T15, T1B; E T1U, T1A; { E T3, TL, Ty, T1k, T6, T1j, TB, TM; { E T1, T2, Tw, Tx; T1 = ri[0]; T2 = ri[WS(is, 8)]; T3 = T1 + T2; TL = T1 - T2; Tw = ii[0]; Tx = ii[WS(is, 8)]; Ty = Tw + Tx; T1k = Tw - Tx; } { E T4, T5, Tz, TA; T4 = ri[WS(is, 4)]; T5 = ri[WS(is, 12)]; T6 = T4 + T5; T1j = T4 - T5; Tz = ii[WS(is, 4)]; TA = ii[WS(is, 12)]; TB = Tz + TA; TM = Tz - TA; } T7 = T3 + T6; T1R = T3 - T6; T25 = Ty - TB; TC = Ty + TB; TN = TL - TM; T1x = TL + TM; T1H = T1k - T1j; T1l = T1j + T1k; } { E Tp, T17, T1f, T20, Ts, T1c, T1a, T21; { E Tn, To, T1d, T1e; Tn = ri[WS(is, 15)]; To = ri[WS(is, 7)]; Tp = Tn + To; T17 = Tn - To; T1d = ii[WS(is, 15)]; T1e = ii[WS(is, 7)]; T1f = T1d - T1e; T20 = T1d + T1e; } { E Tq, Tr, T18, T19; Tq = ri[WS(is, 3)]; Tr = ri[WS(is, 11)]; Ts = Tq + Tr; T1c = Tq - Tr; T18 = ii[WS(is, 3)]; T19 = ii[WS(is, 11)]; T1a = T18 - T19; T21 = T18 + T19; } Tt = Tp + Ts; T22 = T20 - T21; T2h = T20 + T21; T1b = T17 - T1a; T1g = T1c + T1f; T1E = T1f - T1c; T1Z = Tp - Ts; T1D = T17 + T1a; } { E Ta, TP, TF, TO, Td, TR, TI, TS; { E T8, T9, TD, TE; T8 = ri[WS(is, 2)]; T9 = ri[WS(is, 10)]; Ta = T8 + T9; TP = T8 - T9; TD = ii[WS(is, 2)]; TE = ii[WS(is, 10)]; TF = TD + TE; TO = TD - TE; } { E Tb, Tc, TG, TH; Tb = ri[WS(is, 14)]; Tc = ri[WS(is, 6)]; Td = Tb + Tc; TR = Tb - Tc; TG = ii[WS(is, 14)]; TH = ii[WS(is, 6)]; TI = TG + TH; TS = TG - TH; } Te = Ta + Td; T1S = TF - TI; T26 = 
Td - Ta; TJ = TF + TI; TQ = TO - TP; T1m = TR - TS; T1n = TP + TO; TT = TR + TS; } { E Ti, T11, TZ, T1V, Tl, TW, T14, T1W; { E Tg, Th, TX, TY; Tg = ri[WS(is, 1)]; Th = ri[WS(is, 9)]; Ti = Tg + Th; T11 = Tg - Th; TX = ii[WS(is, 1)]; TY = ii[WS(is, 9)]; TZ = TX - TY; T1V = TX + TY; } { E Tj, Tk, T12, T13; Tj = ri[WS(is, 5)]; Tk = ri[WS(is, 13)]; Tl = Tj + Tk; TW = Tj - Tk; T12 = ii[WS(is, 5)]; T13 = ii[WS(is, 13)]; T14 = T12 - T13; T1W = T12 + T13; } Tm = Ti + Tl; T1X = T1V - T1W; T2g = T1V + T1W; T10 = TW + TZ; T15 = T11 - T14; T1B = T11 + T14; T1U = Ti - Tl; T1A = TZ - TW; } { E Tf, Tu, T2j, T2k; Tf = T7 + Te; Tu = Tm + Tt; ro[WS(os, 8)] = Tf - Tu; ro[0] = Tf + Tu; T2j = TC + TJ; T2k = T2g + T2h; io[WS(os, 8)] = T2j - T2k; io[0] = T2j + T2k; } { E Tv, TK, T2f, T2i; Tv = Tt - Tm; TK = TC - TJ; io[WS(os, 4)] = Tv + TK; io[WS(os, 12)] = TK - Tv; T2f = T7 - Te; T2i = T2g - T2h; ro[WS(os, 12)] = T2f - T2i; ro[WS(os, 4)] = T2f + T2i; } { E T1T, T27, T24, T28, T1Y, T23; T1T = T1R + T1S; T27 = T25 - T26; T1Y = T1U + T1X; T23 = T1Z - T22; T24 = KP707106781 * (T1Y + T23); T28 = KP707106781 * (T23 - T1Y); ro[WS(os, 10)] = T1T - T24; io[WS(os, 6)] = T27 + T28; ro[WS(os, 2)] = T1T + T24; io[WS(os, 14)] = T27 - T28; } { E T29, T2d, T2c, T2e, T2a, T2b; T29 = T1R - T1S; T2d = T26 + T25; T2a = T1X - T1U; T2b = T1Z + T22; T2c = KP707106781 * (T2a - T2b); T2e = KP707106781 * (T2a + T2b); ro[WS(os, 14)] = T29 - T2c; io[WS(os, 2)] = T2d + T2e; ro[WS(os, 6)] = T29 + T2c; io[WS(os, 10)] = T2d - T2e; } { E TV, T1r, T1p, T1v, T1i, T1q, T1u, T1w, TU, T1o; TU = KP707106781 * (TQ - TT); TV = TN + TU; T1r = TN - TU; T1o = KP707106781 * (T1m - T1n); T1p = T1l - T1o; T1v = T1l + T1o; { E T16, T1h, T1s, T1t; T16 = FMA(KP923879532, T10, KP382683432 * T15); T1h = FNMS(KP923879532, T1g, KP382683432 * T1b); T1i = T16 + T1h; T1q = T1h - T16; T1s = FNMS(KP923879532, T15, KP382683432 * T10); T1t = FMA(KP382683432, T1g, KP923879532 * T1b); T1u = T1s - T1t; T1w = T1s + T1t; } ro[WS(os, 11)] = TV - T1i; 
io[WS(os, 11)] = T1v - T1w; ro[WS(os, 3)] = TV + T1i; io[WS(os, 3)] = T1v + T1w; io[WS(os, 15)] = T1p - T1q; ro[WS(os, 15)] = T1r - T1u; io[WS(os, 7)] = T1p + T1q; ro[WS(os, 7)] = T1r + T1u; } { E T1z, T1L, T1J, T1P, T1G, T1K, T1O, T1Q, T1y, T1I; T1y = KP707106781 * (T1n + T1m); T1z = T1x + T1y; T1L = T1x - T1y; T1I = KP707106781 * (TQ + TT); T1J = T1H - T1I; T1P = T1H + T1I; { E T1C, T1F, T1M, T1N; T1C = FMA(KP382683432, T1A, KP923879532 * T1B); T1F = FNMS(KP382683432, T1E, KP923879532 * T1D); T1G = T1C + T1F; T1K = T1F - T1C; T1M = FNMS(KP382683432, T1B, KP923879532 * T1A); T1N = FMA(KP923879532, T1E, KP382683432 * T1D); T1O = T1M - T1N; T1Q = T1M + T1N; } ro[WS(os, 9)] = T1z - T1G; io[WS(os, 9)] = T1P - T1Q; ro[WS(os, 1)] = T1z + T1G; io[WS(os, 1)] = T1P + T1Q; io[WS(os, 13)] = T1J - T1K; ro[WS(os, 13)] = T1L - T1O; io[WS(os, 5)] = T1J + T1K; ro[WS(os, 5)] = T1L + T1O; } } } }
/*
 * mhc2r_32_0 -- a single 32-point half-complex-to-real transform
 * (straight-line code: no loop over vectors, no normalization).
 *
 * Parameters:
 *   ri   real parts, read at ri[WS(ris, k)] for k = 0..16
 *   ii   imaginary parts, read at ii[WS(iis, k)] for k = 1..15
 *        (indices 0 and 16 of ii are never read)
 *   O    real output, written at O[WS(os, n)] for n = 0..31
 *   ris, iis, os   strides, only ever used through WS()
 *
 * Result:
 *   O[n] = ri[0] + (-1)^n * ri[16]
 *          + 2 * sum_{k=1..15} ( ri[k]*cos(2*pi*k*n/32) - ii[k]*sin(2*pi*k*n/32) )
 * NOTE(review): this closed form assumes FMA(a, b, c) = a*b + c and
 * FNMS(a, b, c) = c - a*b; both macros are defined outside this block -- confirm.
 *
 * Constants: KP707106781 = cos(pi/4), KP1_414213562 = sqrt(2),
 * KP1_847759065 = 2*cos(pi/8),    KP765366864 = 2*sin(pi/8),
 * KP1_961570560 = 2*cos(pi/16),   KP390180644 = 2*sin(pi/16),
 * KP1_662939224 = 2*cos(3*pi/16), KP1_111140466 = 2*sin(3*pi/16).
 *
 * Layout of the body: the first five inner blocks load the inputs and form
 * sums/differences; each following block writes four outputs whose indices
 * differ by multiples of 8 (e.g. 0/8/16/24, then 4/12/20/28, ...).  Every
 * input is loaded before the first store to O.
 */
static void mhc2r_32_0(const R *ri, const R *ii, R *O, stride ris, stride iis, stride os) { DK(KP1_662939224, +1.662939224605090474157576755235811513477121624); DK(KP1_111140466, +1.111140466039204449485661627897065748749874382); DK(KP1_961570560, +1.961570560806460898252364472268478073947867462); DK(KP390180644, +0.390180644032256535696569736954044481855383236); DK(KP765366864, +0.765366864730179543456919968060797733522689125); DK(KP1_847759065, +1.847759065022573512256366378793576573644833252); DK(KP707106781, +0.707106781186547524400844362104849039284835938); DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); { E T9, T2c, TB, T1y, T6, T2b, Ty, T1v, Th, T2e, T2f, TD, TK, T1C, T1F; E T1h, Tp, T2i, T2m, TN, T13, T1K, T1Y, T1k, Tw, TU, T1l, TW, T1V, T2j; E T1R, T2l; { E T7, T8, T1w, Tz, TA, T1x; T7 = ri[WS(ris, 4)]; T8 = ri[WS(ris, 12)]; T1w = T7 - T8; Tz = ii[WS(iis, 4)]; TA = ii[WS(iis, 12)]; T1x = Tz + TA; T9 = KP2_000000000 * (T7 + T8); T2c = KP1_414213562 * (T1w + T1x); TB = KP2_000000000 * (Tz - TA); T1y = KP1_414213562 * (T1w - T1x); } { E T5, T1u, T3, T1s; { E T4, T1t, T1, T2; T4 = ri[WS(ris, 8)]; T5 = KP2_000000000 * T4; T1t = ii[WS(iis, 8)]; T1u = KP2_000000000 * T1t; T1 = ri[0]; T2 = ri[WS(ris, 16)]; T3 = T1 + T2; T1s = T1 - T2; } T6 = T3 + T5; T2b = T1s + T1u; Ty = T3 - T5; T1v = T1s - T1u; } { E Td, T1A, TG, T1E, Tg, T1D, TJ, T1B; { E Tb, Tc, TE, TF; Tb = ri[WS(ris, 2)]; Tc = ri[WS(ris, 14)]; Td = Tb + Tc; T1A = Tb - Tc; TE = ii[WS(iis, 2)]; TF = ii[WS(iis, 14)]; TG = TE - TF; T1E = TE + TF; } { E Te, Tf, TH, TI; Te = ri[WS(ris, 10)]; Tf = ri[WS(ris, 6)]; Tg = Te + Tf; T1D = Te - Tf; TH = ii[WS(iis, 10)]; TI = ii[WS(iis, 6)]; TJ = TH - TI; T1B = TH + TI; } Th = KP2_000000000 * (Td + Tg); T2e = T1A + T1B; T2f = T1E - T1D; TD = Td - Tg; TK = TG - TJ; T1C = T1A - T1B; T1F = T1D + T1E; T1h = KP2_000000000 * (TJ + TG); } { E Tl, T1I, TZ, T1X, To, T1W, T12, T1J; { E Tj, Tk, 
TX, TY; Tj = ri[WS(ris, 1)]; Tk = ri[WS(ris, 15)]; Tl = Tj + Tk; T1I = Tj - Tk; TX = ii[WS(iis, 1)]; TY = ii[WS(iis, 15)]; TZ = TX - TY; T1X = TX + TY; } { E Tm, Tn, T10, T11; Tm = ri[WS(ris, 9)]; Tn = ri[WS(ris, 7)]; To = Tm + Tn; T1W = Tm - Tn; T10 = ii[WS(iis, 9)]; T11 = ii[WS(iis, 7)]; T12 = T10 - T11; T1J = T10 + T11; } Tp = Tl + To; T2i = T1I + T1J; T2m = T1X - T1W; TN = Tl - To; T13 = TZ - T12; T1K = T1I - T1J; T1Y = T1W + T1X; T1k = T12 + TZ; } { E Ts, T1L, TT, T1M, Tv, T1O, TQ, T1P; { E Tq, Tr, TR, TS; Tq = ri[WS(ris, 5)]; Tr = ri[WS(ris, 11)]; Ts = Tq + Tr; T1L = Tq - Tr; TR = ii[WS(iis, 5)]; TS = ii[WS(iis, 11)]; TT = TR - TS; T1M = TR + TS; } { E Tt, Tu, TO, TP; Tt = ri[WS(ris, 3)]; Tu = ri[WS(ris, 13)]; Tv = Tt + Tu; T1O = Tt - Tu; TO = ii[WS(iis, 13)]; TP = ii[WS(iis, 3)]; TQ = TO - TP; T1P = TP + TO; } Tw = Ts + Tv; TU = TQ - TT; T1l = TT + TQ; TW = Ts - Tv; { E T1T, T1U, T1N, T1Q; T1T = T1L + T1M; T1U = T1O + T1P; T1V = KP707106781 * (T1T - T1U); T2j = KP707106781 * (T1T + T1U); T1N = T1L - T1M; T1Q = T1O - T1P; T1R = KP707106781 * (T1N + T1Q); T2l = KP707106781 * (T1N - T1Q); } } { E Tx, T1r, Ti, T1q, Ta; Tx = KP2_000000000 * (Tp + Tw); T1r = KP2_000000000 * (T1l + T1k); Ta = T6 + T9; Ti = Ta + Th; T1q = Ta - Th; O[WS(os, 16)] = Ti - Tx; O[WS(os, 24)] = T1q + T1r; O[0] = Ti + Tx; O[WS(os, 8)] = T1q - T1r; } { E T1i, T1o, T1n, T1p, T1g, T1j, T1m; T1g = T6 - T9; T1i = T1g - T1h; T1o = T1g + T1h; T1j = Tp - Tw; T1m = T1k - T1l; T1n = KP1_414213562 * (T1j - T1m); T1p = KP1_414213562 * (T1j + T1m); O[WS(os, 20)] = T1i - T1n; O[WS(os, 28)] = T1o + T1p; O[WS(os, 4)] = T1i + T1n; O[WS(os, 12)] = T1o - T1p; } { E TM, T16, T15, T17; { E TC, TL, TV, T14; TC = Ty - TB; TL = KP1_414213562 * (TD - TK); TM = TC + TL; T16 = TC - TL; TV = TN + TU; T14 = TW + T13; T15 = FNMS(KP765366864, T14, KP1_847759065 * TV); T17 = FMA(KP765366864, TV, KP1_847759065 * T14); } O[WS(os, 18)] = TM - T15; O[WS(os, 26)] = T16 + T17; O[WS(os, 2)] = TM + T15; O[WS(os, 10)] = T16 - T17; 
} { E T2t, T2x, T2w, T2y; { E T2r, T2s, T2u, T2v; T2r = T2b + T2c; T2s = FMA(KP1_847759065, T2e, KP765366864 * T2f); T2t = T2r - T2s; T2x = T2r + T2s; T2u = T2i + T2j; T2v = T2m - T2l; T2w = FNMS(KP1_961570560, T2v, KP390180644 * T2u); T2y = FMA(KP1_961570560, T2u, KP390180644 * T2v); } O[WS(os, 23)] = T2t - T2w; O[WS(os, 31)] = T2x + T2y; O[WS(os, 7)] = T2t + T2w; O[WS(os, 15)] = T2x - T2y; } { E T1a, T1e, T1d, T1f; { E T18, T19, T1b, T1c; T18 = Ty + TB; T19 = KP1_414213562 * (TD + TK); T1a = T18 - T19; T1e = T18 + T19; T1b = TN - TU; T1c = T13 - TW; T1d = FNMS(KP1_847759065, T1c, KP765366864 * T1b); T1f = FMA(KP1_847759065, T1b, KP765366864 * T1c); } O[WS(os, 22)] = T1a - T1d; O[WS(os, 30)] = T1e + T1f; O[WS(os, 6)] = T1a + T1d; O[WS(os, 14)] = T1e - T1f; } { E T25, T29, T28, T2a; { E T23, T24, T26, T27; T23 = T1v - T1y; T24 = FMA(KP765366864, T1C, KP1_847759065 * T1F); T25 = T23 - T24; T29 = T23 + T24; T26 = T1K - T1R; T27 = T1Y - T1V; T28 = FNMS(KP1_662939224, T27, KP1_111140466 * T26); T2a = FMA(KP1_662939224, T26, KP1_111140466 * T27); } O[WS(os, 21)] = T25 - T28; O[WS(os, 29)] = T29 + T2a; O[WS(os, 5)] = T25 + T28; O[WS(os, 13)] = T29 - T2a; } { E T2h, T2p, T2o, T2q; { E T2d, T2g, T2k, T2n; T2d = T2b - T2c; T2g = FNMS(KP1_847759065, T2f, KP765366864 * T2e); T2h = T2d + T2g; T2p = T2d - T2g; T2k = T2i - T2j; T2n = T2l + T2m; T2o = FNMS(KP1_111140466, T2n, KP1_662939224 * T2k); T2q = FMA(KP1_111140466, T2k, KP1_662939224 * T2n); } O[WS(os, 19)] = T2h - T2o; O[WS(os, 27)] = T2p + T2q; O[WS(os, 3)] = T2h + T2o; O[WS(os, 11)] = T2p - T2q; } { E T1H, T21, T20, T22; { E T1z, T1G, T1S, T1Z; T1z = T1v + T1y; T1G = FNMS(KP765366864, T1F, KP1_847759065 * T1C); T1H = T1z + T1G; T21 = T1z - T1G; T1S = T1K + T1R; T1Z = T1V + T1Y; T20 = FNMS(KP390180644, T1Z, KP1_961570560 * T1S); T22 = FMA(KP390180644, T1S, KP1_961570560 * T1Z); } O[WS(os, 17)] = T1H - T20; O[WS(os, 25)] = T21 + T22; O[WS(os, 1)] = T1H + T20; O[WS(os, 9)] = T21 - T22; } } }