/*
 * r2cf_64 -- fully unrolled, straight-line transform kernel.
 *
 * Each loop iteration reads 64 real inputs, R0[WS(rs, k)] and
 * R1[WS(rs, k)] for k = 0..31, and writes Cr[WS(csr, k)] for k = 0..32
 * and Ci[WS(csi, k)] for k = 1..31.  Ci[0] and Ci[WS(csi, 32)] are never
 * written by this function.  All inputs of an iteration are loaded into
 * temporaries of type E before the first store of that iteration.
 * NOTE(review): the name and the I/O pattern suggest a size-64
 * real-to-complex forward DFT with R0/R1 holding the even/odd samples --
 * confirm against the generator/caller.
 *
 * Parameters:
 *   R0, R1        input arrays, indexed through WS(rs, k)
 *   Cr, Ci        output arrays, indexed through WS(csr, k) / WS(csi, k)
 *   rs, csr, csi  strides handed to WS()
 *   v             number of iterations; nothing is done when v <= 0
 *   ivs           added to R0 and R1 after every iteration
 *   ovs           added to Cr and Ci after every iteration
 *
 * The DK(...) lines introduce the numeric constants used as multipliers in
 * the FMA/FNMS/FMS expressions below (DK, FMA, FNMS, FMS, WS and
 * MAKE_VOLATILE_STRIDE are macros defined outside this view).
 * The T* temporaries have no meaning beyond the data flow; the statement
 * order is part of the schedule and should not be rearranged by hand.
 */
static void r2cf_64(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) { DK(KP773010453, +0.773010453362736960810906609758469800971041293); DK(KP995184726, +0.995184726672196886244836953109479921575474869); DK(KP098491403, +0.098491403357164253077197521291327432293052451); DK(KP820678790, +0.820678790828660330972281985331011598767386482); DK(KP956940335, +0.956940335732208864935797886980269969482849206); DK(KP881921264, +0.881921264348355029712756863660388349508442621); DK(KP534511135, +0.534511135950791641089685961295362908582039528); DK(KP303346683, +0.303346683607342391675883946941299872384187453); DK(KP980785280, +0.980785280403230449126182236134239036973933731); DK(KP198912367, +0.198912367379658006911597622644676228597850501); DK(KP831469612, +0.831469612302545237078788377617905756738560812); DK(KP668178637, +0.668178637919298919997757686523080761552472251); DK(KP923879532, +0.923879532511286756128183189396788286822416626); DK(KP414213562, +0.414213562373095048801688724209698078569671875); DK(KP707106781, +0.707106781186547524400844362104849039284835938); INT i; for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) { E T5n, T5o; { E T11, T2j, T4P, T5P, T3D, T5p, T3d, Tf, T1k, T1H, T5D, T4l, T5A, T4a, T3i; E T2U, T1R, T2e, T5K, T4G, T5H, T4v, T3l, T31, T5s, T42, T5t, T3Z, T2n, T1b; E T3f, TZ, T5v, T3T, T5w, T3Q, T2m, T18, T3e, TK, T3K, T5Q, T4S, T5q, T14; E T2k, T3p, Tu, T4w, T1U, T5E, T4h, T5B, T4o, T3j, T2X, T1I, T1z, T1Z, T4A; E T24, T4x, T1X, T20; { E TN, T3V, TS, TX, T3X, TQ, T40, TT; { E T1g, T46, T1B, T1G, T47, T1j, T4j, T1C; { E T4, T3z, T3, T3B, Td, T5, T8, T9; { E T1, T2, Tb, Tc; T1 = R0[0]; T2 = R0[WS(rs, 16)]; Tb = R0[WS(rs, 28)]; Tc = R0[WS(rs, 12)]; T4 = R0[WS(rs, 8)]; T3z = T1 - T2; T3 = T1 + T2; T3B = Tb - Tc; Td = Tb + Tc; T5 = R0[WS(rs, 24)]; T8 = R0[WS(rs, 4)]; T9 = R0[WS(rs, 20)]; } { E 
T1E, T1F, T1h, T1i; { E T1e, T4N, T6, T3A, Ta, T1f; T1e = R1[0]; T4N = T4 - T5; T6 = T4 + T5; T3A = T8 - T9; Ta = T8 + T9; T1f = R1[WS(rs, 16)]; { E T7, T3C, T4O, Te; T11 = T3 - T6; T7 = T3 + T6; T3C = T3A + T3B; T4O = T3B - T3A; T2j = Td - Ta; Te = Ta + Td; T4P = FNMS(KP707106781, T4O, T4N); T5P = FMA(KP707106781, T4O, T4N); T3D = FMA(KP707106781, T3C, T3z); T5p = FNMS(KP707106781, T3C, T3z); T3d = T7 - Te; Tf = T7 + Te; T1g = T1e + T1f; T46 = T1e - T1f; } } T1E = R1[WS(rs, 4)]; T1F = R1[WS(rs, 20)]; T1h = R1[WS(rs, 8)]; T1i = R1[WS(rs, 24)]; T1B = R1[WS(rs, 28)]; T1G = T1E + T1F; T47 = T1E - T1F; T1j = T1h + T1i; T4j = T1h - T1i; T1C = R1[WS(rs, 12)]; } } { E T1N, T4r, T28, T2d, T4s, T1Q, T4E, T29; { E T2b, T2c, T1O, T1P; { E T2S, T48, T1D, T1L, T1M, T4k, T49, T2T; T1L = R1[WS(rs, 31)]; T1M = R1[WS(rs, 15)]; T2S = T1g + T1j; T1k = T1g - T1j; T48 = T1B - T1C; T1D = T1B + T1C; T1N = T1L + T1M; T4r = T1L - T1M; T4k = T47 - T48; T49 = T47 + T48; T2T = T1G + T1D; T1H = T1D - T1G; T5D = FNMS(KP707106781, T4k, T4j); T4l = FMA(KP707106781, T4k, T4j); T5A = FNMS(KP707106781, T49, T46); T4a = FMA(KP707106781, T49, T46); T3i = T2S - T2T; T2U = T2S + T2T; T2b = R1[WS(rs, 3)]; T2c = R1[WS(rs, 19)]; } T1O = R1[WS(rs, 7)]; T1P = R1[WS(rs, 23)]; T28 = R1[WS(rs, 27)]; T2d = T2b + T2c; T4s = T2b - T2c; T1Q = T1O + T1P; T4E = T1P - T1O; T29 = R1[WS(rs, 11)]; } { E TV, TW, TO, TP; { E T2Z, T4t, T2a, TL, TM, T4F, T4u, T30; TL = R0[WS(rs, 31)]; TM = R0[WS(rs, 15)]; T2Z = T1N + T1Q; T1R = T1N - T1Q; T4t = T28 - T29; T2a = T28 + T29; TN = TL + TM; T3V = TL - TM; T4F = T4t - T4s; T4u = T4s + T4t; T30 = T2d + T2a; T2e = T2a - T2d; T5K = FNMS(KP707106781, T4F, T4E); T4G = FMA(KP707106781, T4F, T4E); T5H = FNMS(KP707106781, T4u, T4r); T4v = FMA(KP707106781, T4u, T4r); T3l = T2Z - T30; T31 = T2Z + T30; TV = R0[WS(rs, 27)]; TW = R0[WS(rs, 11)]; } TO = R0[WS(rs, 7)]; TP = R0[WS(rs, 23)]; TS = R0[WS(rs, 3)]; TX = TV + TW; T3X = TV - TW; TQ = TO + TP; T40 = TO - TP; TT = R0[WS(rs, 19)]; } } } { 
E Ti, T3E, Tn, Ts, T3I, Tl, T3F, To; { E Ty, T3M, TD, TI, T3O, TB, T3R, TE; { E TG, TH, Tz, TA; { E T19, TR, T3W, TU, Tw, Tx; Tw = R0[WS(rs, 1)]; Tx = R0[WS(rs, 17)]; T19 = TN - TQ; TR = TN + TQ; T3W = TS - TT; TU = TS + TT; Ty = Tw + Tx; T3M = Tw - Tx; { E T41, T3Y, T1a, TY; T41 = T3W - T3X; T3Y = T3W + T3X; T1a = TX - TU; TY = TU + TX; T5s = FNMS(KP707106781, T41, T40); T42 = FMA(KP707106781, T41, T40); T5t = FNMS(KP707106781, T3Y, T3V); T3Z = FMA(KP707106781, T3Y, T3V); T2n = FMA(KP414213562, T19, T1a); T1b = FNMS(KP414213562, T1a, T19); T3f = TR - TY; TZ = TR + TY; TG = R0[WS(rs, 29)]; TH = R0[WS(rs, 13)]; } } Tz = R0[WS(rs, 9)]; TA = R0[WS(rs, 25)]; TD = R0[WS(rs, 5)]; TI = TG + TH; T3O = TG - TH; TB = Tz + TA; T3R = Tz - TA; TE = R0[WS(rs, 21)]; } { E Tq, Tr, Tj, Tk; { E T16, TC, T3N, TF, Tg, Th; Tg = R0[WS(rs, 2)]; Th = R0[WS(rs, 18)]; T16 = Ty - TB; TC = Ty + TB; T3N = TD - TE; TF = TD + TE; Ti = Tg + Th; T3E = Tg - Th; { E T3S, T3P, T17, TJ; T3S = T3N - T3O; T3P = T3N + T3O; T17 = TI - TF; TJ = TF + TI; T5v = FNMS(KP707106781, T3S, T3R); T3T = FMA(KP707106781, T3S, T3R); T5w = FNMS(KP707106781, T3P, T3M); T3Q = FMA(KP707106781, T3P, T3M); T2m = FNMS(KP414213562, T16, T17); T18 = FMA(KP414213562, T17, T16); T3e = TC - TJ; TK = TC + TJ; Tq = R0[WS(rs, 6)]; Tr = R0[WS(rs, 22)]; } } Tj = R0[WS(rs, 10)]; Tk = R0[WS(rs, 26)]; Tn = R0[WS(rs, 30)]; Ts = Tq + Tr; T3I = Tq - Tr; Tl = Tj + Tk; T3F = Tj - Tk; To = R0[WS(rs, 14)]; } } { E T1n, T4b, T1s, T4f, T1x, T4c, T1q, T1t; { E T1v, T1w, T1o, T1p; { E T1l, T4Q, T3G, Tm, T12, Tp, T3H, T1m; T1l = R1[WS(rs, 2)]; T4Q = FMA(KP414213562, T3E, T3F); T3G = FNMS(KP414213562, T3F, T3E); Tm = Ti + Tl; T12 = Ti - Tl; Tp = Tn + To; T3H = Tn - To; T1m = R1[WS(rs, 18)]; T1v = R1[WS(rs, 6)]; { E T4R, T3J, Tt, T13; T4R = FNMS(KP414213562, T3H, T3I); T3J = FMA(KP414213562, T3I, T3H); Tt = Tp + Ts; T13 = Tp - Ts; T1n = T1l + T1m; T4b = T1l - T1m; T3K = T3G + T3J; T5Q = T3J - T3G; T4S = T4Q + T4R; T5q = T4Q - T4R; T14 = T12 + T13; T2k 
= T13 - T12; T3p = Tt - Tm; Tu = Tm + Tt; T1w = R1[WS(rs, 22)]; } } T1o = R1[WS(rs, 10)]; T1p = R1[WS(rs, 26)]; T1s = R1[WS(rs, 30)]; T4f = T1v - T1w; T1x = T1v + T1w; T4c = T1o - T1p; T1q = T1o + T1p; T1t = R1[WS(rs, 14)]; } { E T22, T23, T1V, T1W; { E T1S, T4d, T4m, T2V, T1r, T4e, T1u, T1T; T1S = R1[WS(rs, 1)]; T4d = FNMS(KP414213562, T4c, T4b); T4m = FMA(KP414213562, T4b, T4c); T2V = T1n + T1q; T1r = T1n - T1q; T4e = T1s - T1t; T1u = T1s + T1t; T1T = R1[WS(rs, 17)]; T22 = R1[WS(rs, 5)]; { E T4g, T4n, T2W, T1y; T4g = FMA(KP414213562, T4f, T4e); T4n = FNMS(KP414213562, T4e, T4f); T2W = T1u + T1x; T1y = T1u - T1x; T4w = T1S - T1T; T1U = T1S + T1T; T5E = T4g - T4d; T4h = T4d + T4g; T5B = T4m - T4n; T4o = T4m + T4n; T3j = T2W - T2V; T2X = T2V + T2W; T1I = T1y - T1r; T1z = T1r + T1y; T23 = R1[WS(rs, 21)]; } } T1V = R1[WS(rs, 9)]; T1W = R1[WS(rs, 25)]; T1Z = R1[WS(rs, 29)]; T4A = T23 - T22; T24 = T22 + T23; T4x = T1W - T1V; T1X = T1V + T1W; T20 = R1[WS(rs, 13)]; } } } } { E T4C, T5L, T4J, T5I, T26, T2f, T3q, T3h, T3w, T3s, T3o, T3r, T3t; { E T2R, T37, T2Y, T3a, T39, T3m, T3b, T35, Tv, T10, T34, T3c, T3x, T3y; { E T4y, T4H, T32, T1Y, T4z, T21; T2R = Tf - Tu; Tv = Tf + Tu; T4y = FMA(KP414213562, T4x, T4w); T4H = FNMS(KP414213562, T4w, T4x); T32 = T1U + T1X; T1Y = T1U - T1X; T4z = T1Z - T20; T21 = T1Z + T20; T10 = TK + TZ; T37 = TZ - TK; T2Y = T2U - T2X; T3a = T2U + T2X; { E T4B, T4I, T33, T25; T4B = FNMS(KP414213562, T4A, T4z); T4I = FMA(KP414213562, T4z, T4A); T33 = T21 + T24; T25 = T21 - T24; T39 = Tv + T10; T4C = T4y + T4B; T5L = T4B - T4y; T4J = T4H + T4I; T5I = T4I - T4H; T34 = T32 + T33; T3m = T33 - T32; T26 = T1Y + T25; T2f = T25 - T1Y; } } Cr[WS(csr, 16)] = Tv - T10; T3b = T31 + T34; T35 = T31 - T34; Ci[WS(csi, 16)] = T3b - T3a; T3c = T3a + T3b; { E T3k, T3u, T3v, T3n, T36, T38, T3g; T3g = T3e + T3f; T3q = T3f - T3e; Cr[0] = T39 + T3c; Cr[WS(csr, 32)] = T39 - T3c; T36 = T2Y + T35; T38 = T35 - T2Y; T3x = FNMS(KP707106781, T3g, T3d); T3h = FMA(KP707106781, T3g, 
T3d); Ci[WS(csi, 8)] = FMA(KP707106781, T38, T37); Ci[WS(csi, 24)] = FMS(KP707106781, T38, T37); Cr[WS(csr, 8)] = FMA(KP707106781, T36, T2R); Cr[WS(csr, 24)] = FNMS(KP707106781, T36, T2R); T3k = FMA(KP414213562, T3j, T3i); T3u = FNMS(KP414213562, T3i, T3j); T3v = FMA(KP414213562, T3l, T3m); T3n = FNMS(KP414213562, T3m, T3l); T3y = T3v - T3u; T3w = T3u + T3v; T3s = T3n - T3k; T3o = T3k + T3n; } Cr[WS(csr, 12)] = FMA(KP923879532, T3y, T3x); Cr[WS(csr, 20)] = FNMS(KP923879532, T3y, T3x); } Cr[WS(csr, 4)] = FMA(KP923879532, T3o, T3h); Cr[WS(csr, 28)] = FNMS(KP923879532, T3o, T3h); T3r = FNMS(KP707106781, T3q, T3p); T3t = FMA(KP707106781, T3q, T3p); { E T27, T2g, T2v, T1d, T2r, T2p, T2s, T1K, T6l, T6m; { E T15, T2o, T2P, T2z, T2l, T1c, T1A, T1J, T2D, T2L, T2J, T2M, T2C, T2E, T2N; E T2F; { E T2H, T2I, T2x, T2y, T2A, T2B; T15 = FMA(KP707106781, T14, T11); T2x = FNMS(KP707106781, T14, T11); T2y = T2n - T2m; T2o = T2m + T2n; Ci[WS(csi, 4)] = FMA(KP923879532, T3w, T3t); Ci[WS(csi, 28)] = FMS(KP923879532, T3w, T3t); Ci[WS(csi, 20)] = FMA(KP923879532, T3s, T3r); Ci[WS(csi, 12)] = FMS(KP923879532, T3s, T3r); T2P = FNMS(KP923879532, T2y, T2x); T2z = FMA(KP923879532, T2y, T2x); T2l = FMA(KP707106781, T2k, T2j); T2H = FNMS(KP707106781, T2k, T2j); T2I = T1b - T18; T1c = T18 + T1b; T1A = FMA(KP707106781, T1z, T1k); T2A = FNMS(KP707106781, T1z, T1k); T2B = FNMS(KP707106781, T1I, T1H); T1J = FMA(KP707106781, T1I, T1H); T27 = FMA(KP707106781, T26, T1R); T2D = FNMS(KP707106781, T26, T1R); T2L = FNMS(KP923879532, T2I, T2H); T2J = FMA(KP923879532, T2I, T2H); T2M = FMA(KP668178637, T2A, T2B); T2C = FNMS(KP668178637, T2B, T2A); T2E = FNMS(KP707106781, T2f, T2e); T2g = FMA(KP707106781, T2f, T2e); } T2N = FNMS(KP668178637, T2D, T2E); T2F = FMA(KP668178637, T2E, T2D); T2v = FNMS(KP923879532, T1c, T15); T1d = FMA(KP923879532, T1c, T15); { E T2Q, T2O, T2K, T2G; T2Q = T2M - T2N; T2O = T2M + T2N; T2K = T2F - T2C; T2G = T2C + T2F; Cr[WS(csr, 10)] = FMA(KP831469612, T2Q, T2P); Cr[WS(csr, 22)] = 
FNMS(KP831469612, T2Q, T2P); Ci[WS(csi, 26)] = FNMS(KP831469612, T2O, T2L); Ci[WS(csi, 6)] = -(FMA(KP831469612, T2O, T2L)); Ci[WS(csi, 22)] = FMS(KP831469612, T2K, T2J); Ci[WS(csi, 10)] = FMA(KP831469612, T2K, T2J); Cr[WS(csr, 6)] = FMA(KP831469612, T2G, T2z); Cr[WS(csr, 26)] = FNMS(KP831469612, T2G, T2z); } T2r = FMA(KP923879532, T2o, T2l); T2p = FNMS(KP923879532, T2o, T2l); T2s = FNMS(KP198912367, T1A, T1J); T1K = FMA(KP198912367, T1J, T1A); } { E T63, T5r, T5R, T6d, T5J, T5M, T6e, T5y, T6j, T6b, T66, T67, T64, T5U, T5Z; E T5G; { E T5S, T5u, T5x, T5T, T2t, T2h; T63 = FMA(KP923879532, T5q, T5p); T5r = FNMS(KP923879532, T5q, T5p); T5R = FNMS(KP923879532, T5Q, T5P); T6d = FMA(KP923879532, T5Q, T5P); T2t = FMA(KP198912367, T27, T2g); T2h = FNMS(KP198912367, T2g, T27); T5S = FNMS(KP668178637, T5s, T5t); T5u = FMA(KP668178637, T5t, T5s); { E T2w, T2u, T2q, T2i; T2w = T2t - T2s; T2u = T2s + T2t; T2q = T2h - T1K; T2i = T1K + T2h; Cr[WS(csr, 14)] = FMA(KP980785280, T2w, T2v); Cr[WS(csr, 18)] = FNMS(KP980785280, T2w, T2v); Ci[WS(csi, 30)] = FMS(KP980785280, T2u, T2r); Ci[WS(csi, 2)] = FMA(KP980785280, T2u, T2r); Ci[WS(csi, 18)] = FMA(KP980785280, T2q, T2p); Ci[WS(csi, 14)] = FMS(KP980785280, T2q, T2p); Cr[WS(csr, 2)] = FMA(KP980785280, T2i, T1d); Cr[WS(csr, 30)] = FNMS(KP980785280, T2i, T1d); T5x = FNMS(KP668178637, T5w, T5v); T5T = FMA(KP668178637, T5v, T5w); } { E T69, T6a, T5C, T5F; T5J = FNMS(KP923879532, T5I, T5H); T69 = FMA(KP923879532, T5I, T5H); T6a = FNMS(KP923879532, T5L, T5K); T5M = FMA(KP923879532, T5L, T5K); T6e = T5x + T5u; T5y = T5u - T5x; T6j = FNMS(KP303346683, T69, T6a); T6b = FMA(KP303346683, T6a, T69); T66 = FMA(KP923879532, T5B, T5A); T5C = FNMS(KP923879532, T5B, T5A); T5F = FNMS(KP923879532, T5E, T5D); T67 = FMA(KP923879532, T5E, T5D); T64 = T5T + T5S; T5U = T5S - T5T; T5Z = FMA(KP534511135, T5C, T5F); T5G = FNMS(KP534511135, T5F, T5C); } } { E T61, T6i, T68, T62; { E T5z, T5Y, T5N, T5X, T5V, T60, T5W, T5O; T61 = FNMS(KP831469612, T5y, T5r); T5z = 
FMA(KP831469612, T5y, T5r); T6i = FNMS(KP303346683, T66, T67); T68 = FMA(KP303346683, T67, T66); T5Y = FMA(KP534511135, T5J, T5M); T5N = FNMS(KP534511135, T5M, T5J); T5X = FNMS(KP831469612, T5U, T5R); T5V = FMA(KP831469612, T5U, T5R); T60 = T5Y - T5Z; T62 = T5Z + T5Y; T5W = T5N - T5G; T5O = T5G + T5N; Ci[WS(csi, 27)] = FMA(KP881921264, T60, T5X); Ci[WS(csi, 5)] = FMS(KP881921264, T60, T5X); Cr[WS(csr, 5)] = FMA(KP881921264, T5O, T5z); Cr[WS(csr, 27)] = FNMS(KP881921264, T5O, T5z); Ci[WS(csi, 21)] = FMS(KP881921264, T5W, T5V); Ci[WS(csi, 11)] = FMA(KP881921264, T5W, T5V); } { E T6g, T6f, T6h, T6k, T65, T6c; T6l = FNMS(KP831469612, T64, T63); T65 = FMA(KP831469612, T64, T63); T6c = T68 + T6b; T6g = T6b - T68; T6f = FNMS(KP831469612, T6e, T6d); T6h = FMA(KP831469612, T6e, T6d); Cr[WS(csr, 11)] = FMA(KP881921264, T62, T61); Cr[WS(csr, 21)] = FNMS(KP881921264, T62, T61); Cr[WS(csr, 3)] = FMA(KP956940335, T6c, T65); Cr[WS(csr, 29)] = FNMS(KP956940335, T6c, T65); T6k = T6i - T6j; T6m = T6i + T6j; Ci[WS(csi, 29)] = FMS(KP956940335, T6k, T6h); Ci[WS(csi, 3)] = FMA(KP956940335, T6k, T6h); Ci[WS(csi, 19)] = FMA(KP956940335, T6g, T6f); Ci[WS(csi, 13)] = FMS(KP956940335, T6g, T6f); } } } { E T55, T3L, T4T, T5f, T4D, T4K, T5g, T44, T5l, T5d, T58, T59, T56, T4W, T51; E T4q; { E T4U, T3U, T43, T4V; T55 = FNMS(KP923879532, T3K, T3D); T3L = FMA(KP923879532, T3K, T3D); T4T = FMA(KP923879532, T4S, T4P); T5f = FNMS(KP923879532, T4S, T4P); Cr[WS(csr, 13)] = FNMS(KP956940335, T6m, T6l); Cr[WS(csr, 19)] = FMA(KP956940335, T6m, T6l); T4U = FMA(KP198912367, T3Q, T3T); T3U = FNMS(KP198912367, T3T, T3Q); T43 = FMA(KP198912367, T42, T3Z); T4V = FNMS(KP198912367, T3Z, T42); { E T5b, T5c, T4i, T4p; T4D = FMA(KP923879532, T4C, T4v); T5b = FNMS(KP923879532, T4C, T4v); T5c = FNMS(KP923879532, T4J, T4G); T4K = FMA(KP923879532, T4J, T4G); T5g = T43 - T3U; T44 = T3U + T43; T5l = FNMS(KP820678790, T5b, T5c); T5d = FMA(KP820678790, T5c, T5b); T58 = FNMS(KP923879532, T4h, T4a); T4i = FMA(KP923879532, 
T4h, T4a); T4p = FMA(KP923879532, T4o, T4l); T59 = FNMS(KP923879532, T4o, T4l); T56 = T4U - T4V; T4W = T4U + T4V; T51 = FMA(KP098491403, T4i, T4p); T4q = FNMS(KP098491403, T4p, T4i); } } { E T53, T5k, T5a, T54; { E T45, T50, T4L, T4Z, T4X, T52, T4Y, T4M; T53 = FNMS(KP980785280, T44, T3L); T45 = FMA(KP980785280, T44, T3L); T5k = FNMS(KP820678790, T58, T59); T5a = FMA(KP820678790, T59, T58); T50 = FMA(KP098491403, T4D, T4K); T4L = FNMS(KP098491403, T4K, T4D); T4Z = FMA(KP980785280, T4W, T4T); T4X = FNMS(KP980785280, T4W, T4T); T52 = T50 - T51; T54 = T51 + T50; T4Y = T4L - T4q; T4M = T4q + T4L; Ci[WS(csi, 31)] = FMA(KP995184726, T52, T4Z); Ci[WS(csi, 1)] = FMS(KP995184726, T52, T4Z); Cr[WS(csr, 1)] = FMA(KP995184726, T4M, T45); Cr[WS(csr, 31)] = FNMS(KP995184726, T4M, T45); Ci[WS(csi, 17)] = FMS(KP995184726, T4Y, T4X); Ci[WS(csi, 15)] = FMA(KP995184726, T4Y, T4X); } { E T5i, T5h, T5j, T5m, T57, T5e; T5n = FNMS(KP980785280, T56, T55); T57 = FMA(KP980785280, T56, T55); T5e = T5a + T5d; T5i = T5d - T5a; T5h = FNMS(KP980785280, T5g, T5f); T5j = FMA(KP980785280, T5g, T5f); Cr[WS(csr, 15)] = FMA(KP995184726, T54, T53); Cr[WS(csr, 17)] = FNMS(KP995184726, T54, T53); Cr[WS(csr, 7)] = FMA(KP773010453, T5e, T57); Cr[WS(csr, 25)] = FNMS(KP773010453, T5e, T57); T5m = T5k - T5l; T5o = T5k + T5l; Ci[WS(csi, 25)] = FMS(KP773010453, T5m, T5j); Ci[WS(csi, 7)] = FMA(KP773010453, T5m, T5j); Ci[WS(csi, 23)] = FMA(KP773010453, T5i, T5h); Ci[WS(csi, 9)] = FMS(KP773010453, T5i, T5h); } } } } } } Cr[WS(csr, 9)] = FNMS(KP773010453, T5o, T5n); Cr[WS(csr, 23)] = FMA(KP773010453, T5o, T5n); } }
/* Included from initramfs et al code */
/*
 * gunzip() - inflate one gzip stream.
 *
 * buf/len  compressed input already in memory; when buf is NULL a
 *          GZIP_IOBUF_SIZE scratch buffer is allocated and filled via fill()
 * fill     callback used to (re)fill the input buffer; nofill is substituted
 *          when it is NULL
 * flush    when non-NULL, output is produced into a private 32 KiB buffer
 *          that is handed to flush() after every inflate step, and the
 *          caller's out_buf value is ignored; when NULL, output goes
 *          straight to out_buf with no length limit enforced here
 * pos      optional; set to 0 when the header check fails, otherwise to the
 *          number of input bytes consumed plus 8 for the trailer
 * error    callback that receives a message for every failure
 *
 * Returns 0 once zlib reports Z_STREAM_END and -1 for failures detected
 * here; a failing zlib_inflateInit2() return code is passed through as is.
 */
STATIC int INIT gunzip(unsigned char *buf, long len,
		       long (*fill)(void*, unsigned long),
		       long (*flush)(void*, unsigned long),
		       unsigned char *out_buf,
		       long *pos,
		       void(*error)(char *x))
{
	struct z_stream_s *strm;
	size_t out_len;
	u8 *zbuf;
	int rc = -1;

	if (flush) {
		/* streaming output: 32 K bounce buffer drained by flush() */
		out_len = 0x8000;
		out_buf = malloc(out_len);
	} else {
		/* direct output: no limit */
		out_len = 0x7fffffff;
	}
	if (out_buf == NULL) {
		error("Out of memory while allocating output buffer");
		goto out;
	}

	if (buf != NULL) {
		zbuf = buf;
	} else {
		zbuf = malloc(GZIP_IOBUF_SIZE);
		len = 0;
	}
	if (zbuf == NULL) {
		error("Out of memory while allocating input buffer");
		goto out_free_output;
	}

	strm = malloc(sizeof(*strm));
	if (!strm) {
		error("Out of memory while allocating z_stream");
		goto out_free_input;
	}

	/* Without flush no sliding window is set up, so only the state is needed. */
	strm->workspace = malloc(flush ? zlib_inflate_workspacesize() :
				 sizeof(struct inflate_state));
	if (!strm->workspace) {
		error("Out of memory while allocating workspace");
		goto out_free_strm;
	}

	if (!fill)
		fill = nofill;

	if (len == 0)
		len = fill(zbuf, GZIP_IOBUF_SIZE);

	/* A gzip stream opens with the bytes 1f 8b 08 inside a 10-byte header. */
	if (!(len >= 10 &&
	      zbuf[0] == 0x1f && zbuf[1] == 0x8b && zbuf[2] == 0x08)) {
		if (pos)
			*pos = 0;
		error("Not a gzip file");
		goto out_free_workspace;
	}

	/* Step past the fixed 10-byte header. */
	strm->next_in = zbuf + 10;
	strm->avail_in = len - 10;

	/* Flag bit 0x8: a NUL-terminated file name follows the header. */
	if (zbuf[3] & 0x8) {
		for (;;) {
			/*
			 * A name that does not fit into the buffer most
			 * likely means a corrupt file; give up rather than
			 * reading more input.
			 */
			if (strm->avail_in == 0) {
				error("header error");
				goto out_free_workspace;
			}
			--strm->avail_in;
			if (!*strm->next_in++)
				break;
		}
	}

	strm->next_out = out_buf;
	strm->avail_out = out_len;

	rc = zlib_inflateInit2(strm, -MAX_WBITS);

	if (!flush) {
		WS(strm)->inflate_state.wsize = 0;
		WS(strm)->inflate_state.window = NULL;
	}

	while (rc == Z_OK) {
		if (strm->avail_in == 0) {
			/* TODO: handle case where both pos and fill are set */
			len = fill(zbuf, GZIP_IOBUF_SIZE);
			if (len < 0) {
				rc = -1;
				error("read error");
				break;
			}
			strm->next_in = zbuf;
			strm->avail_in = len;
		}

		rc = zlib_inflate(strm, 0);

		/* Hand over whatever this step produced and reuse the buffer. */
		if (flush && strm->next_out > out_buf) {
			long produced = strm->next_out - out_buf;

			if (flush(out_buf, produced) != produced) {
				rc = -1;
				error("write error");
				break;
			}
			strm->next_out = out_buf;
			strm->avail_out = out_len;
		}

		/* Only Z_STREAM_END means the whole stream was unpacked. */
		if (rc == Z_STREAM_END) {
			rc = 0;
			break;
		}
		if (rc != Z_OK) {
			error("uncompression error");
			rc = -1;
		}
	}

	zlib_inflateEnd(strm);
	if (pos) {
		/* the extra 8 bytes skip the trailer */
		*pos = strm->next_in - zbuf + 8;
	}

out_free_workspace:
	free(strm->workspace);
out_free_strm:
	free(strm);
out_free_input:
	if (!buf)
		free(zbuf);
out_free_output:
	if (flush)
		free(out_buf);
out:
	return rc; /* 0 (Z_OK) on success */
}
/*
 * r2cbIII_20 -- fully unrolled, straight-line transform kernel.
 *
 * Each loop iteration reads Cr[WS(csr, k)] and Ci[WS(csi, k)] for
 * k = 0..9 (20 values) and writes R0[WS(rs, k)] and R1[WS(rs, k)] for
 * k = 0..9 (20 values).  Every R0 output is a sum or difference scaled by
 * KP2_000000000 and every R1 output one scaled by KP1_414213562; some
 * outputs are additionally negated.
 * NOTE(review): the name suggests a size-20 complex-to-real backward
 * transform of the shifted ("III") kind -- confirm against the
 * generator/caller.
 *
 * Parameters:
 *   R0, R1        output arrays, indexed through WS(rs, k)
 *   Cr, Ci        input arrays, indexed through WS(csr, k) / WS(csi, k)
 *   rs, csr, csi  strides handed to WS()
 *   v             number of iterations; nothing is done when v <= 0
 *   ivs           added to Cr and Ci (the inputs) after every iteration
 *   ovs           added to R0 and R1 (the outputs) after every iteration
 *
 * DK, FMA, FNMS, WS and MAKE_VOLATILE_STRIDE are macros defined outside
 * this view.  The T* temporaries carry no meaning beyond the data flow.
 */
static void r2cbIII_20(float *R0, float *R1, float *Cr, float *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) { DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); DK(KP951056516, +0.951056516295153572116439333379382143405698634); DK(KP559016994, +0.559016994374947424102293417182819058860154590); DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); DK(KP250000000, +0.250000000000000000000000000000000000000000000); DK(KP618033988, +0.618033988749894848204586834365638117720309180); INT i; for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) { E TZ, TD, TW, Tw, Tt, TF, T1f, T1b; { E T1l, Tk, T9, Tj, Ta, TV, TI, Ts, TU, T1t, T11, Tx, T13, TC, T1a; E T1i, Th, Tv, Ty; { E TQ, TS, Tr, Tm, Tn; { E T1, T5, T6, T2, T3, T7, TY; T1 = Cr[WS(csr, 2)]; T5 = Cr[WS(csr, 9)]; T6 = Cr[WS(csr, 5)]; T2 = Cr[WS(csr, 6)]; T3 = Cr[WS(csr, 1)]; TQ = Ci[WS(csi, 2)]; T7 = T5 + T6; TY = T5 - T6; { E T4, TX, T8, Tp, Tq; T4 = T2 + T3; TX = T2 - T3; Tp = Ci[WS(csi, 5)]; Tq = Ci[WS(csi, 9)]; T1l = FNMS(KP618033988, TX, TY); TZ = FMA(KP618033988, TY, TX); Tk = T4 - T7; T8 = T4 + T7; TS = Tp + Tq; Tr = Tp - Tq; T9 = T1 + T8; Tj = FNMS(KP250000000, T8, T1); Tm = Ci[WS(csi, 6)]; Tn = Ci[WS(csi, 1)]; } } { E Tb, T19, Tg, Tc; Ta = Cr[WS(csr, 7)]; { E Te, Tf, To, TR, TT; Te = Cr[0]; Tf = Cr[WS(csr, 4)]; To = Tm + Tn; TR = Tm - Tn; Tb = Cr[WS(csr, 3)]; T19 = Te - Tf; Tg = Te + Tf; TT = TR - TS; TV = TR + TS; TI = FNMS(KP618033988, To, Tr); Ts = FMA(KP618033988, Tr, To); TU = FNMS(KP250000000, TT, TQ); T1t = TT + TQ; Tc = Cr[WS(csr, 8)]; } T11 = Ci[WS(csi, 7)]; { E TA, TB, Td, T18; TA = Ci[WS(csi, 4)]; TB = Ci[0]; Td = Tb + Tc; T18 = Tb - Tc; Tx = Ci[WS(csi, 3)]; T13 = TB + TA; TC = TA - TB; T1a = FMA(KP618033988, T19, T18); T1i = FNMS(KP618033988, T18, T19); Th = Td + Tg; Tv = Td - Tg; Ty = Ci[WS(csi, 8)]; } } } { E Tu, T1w, T16, TL, T15, 
T1u; { E Ti, T12, Tz, T14; Tu = FNMS(KP250000000, Th, Ta); Ti = Ta + Th; T12 = Tx - Ty; Tz = Tx + Ty; T1w = T9 - Ti; T14 = T12 - T13; T16 = T12 + T13; TL = FNMS(KP618033988, Tz, TC); TD = FMA(KP618033988, TC, Tz); T15 = FNMS(KP250000000, T14, T11); T1u = T14 + T11; R0[0] = KP2_000000000 * (T9 + Ti); } { E Tl, TJ, TN, T1q, T1m, TK, T1h, T17, TH, T1k, T1v; Tl = FMA(KP559016994, Tk, Tj); TH = FNMS(KP559016994, Tk, Tj); T1k = FNMS(KP559016994, TV, TU); TW = FMA(KP559016994, TV, TU); R0[WS(rs, 5)] = KP2_000000000 * (T1u - T1t); T1v = T1t + T1u; TJ = FNMS(KP951056516, TI, TH); TN = FMA(KP951056516, TI, TH); T1q = FMA(KP951056516, T1l, T1k); T1m = FNMS(KP951056516, T1l, T1k); R1[WS(rs, 7)] = KP1_414213562 * (T1w + T1v); R1[WS(rs, 2)] = KP1_414213562 * (T1v - T1w); Tw = FMA(KP559016994, Tv, Tu); TK = FNMS(KP559016994, Tv, Tu); T1h = FNMS(KP559016994, T16, T15); T17 = FMA(KP559016994, T16, T15); { E TM, TO, T1j, T1r; TM = FMA(KP951056516, TL, TK); TO = FNMS(KP951056516, TL, TK); T1j = FMA(KP951056516, T1i, T1h); T1r = FNMS(KP951056516, T1i, T1h); Tt = FNMS(KP951056516, Ts, Tl); TF = FMA(KP951056516, Ts, Tl); { E T1n, T1p, T1s, T1o; T1n = TN - TO; R0[WS(rs, 6)] = -(KP2_000000000 * (TN + TO)); T1p = TM - TJ; R0[WS(rs, 4)] = KP2_000000000 * (TJ + TM); T1s = T1q + T1r; R0[WS(rs, 9)] = KP2_000000000 * (T1r - T1q); T1o = T1m + T1j; R0[WS(rs, 1)] = KP2_000000000 * (T1j - T1m); R1[WS(rs, 6)] = KP1_414213562 * (T1p + T1s); R1[WS(rs, 1)] = KP1_414213562 * (T1p - T1s); R1[WS(rs, 3)] = KP1_414213562 * (T1n + T1o); R1[WS(rs, 8)] = KP1_414213562 * (T1n - T1o); T1f = FMA(KP951056516, T1a, T17); T1b = FNMS(KP951056516, T1a, T17); } } } } } { E TE, TG, T10, T1e; TE = FMA(KP951056516, TD, Tw); TG = FNMS(KP951056516, TD, Tw); T10 = FMA(KP951056516, TZ, TW); T1e = FNMS(KP951056516, TZ, TW); { E T1d, TP, T1g, T1c; T1d = TF - TG; R0[WS(rs, 2)] = -(KP2_000000000 * (TF + TG)); TP = Tt - TE; R0[WS(rs, 8)] = KP2_000000000 * (Tt + TE); T1g = T1e + T1f; R0[WS(rs, 7)] = KP2_000000000 * (T1e - T1f); T1c 
= T10 + T1b; R0[WS(rs, 3)] = KP2_000000000 * (T10 - T1b); R1[WS(rs, 9)] = -(KP1_414213562 * (T1d + T1g)); R1[WS(rs, 4)] = KP1_414213562 * (T1d - T1g); R1[WS(rs, 5)] = -(KP1_414213562 * (TP + T1c)); R1[0] = KP1_414213562 * (TP - T1c); } } } }
/*
 * hc2cb_4 -- in-place kernel over the eight values Rp/Ip/Rm/Im[0] and
 * Rp/Ip/Rm/Im[WS(rs, 1)].
 *
 * For every m in [mb, me) all eight inputs are loaded first, combined with
 * sums and differences, and the eight results are stored back into the
 * same slots.  Rp[0] and Rm[0] receive plain sums.  Each remaining output
 * pair is the complex product of an intermediate pair with one twiddle
 * pair from W:
 *   (Ip[0],         Im[0])          uses (W[0], W[1])
 *   (Rp[WS(rs, 1)], Rm[WS(rs, 1)])  uses (W[2], W[3])
 *   (Ip[WS(rs, 1)], Im[WS(rs, 1)])  uses (W[4], W[5])
 *
 * Rp and Ip advance by ms per iteration, Rm and Im step back by ms, and W
 * advances by 6 after first being offset by (mb - 1) * 6.
 */
static void hc2cb_4(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms)
{
     INT m;

     W = W + ((mb - 1) * 6);
     for (m = mb; m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(rs)) {
          E ip0, im1, rp0, rm1, ip1, im0, rp1, rm0;
          E i_sum0, i_dif0, r_sum0, r_dif0;
          E i_sum1, i_dif1, r_sum1, r_dif1;
          E u1, v1, u2, v2, u3, v3;
          E w0, w1, w2, w3, w4, w5;
          E w0_u1, w0_v1, w2_u2, w3_u2, w4_u3, w4_v3;

          /* Load every input before the first store: the kernel is in place. */
          ip0 = Ip[0];
          im1 = Im[WS(rs, 1)];
          rp0 = Rp[0];
          rm1 = Rm[WS(rs, 1)];
          ip1 = Ip[WS(rs, 1)];
          im0 = Im[0];
          rp1 = Rp[WS(rs, 1)];
          rm0 = Rm[0];

          /* First stage: pairwise sums and differences. */
          i_sum0 = ip0 + im1;
          i_dif0 = ip0 - im1;
          r_dif0 = rp0 - rm1;
          r_sum0 = rp0 + rm1;
          i_dif1 = ip1 - im0;
          i_sum1 = ip1 + im0;
          r_sum1 = rp1 + rm0;
          r_dif1 = rp1 - rm0;

          /* Second stage: the three pairs that get multiplied by twiddles. */
          u3 = r_dif0 + i_sum1;
          u1 = r_dif0 - i_sum1;
          v3 = i_sum0 - r_dif1;
          v1 = r_dif1 + i_sum0;
          u2 = r_sum0 - r_sum1;
          v2 = i_dif0 - i_dif1;

          /* Untwiddled outputs. */
          Rm[0] = i_dif0 + i_dif1;
          w0 = W[0];
          Rp[0] = r_sum0 + r_sum1;

          w0_u1 = w0 * u1;
          w0_v1 = w0 * v1;
          w1 = W[1];
          w4 = W[4];
          w5 = W[5];

          /* (u1 + i v1) * (w0 + i w1) */
          Im[0] = FMA(w1, u1, w0_v1);
          Ip[0] = FNMS(w1, v1, w0_u1);

          w4_v3 = w4 * v3;
          w4_u3 = w4 * u3;
          w3 = W[3];
          w2 = W[2];

          /* (u3 + i v3) * (w4 + i w5) */
          Im[WS(rs, 1)] = FMA(w5, u3, w4_v3);
          Ip[WS(rs, 1)] = FNMS(w5, v3, w4_u3);

          w3_u2 = w3 * u2;
          w2_u2 = w2 * u2;

          /* (u2 + i v2) * (w2 + i w3) */
          Rm[WS(rs, 1)] = FMA(w2, v2, w3_u2);
          Rp[WS(rs, 1)] = FNMS(w3, v2, w2_u2);
     }
}
/*
 * hb_6 -- in-place, fully unrolled kernel over 12 values per iteration.
 *
 * Each iteration loads rio[WS(ios, k)] for k = 0..5 and iio[-WS(ios, k)]
 * for k = 0..5 into temporaries before any store, then writes the same 12
 * locations.  rio[0] and iio[-WS(ios, 5)] receive untwiddled sums; each
 * of the other five output pairs is a complex product with one twiddle
 * pair:
 *   (rio[WS(ios, 1)], iio[-WS(ios, 4)])  uses (W[0], W[1])
 *   (rio[WS(ios, 2)], iio[-WS(ios, 3)])  uses (W[2], W[3])
 *   (rio[WS(ios, 3)], iio[-WS(ios, 2)])  uses (W[4], W[5])
 *   (rio[WS(ios, 4)], iio[-WS(ios, 1)])  uses (W[6], W[7])
 *   (rio[WS(ios, 5)], iio[0])            uses (W[8], W[9])
 *
 * The loop counter starts at m - 2 and drops by 2 while it is positive, so
 * nothing is done for m <= 2.  After each iteration rio advances by dist,
 * iio steps back by dist and W advances by 10.
 *
 * Returns the advanced W pointer, i.e. W plus 10 for every iteration run.
 * NOTE(review): the name suggests a size-6 halfcomplex backward twiddle
 * pass -- confirm against the generator/caller.
 */
static const R *hb_6(R *rio, R *iio, const R *W, stride ios, INT m, INT dist) { DK(KP866025403, +0.866025403784438646763723170752936183471402627); DK(KP500000000, +0.500000000000000000000000000000000000000000000); INT i; for (i = m - 2; i > 0; i = i - 2, rio = rio + dist, iio = iio - dist, W = W + 10, MAKE_VOLATILE_STRIDE(ios)) { E Tk, Tt, Tz, Tv, Tb, TA, Tm, Tl, Tu; { E Tp, TR, Tj, Tq, TN, TV, T3, TC, T7, T8, T4, T5; { E TJ, TK, Ti, Tf, TL, Td, Te, T1, T2, TM; { E Tn, To, Tg, Th; Tn = iio[0]; To = rio[WS(ios, 3)]; Tg = iio[-WS(ios, 2)]; Th = rio[WS(ios, 5)]; Td = iio[-WS(ios, 1)]; TJ = Tn + To; Tp = Tn - To; Te = rio[WS(ios, 4)]; TK = Tg + Th; Ti = Tg - Th; } Tf = Td - Te; TL = Te + Td; T1 = rio[0]; T2 = iio[-WS(ios, 3)]; TR = TL + TK; TM = TK - TL; Tj = Tf - Ti; Tq = Tf + Ti; TN = TJ + TM; TV = FNMS(KP500000000, TM, TJ); T3 = T1 + T2; TC = T1 - T2; T7 = iio[-WS(ios, 4)]; T8 = rio[WS(ios, 1)]; T4 = rio[WS(ios, 2)]; T5 = iio[-WS(ios, 5)]; } { E TZ, T12, Ts, TG, T10, TX, Tc, TS, T13, T11, TP, TU, T14; iio[-WS(ios, 5)] = Tp + Tq; { E T9, TE, T6, TD; T9 = T7 + T8; TE = T7 - T8; T6 = T4 + T5; TD = T4 - T5; TZ = W[8]; T12 = W[9]; { E TW, TF, Ta, TQ; TW = TD - TE; TF = TD + TE; Ts = T6 - T9; Ta = T6 + T9; TG = TC + TF; TQ = FNMS(KP500000000, TF, TC); T10 = FNMS(KP866025403, TW, TV); TX = FMA(KP866025403, TW, TV); rio[0] = T3 + Ta; Tc = FNMS(KP500000000, Ta, T3); TS = FNMS(KP866025403, TR, TQ); T13 = FMA(KP866025403, TR, TQ); T11 = TZ * T10; } } T14 = TZ * T13; iio[0] = FMA(T12, T13, T11); TP = W[0]; TU = W[1]; rio[WS(ios, 5)] = FNMS(T12, T10, T14); { E TB, TI, TH, TY, TT; TY = TP * TX; TT = TP * TS; TB = W[4]; iio[-WS(ios, 4)] = FMA(TU, TS, TY); rio[WS(ios, 1)] = FNMS(TU, TX, TT); TI = W[5]; TH = TB * TG; { E Ty, Tx, Tr, Tw, TO; Tr = FNMS(KP500000000, Tq, Tp); Tw = FNMS(KP866025403, Tj, Tc); Tk = FMA(KP866025403, Tj, Tc); TO = TI * TG; rio[WS(ios, 3)] = FNMS(TI, TN, TH); Tt = FMA(KP866025403, Ts, Tr); Tz = FNMS(KP866025403, Ts, Tr); iio[-WS(ios, 2)] = FMA(TB, TN, TO); Tv 
= W[2]; Ty = W[3]; Tb = W[6]; Tx = Tv * Tw; TA = Ty * Tw; Tm = W[7]; Tl = Tb * Tk; rio[WS(ios, 2)] = FNMS(Ty, Tz, Tx); } } } } iio[-WS(ios, 3)] = FMA(Tv, Tz, TA); Tu = Tm * Tk; rio[WS(ios, 4)] = FNMS(Tm, Tt, Tl); iio[-WS(ios, 1)] = FMA(Tb, Tt, Tu); } return W; }
/*
 * hc2cbdft_8 -- in-place, fully unrolled kernel over 16 values per
 * iteration.
 *
 * For every m in [mb, me) the kernel loads Rp, Ip, Rm and Im at
 * WS(rs, 0..3) (the first two brace blocks, before any store) and then
 * writes the same 16 locations, one slot k = 0, 3, 2, 1 per following
 * block.  For each slot two complex-style products with twiddles from W
 * are formed and then combined by sum and difference:
 *   slot 0 uses W[0..1]      slot 1 uses W[2..5]
 *   slot 2 uses W[6..9]      slot 3 uses W[10..13]
 * Rp[k]/Rm[k] receive the difference/sum of the first components and
 * Ip[k]/Im[k] the sum/difference of the second components.
 *
 * Rp and Ip advance by ms per iteration, Rm and Im step back by ms, and W
 * advances by 14 after first being offset by (mb - 1) * 14.
 * NOTE(review): the name suggests a size-8 halfcomplex-to-complex backward
 * pass fused with a DFT step -- confirm against the generator/caller.
 */
static void hc2cbdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) { DK(KP707106781, +0.707106781186547524400844362104849039284835938); { INT m; for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(rs)) { E T7, T1d, T1h, Tl, TG, T14, T19, TO, Te, TL, T18, T15, TB, T1e, Tw; E T1i; { E T3, TC, Tk, TM, T6, Th, TF, TN; { E T1, T2, Ti, Tj; T1 = Rp[0]; T2 = Rm[WS(rs, 3)]; T3 = T1 + T2; TC = T1 - T2; Ti = Ip[0]; Tj = Im[WS(rs, 3)]; Tk = Ti + Tj; TM = Ti - Tj; } { E T4, T5, TD, TE; T4 = Rp[WS(rs, 2)]; T5 = Rm[WS(rs, 1)]; T6 = T4 + T5; Th = T4 - T5; TD = Ip[WS(rs, 2)]; TE = Im[WS(rs, 1)]; TF = TD + TE; TN = TD - TE; } T7 = T3 + T6; T1d = Tk - Th; T1h = TC + TF; Tl = Th + Tk; TG = TC - TF; T14 = T3 - T6; T19 = TM - TN; TO = TM + TN; } { E Ta, Tm, Tp, TJ, Td, Tr, Tu, TK; { E T8, T9, Tn, To; T8 = Rp[WS(rs, 1)]; T9 = Rm[WS(rs, 2)]; Ta = T8 + T9; Tm = T8 - T9; Tn = Ip[WS(rs, 1)]; To = Im[WS(rs, 2)]; Tp = Tn + To; TJ = Tn - To; } { E Tb, Tc, Ts, Tt; Tb = Rm[0]; Tc = Rp[WS(rs, 3)]; Td = Tb + Tc; Tr = Tb - Tc; Ts = Im[0]; Tt = Ip[WS(rs, 3)]; Tu = Ts + Tt; TK = Tt - Ts; } Te = Ta + Td; TL = TJ + TK; T18 = Ta - Td; T15 = TK - TJ; { E Tz, TA, Tq, Tv; Tz = Tm - Tp; TA = Tr - Tu; TB = KP707106781 * (Tz + TA); T1e = KP707106781 * (Tz - TA); Tq = Tm + Tp; Tv = Tr + Tu; Tw = KP707106781 * (Tq - Tv); T1i = KP707106781 * (Tq + Tv); } } { E Tf, TP, TI, TQ; Tf = T7 + Te; TP = TL + TO; { E Tx, TH, Tg, Ty; Tx = Tl + Tw; TH = TB + TG; Tg = W[0]; Ty = W[1]; TI = FMA(Tg, Tx, Ty * TH); TQ = FNMS(Ty, Tx, Tg * TH); } Rp[0] = Tf - TI; Ip[0] = TP + TQ; Rm[0] = Tf + TI; Im[0] = TQ - TP; } { E T1r, T1x, T1w, T1y; { E T1o, T1q, T1n, T1p; T1o = T14 - T15; T1q = T19 - T18; T1n = W[10]; T1p = W[11]; T1r = FNMS(T1p, T1q, T1n * T1o); T1x = FMA(T1p, T1o, T1n * T1q); } { E T1t, T1v, T1s, T1u; T1t = T1d - T1e; T1v = T1i + T1h; T1s = W[12]; T1u = W[13]; T1w = FMA(T1s, T1t, T1u * T1v); T1y = 
FNMS(T1u, T1t, T1s * T1v); } Rp[WS(rs, 3)] = T1r - T1w; Ip[WS(rs, 3)] = T1x + T1y; Rm[WS(rs, 3)] = T1r + T1w; Im[WS(rs, 3)] = T1y - T1x; } { E TV, T11, T10, T12; { E TS, TU, TR, TT; TS = T7 - Te; TU = TO - TL; TR = W[6]; TT = W[7]; TV = FNMS(TT, TU, TR * TS); T11 = FMA(TT, TS, TR * TU); } { E TX, TZ, TW, TY; TX = Tl - Tw; TZ = TG - TB; TW = W[8]; TY = W[9]; T10 = FMA(TW, TX, TY * TZ); T12 = FNMS(TY, TX, TW * TZ); } Rp[WS(rs, 2)] = TV - T10; Ip[WS(rs, 2)] = T11 + T12; Rm[WS(rs, 2)] = TV + T10; Im[WS(rs, 2)] = T12 - T11; } { E T1b, T1l, T1k, T1m; { E T16, T1a, T13, T17; T16 = T14 + T15; T1a = T18 + T19; T13 = W[2]; T17 = W[3]; T1b = FNMS(T17, T1a, T13 * T16); T1l = FMA(T17, T16, T13 * T1a); } { E T1f, T1j, T1c, T1g; T1f = T1d + T1e; T1j = T1h - T1i; T1c = W[4]; T1g = W[5]; T1k = FMA(T1c, T1f, T1g * T1j); T1m = FNMS(T1g, T1f, T1c * T1j); } Rp[WS(rs, 1)] = T1b - T1k; Ip[WS(rs, 1)] = T1l + T1m; Rm[WS(rs, 1)] = T1b + T1k; Im[WS(rs, 1)] = T1m - T1l; } } } }
/*
 * q1_2 -- in-place kernel over a 2 x 2 tile of complex values addressed
 * through rio (first components) and iio (second components); rs selects
 * the element within a row of the tile and vs selects the row.
 *
 * For every m in [mb, me):
 *   - the sum of row 0's two elements is stored at offset 0,
 *   - the sum of row 1's two elements is stored at offset WS(rs, 1),
 *   - the difference of row 0's elements, multiplied by the twiddle
 *     (W[0], W[1]) as (d_re*W[0] + d_im*W[1], d_im*W[0] - d_re*W[1]),
 *     is stored at offset WS(vs, 1),
 *   - the same product of row 1's difference is stored at offset
 *     WS(vs, 1) + WS(rs, 1).
 * Row 1's sum therefore lands in a row-0 slot and row 0's difference in a
 * row-1 slot.
 *
 * rio and iio advance by ms per iteration; W advances by 2 after first
 * being offset by mb * 2.
 */
static void q1_2(float *rio, float *iio, const float *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     INT m;

     W = W + (mb * 2);
     for (m = mb; m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 2, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(vs)) {
          E re00, re01, im00, im01;
          E re10, re11, im10, im11;
          E dre0, dim0, dre1, dim1;
          E wr, wi;
          E wr_dim1, wr_dre1, wr_dim0, wr_dre0;

          /* Load the whole tile before the first store: the kernel is in place. */
          re00 = rio[0];
          re01 = rio[WS(rs, 1)];
          im00 = iio[0];
          im01 = iio[WS(rs, 1)];
          re10 = rio[WS(vs, 1)];
          re11 = rio[WS(vs, 1) + WS(rs, 1)];
          im10 = iio[WS(vs, 1)];
          im11 = iio[WS(vs, 1) + WS(rs, 1)];

          /* Differences feed the twiddled outputs below. */
          dre0 = re00 - re01;
          dim0 = im00 - im01;
          dre1 = re10 - re11;
          dim1 = im10 - im11;

          /* Sums are stored untwiddled. */
          rio[0] = re00 + re01;
          iio[0] = im00 + im01;
          rio[WS(rs, 1)] = re10 + re11;
          iio[WS(rs, 1)] = im10 + im11;

          /* Both rows share the same twiddle pair. */
          wr = W[0];
          wi = W[1];

          wr_dim1 = wr * dim1;
          wr_dre1 = wr * dre1;
          wr_dim0 = wr * dim0;
          wr_dre0 = wr * dre0;

          iio[WS(vs, 1) + WS(rs, 1)] = FNMS(wi, dre1, wr_dim1);
          rio[WS(vs, 1) + WS(rs, 1)] = FMA(wi, dim1, wr_dre1);
          iio[WS(vs, 1)] = FNMS(wi, dre0, wr_dim0);
          rio[WS(vs, 1)] = FMA(wi, dim0, wr_dre0);
     }
}
/*
 * t2fv_16 -- in-place, fully unrolled vector kernel over 16 elements
 * x[WS(rs, 0..15)] per step, written with the V/LD/ST/BYTWJ/VADD/...
 * macro layer defined outside this view.
 *
 * Only ri is used (as x); ii is not referenced in this function.
 * The loop runs m from mb while m < me in steps of VL.  Each step:
 *   - loads all 16 elements with LD;
 *   - passes every element except x[0] through BYTWJ with a twiddle taken
 *     from W, element k using &W[TWVL * 2 * (k - 1)];
 *   - combines the results with VADD/VSUB, multiplications by the
 *     KP707106781 / KP382683432 / KP923879532 constants and VBYI;
 *   - stores all 16 elements back with ST.
 * After each step x advances by VL * ms and W by TWVL * 30; W is first
 * offset by mb * ((TWVL / VL) * 30).  VLEAVE() is invoked once after the
 * loop.
 *
 * The third LD/ST argument is &(x[0]) for even k and &(x[WS(rs, 1)]) for
 * odd k.
 * NOTE(review): presumably an alignment hint for the vector load/store
 * macros, and the name suggests a size-16 forward twiddle pass -- confirm
 * against the SIMD header and the generator.
 */
static void t2fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) { DVK(KP923879532, +0.923879532511286756128183189396788286822416626); DVK(KP382683432, +0.382683432365089771728459984030398866761344562); DVK(KP707106781, +0.707106781186547524400844362104849039284835938); { INT m; R *x; x = ri; for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) { V TJ, T10, TD, T11, T1b, T1c, Ty, TK, T16, T17, T18, Tb, TN, T13, T14; V T15, Tm, TM, TG, TI, TH; TG = LD(&(x[0]), ms, &(x[0])); TH = LD(&(x[WS(rs, 8)]), ms, &(x[0])); TI = BYTWJ(&(W[TWVL * 14]), TH); TJ = VSUB(TG, TI); T10 = VADD(TG, TI); { V TA, TC, Tz, TB; Tz = LD(&(x[WS(rs, 4)]), ms, &(x[0])); TA = BYTWJ(&(W[TWVL * 6]), Tz); TB = LD(&(x[WS(rs, 12)]), ms, &(x[0])); TC = BYTWJ(&(W[TWVL * 22]), TB); TD = VSUB(TA, TC); T11 = VADD(TA, TC); } { V Tp, Tw, Tr, Tu, Ts, Tx; { V To, Tv, Tq, Tt; To = LD(&(x[WS(rs, 14)]), ms, &(x[0])); Tp = BYTWJ(&(W[TWVL * 26]), To); Tv = LD(&(x[WS(rs, 10)]), ms, &(x[0])); Tw = BYTWJ(&(W[TWVL * 18]), Tv); Tq = LD(&(x[WS(rs, 6)]), ms, &(x[0])); Tr = BYTWJ(&(W[TWVL * 10]), Tq); Tt = LD(&(x[WS(rs, 2)]), ms, &(x[0])); Tu = BYTWJ(&(W[TWVL * 2]), Tt); } T1b = VADD(Tp, Tr); T1c = VADD(Tu, Tw); Ts = VSUB(Tp, Tr); Tx = VSUB(Tu, Tw); Ty = VMUL(LDK(KP707106781), VSUB(Ts, Tx)); TK = VMUL(LDK(KP707106781), VADD(Tx, Ts)); } { V T2, T9, T4, T7, T5, Ta; { V T1, T8, T3, T6; T1 = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)])); T2 = BYTWJ(&(W[TWVL * 28]), T1); T8 = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)])); T9 = BYTWJ(&(W[TWVL * 20]), T8); T3 = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)])); T4 = BYTWJ(&(W[TWVL * 12]), T3); T6 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)])); T7 = BYTWJ(&(W[TWVL * 4]), T6); } T16 = VADD(T2, T4); T17 = VADD(T7, T9); T18 = VSUB(T16, T17); T5 = VSUB(T2, T4); Ta = VSUB(T7, T9); Tb = VFNMS(LDK(KP923879532), Ta, VMUL(LDK(KP382683432), T5)); TN = VFMA(LDK(KP923879532), T5, VMUL(LDK(KP382683432), Ta)); } { V 
Td, Tk, Tf, Ti, Tg, Tl; { V Tc, Tj, Te, Th; Tc = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)])); Td = BYTWJ(&(W[0]), Tc); Tj = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)])); Tk = BYTWJ(&(W[TWVL * 24]), Tj); Te = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)])); Tf = BYTWJ(&(W[TWVL * 16]), Te); Th = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)])); Ti = BYTWJ(&(W[TWVL * 8]), Th); } T13 = VADD(Td, Tf); T14 = VADD(Ti, Tk); T15 = VSUB(T13, T14); Tg = VSUB(Td, Tf); Tl = VSUB(Ti, Tk); Tm = VFMA(LDK(KP382683432), Tg, VMUL(LDK(KP923879532), Tl)); TM = VFNMS(LDK(KP382683432), Tl, VMUL(LDK(KP923879532), Tg)); } { V T1a, T1g, T1f, T1h; { V T12, T19, T1d, T1e; T12 = VSUB(T10, T11); T19 = VMUL(LDK(KP707106781), VADD(T15, T18)); T1a = VADD(T12, T19); T1g = VSUB(T12, T19); T1d = VSUB(T1b, T1c); T1e = VMUL(LDK(KP707106781), VSUB(T18, T15)); T1f = VBYI(VADD(T1d, T1e)); T1h = VBYI(VSUB(T1e, T1d)); } ST(&(x[WS(rs, 14)]), VSUB(T1a, T1f), ms, &(x[0])); ST(&(x[WS(rs, 6)]), VADD(T1g, T1h), ms, &(x[0])); ST(&(x[WS(rs, 2)]), VADD(T1a, T1f), ms, &(x[0])); ST(&(x[WS(rs, 10)]), VSUB(T1g, T1h), ms, &(x[0])); } { V T1k, T1o, T1n, T1p; { V T1i, T1j, T1l, T1m; T1i = VADD(T10, T11); T1j = VADD(T1c, T1b); T1k = VADD(T1i, T1j); T1o = VSUB(T1i, T1j); T1l = VADD(T13, T14); T1m = VADD(T16, T17); T1n = VADD(T1l, T1m); T1p = VBYI(VSUB(T1m, T1l)); } ST(&(x[WS(rs, 8)]), VSUB(T1k, T1n), ms, &(x[0])); ST(&(x[WS(rs, 4)]), VADD(T1o, T1p), ms, &(x[0])); ST(&(x[0]), VADD(T1k, T1n), ms, &(x[0])); ST(&(x[WS(rs, 12)]), VSUB(T1o, T1p), ms, &(x[0])); } { V TF, TQ, TP, TR; { V Tn, TE, TL, TO; Tn = VSUB(Tb, Tm); TE = VSUB(Ty, TD); TF = VBYI(VSUB(Tn, TE)); TQ = VBYI(VADD(TE, Tn)); TL = VADD(TJ, TK); TO = VADD(TM, TN); TP = VSUB(TL, TO); TR = VADD(TL, TO); } ST(&(x[WS(rs, 7)]), VADD(TF, TP), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 15)]), VSUB(TR, TQ), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 9)]), VSUB(TP, TF), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 1)]), VADD(TQ, TR), ms, &(x[WS(rs, 1)])); } { V TU, TY, TX, TZ; { V TS, TT, TV, TW; TS = VSUB(TJ, TK); TT = 
VADD(Tm, Tb); TU = VADD(TS, TT); TY = VSUB(TS, TT); TV = VADD(TD, Ty); TW = VSUB(TN, TM); TX = VBYI(VADD(TV, TW)); TZ = VBYI(VSUB(TW, TV)); } ST(&(x[WS(rs, 13)]), VSUB(TU, TX), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 5)]), VADD(TY, TZ), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 3)]), VADD(TU, TX), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 11)]), VSUB(TY, TZ), ms, &(x[WS(rs, 1)])); } } } VLEAVE(); }
/*
 * t2fv_16 -- machine-generated, fully unrolled 16-point SIMD butterfly that
 * works in place on the array addressed through `ri`.
 *
 * Parameters (as used by the code below):
 *   ri  base pointer of the data; the working pointer x is initialised to ri.
 *   ii  not referenced in this function.
 *   W   twiddle table.  It is first advanced by mb * ((TWVL / VL) * 30) and
 *       then by TWVL * 30 after every iteration.  Inside one iteration the
 *       input at x[WS(rs, k)], k = 1..15, is passed through BYTWJ together
 *       with &W[TWVL * 2 * (k - 1)]; x[0] is used without a twiddle.
 *   rs  stride between the 16 points of one transform (always via WS(rs, k)).
 *   mb, me, ms
 *       the loop runs m = mb, mb + VL, ... while m < me, and x advances by
 *       VL * ms per iteration.
 *
 * Every iteration performs all 16 LD operations before the first ST, so the
 * 16 results can be written back to the same x[WS(rs, k)] slots.
 * VLEAVE() is invoked once after the loop.
 *
 * The constants are 0.923879532 ~ cos(pi/8), 0.414213562 ~ tan(pi/8) and
 * 0.707106781 ~ cos(pi/4).
 *
 * NOTE(review): LD, ST, BYTWJ, VFMA, VFNMS, VFMAI, VFNMSI, LDK, WS,
 * MAKE_VOLATILE_STRIDE and VLEAVE are defined outside this chunk; the name
 * suggests a forward, decimation-in-time twiddle codelet -- confirm against
 * the SIMD headers.  The T* identifiers are generator-assigned temporaries
 * with no further meaning.
 */
static void t2fv_16(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms) { DVK(KP923879532, +0.923879532511286756128183189396788286822416626); DVK(KP414213562, +0.414213562373095048801688724209698078569671875); DVK(KP707106781, +0.707106781186547524400844362104849039284835938); { INT m; R *x; x = ri; for (m = mb, W = W + (mb * ((TWVL / VL) * 30)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 30), MAKE_VOLATILE_STRIDE(16, rs)) { V TO, Ta, TJ, TP, T14, Tq, T1i, T10, T1b, T1l, T13, T1c, TR, Tl, T15; V Tv; { V Tc, TW, T4, T19, T9, TD, TI, Tj, TZ, T1a, Te, Th, Tn, Tr, Tu; V Tp; { V T1, T2, T5, T7; T1 = LD(&(x[0]), ms, &(x[0])); T2 = LD(&(x[WS(rs, 8)]), ms, &(x[0])); T5 = LD(&(x[WS(rs, 4)]), ms, &(x[0])); T7 = LD(&(x[WS(rs, 12)]), ms, &(x[0])); { V Tz, TG, TB, TE; Tz = LD(&(x[WS(rs, 14)]), ms, &(x[0])); TG = LD(&(x[WS(rs, 10)]), ms, &(x[0])); TB = LD(&(x[WS(rs, 6)]), ms, &(x[0])); TE = LD(&(x[WS(rs, 2)]), ms, &(x[0])); { V Ti, TY, TX, Td, Tg, Tm, Tt, To; { V T3, T6, T8, TA, TH, TC, TF, Tb; Tb = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)])); T3 = BYTWJ(&(W[TWVL * 14]), T2); T6 = BYTWJ(&(W[TWVL * 6]), T5); T8 = BYTWJ(&(W[TWVL * 22]), T7); TA = BYTWJ(&(W[TWVL * 26]), Tz); TH = BYTWJ(&(W[TWVL * 18]), TG); TC = BYTWJ(&(W[TWVL * 10]), TB); TF = BYTWJ(&(W[TWVL * 2]), TE); Tc = BYTWJ(&(W[0]), Tb); TW = VSUB(T1, T3); T4 = VADD(T1, T3); T19 = VSUB(T6, T8); T9 = VADD(T6, T8); Ti = LD(&(x[WS(rs, 13)]), ms, &(x[WS(rs, 1)])); TD = VADD(TA, TC); TY = VSUB(TA, TC); TI = VADD(TF, TH); TX = VSUB(TF, TH); } Td = LD(&(x[WS(rs, 9)]), ms, &(x[WS(rs, 1)])); Tg = LD(&(x[WS(rs, 5)]), ms, &(x[WS(rs, 1)])); Tm = LD(&(x[WS(rs, 15)]), ms, &(x[WS(rs, 1)])); Tj = BYTWJ(&(W[TWVL * 24]), Ti); Tt = LD(&(x[WS(rs, 11)]), ms, &(x[WS(rs, 1)])); To = LD(&(x[WS(rs, 7)]), ms, &(x[WS(rs, 1)])); TZ = VADD(TX, TY); T1a = VSUB(TY, TX); Te = BYTWJ(&(W[TWVL * 16]), Td); Th = BYTWJ(&(W[TWVL * 8]), Tg); Tn = BYTWJ(&(W[TWVL * 28]), Tm); Tr = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)])); Tu = BYTWJ(&(W[TWVL * 
20]), Tt); Tp = BYTWJ(&(W[TWVL * 12]), To); } } } { V Tf, T11, Tk, T12, Ts; TO = VADD(T4, T9); Ta = VSUB(T4, T9); TJ = VSUB(TD, TI); TP = VADD(TI, TD); Tf = VADD(Tc, Te); T11 = VSUB(Tc, Te); Tk = VADD(Th, Tj); T12 = VSUB(Th, Tj); Ts = BYTWJ(&(W[TWVL * 4]), Tr); T14 = VSUB(Tn, Tp); Tq = VADD(Tn, Tp); T1i = VFNMS(LDK(KP707106781), TZ, TW); T10 = VFMA(LDK(KP707106781), TZ, TW); T1b = VFNMS(LDK(KP707106781), T1a, T19); T1l = VFMA(LDK(KP707106781), T1a, T19); T13 = VFNMS(LDK(KP414213562), T12, T11); T1c = VFMA(LDK(KP414213562), T11, T12); TR = VADD(Tf, Tk); Tl = VSUB(Tf, Tk); T15 = VSUB(Tu, Ts); Tv = VADD(Ts, Tu); } } { V T1d, T16, TS, Tw, TU, TQ; T1d = VFMA(LDK(KP414213562), T14, T15); T16 = VFNMS(LDK(KP414213562), T15, T14); TS = VADD(Tq, Tv); Tw = VSUB(Tq, Tv); TU = VSUB(TO, TP); TQ = VADD(TO, TP); { V T1e, T1j, T17, T1m; T1e = VSUB(T1c, T1d); T1j = VADD(T1c, T1d); T17 = VADD(T13, T16); T1m = VSUB(T16, T13); { V TV, TT, TK, Tx; TV = VSUB(TS, TR); TT = VADD(TR, TS); TK = VSUB(Tw, Tl); Tx = VADD(Tl, Tw); { V T1h, T1f, T1o, T1k; T1h = VFMA(LDK(KP923879532), T1e, T1b); T1f = VFNMS(LDK(KP923879532), T1e, T1b); T1o = VFMA(LDK(KP923879532), T1j, T1i); T1k = VFNMS(LDK(KP923879532), T1j, T1i); { V T1g, T18, T1p, T1n; T1g = VFMA(LDK(KP923879532), T17, T10); T18 = VFNMS(LDK(KP923879532), T17, T10); T1p = VFMA(LDK(KP923879532), T1m, T1l); T1n = VFNMS(LDK(KP923879532), T1m, T1l); ST(&(x[WS(rs, 12)]), VFNMSI(TV, TU), ms, &(x[0])); ST(&(x[WS(rs, 4)]), VFMAI(TV, TU), ms, &(x[0])); ST(&(x[0]), VADD(TQ, TT), ms, &(x[0])); ST(&(x[WS(rs, 8)]), VSUB(TQ, TT), ms, &(x[0])); { V TN, TL, TM, Ty; TN = VFMA(LDK(KP707106781), TK, TJ); TL = VFNMS(LDK(KP707106781), TK, TJ); TM = VFMA(LDK(KP707106781), Tx, Ta); Ty = VFNMS(LDK(KP707106781), Tx, Ta); ST(&(x[WS(rs, 1)]), VFNMSI(T1h, T1g), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 15)]), VFMAI(T1h, T1g), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 7)]), VFMAI(T1f, T18), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 9)]), VFNMSI(T1f, T18), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 
3)]), VFMAI(T1p, T1o), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 13)]), VFNMSI(T1p, T1o), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 11)]), VFMAI(T1n, T1k), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 5)]), VFNMSI(T1n, T1k), ms, &(x[WS(rs, 1)])); ST(&(x[WS(rs, 14)]), VFNMSI(TN, TM), ms, &(x[0])); ST(&(x[WS(rs, 2)]), VFMAI(TN, TM), ms, &(x[0])); ST(&(x[WS(rs, 10)]), VFMAI(TL, Ty), ms, &(x[0])); ST(&(x[WS(rs, 6)]), VFNMSI(TL, Ty), ms, &(x[0])); } } } } } } } } VLEAVE(); }
/*
 * r2cb_13 -- machine-generated, fully unrolled 13-point codelet that turns a
 * half-spectrum (Cr/Ci) into 13 real outputs split across R0 and R1.  This
 * variant writes most products as "FMA(a, b, c * d)", i.e. a plain multiply
 * feeding one fused step.
 *
 * Data flow per iteration (all visible in the body):
 *   reads   Cr[0], Cr[WS(csr, 1..6)] and Ci[WS(csi, 1..6)];
 *   writes  R0[0], R0[WS(rs, 1..6)] and R1[0], R1[WS(rs, 1..5)]
 *           (7 + 6 = 13 values).
 *
 * Loop parameters:
 *   v        number of transforms; the loop counts i = v down to 1.
 *   ivs      added to Cr and Ci after each transform (input side).
 *   ovs      added to R0 and R1 after each transform (output side).
 *   rs, csr, csi
 *            element strides, always applied through WS().
 *
 * NOTE(review): this signature uses `float *` while r2cf_64 in the same file
 * uses `R *`; that is only equivalent if R is single precision -- confirm.
 * NOTE(review): the name suggests the backward (complex-to-real) direction
 * with R0 holding even-indexed and R1 odd-indexed samples -- confirm against
 * the caller.  E, DK, FMA, FMS, FNMS, WS and MAKE_VOLATILE_STRIDE are defined
 * outside this chunk.  The T* identifiers are generator-assigned temporaries.
 */
static void r2cb_13(float *R0, float *R1, float *Cr, float *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) { DK(KP1_007074065, +1.007074065727533254493747707736933954186697125); DK(KP227708958, +0.227708958111581597949308691735310621069285120); DK(KP531932498, +0.531932498429674575175042127684371897596660533); DK(KP774781170, +0.774781170935234584261351932853525703557550433); DK(KP265966249, +0.265966249214837287587521063842185948798330267); DK(KP516520780, +0.516520780623489722840901288569017135705033622); DK(KP151805972, +0.151805972074387731966205794490207080712856746); DK(KP503537032, +0.503537032863766627246873853868466977093348562); DK(KP166666666, +0.166666666666666666666666666666666666666666667); DK(KP600925212, +0.600925212577331548853203544578415991041882762); DK(KP500000000, +0.500000000000000000000000000000000000000000000); DK(KP256247671, +0.256247671582936600958684654061725059144125175); DK(KP156891391, +0.156891391051584611046832726756003269660212636); DK(KP348277202, +0.348277202304271810011321589858529485233929352); DK(KP1_150281458, +1.150281458948006242736771094910906776922003215); DK(KP300238635, +0.300238635966332641462884626667381504676006424); DK(KP011599105, +0.011599105605768290721655456654083252189827041); DK(KP1_732050807, +1.732050807568877293527446341505872366942805254); DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); INT i; for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) { E TG, TS, TR, T15, TJ, TT, T1, Tm, Tc, Td, Tg, Tj, Tk, Tn, To; E Tp; { E Ts, Tv, Tw, TE, TC, TB, Tz, TD, TA, TF; { E Tt, Tu, Tx, Ty; Ts = Ci[WS(csi, 1)]; Tt = Ci[WS(csi, 3)]; Tu = Ci[WS(csi, 4)]; Tv = Tt - Tu; Tw = FMS(KP2_000000000, Ts, Tv); TE = KP1_732050807 * (Tt + Tu); TC = Ci[WS(csi, 5)]; Tx = Ci[WS(csi, 6)]; Ty = Ci[WS(csi, 2)]; TB = Tx + Ty; Tz = KP1_732050807 * (Tx - Ty); TD = FNMS(KP2_000000000, TC, 
TB); } TA = Tw + Tz; TF = TD - TE; TG = FMA(KP011599105, TA, KP300238635 * TF); TS = FNMS(KP011599105, TF, KP300238635 * TA); { E TP, TQ, TH, TI; TP = Ts + Tv; TQ = TB + TC; TR = FNMS(KP348277202, TQ, KP1_150281458 * TP); T15 = FMA(KP348277202, TP, KP1_150281458 * TQ); TH = Tw - Tz; TI = TE + TD; TJ = FMA(KP156891391, TH, KP256247671 * TI); TT = FNMS(KP256247671, TH, KP156891391 * TI); } } { E Tb, Ti, Tf, T6, Th, Te; T1 = Cr[0]; { E T7, T8, T9, Ta; T7 = Cr[WS(csr, 5)]; T8 = Cr[WS(csr, 2)]; T9 = Cr[WS(csr, 6)]; Ta = T8 + T9; Tb = T7 + Ta; Ti = FNMS(KP500000000, Ta, T7); Tf = T8 - T9; } { E T2, T3, T4, T5; T2 = Cr[WS(csr, 1)]; T3 = Cr[WS(csr, 3)]; T4 = Cr[WS(csr, 4)]; T5 = T3 + T4; T6 = T2 + T5; Th = FNMS(KP500000000, T5, T2); Te = T3 - T4; } Tm = KP600925212 * (T6 - Tb); Tc = T6 + Tb; Td = FNMS(KP166666666, Tc, T1); Tg = Te + Tf; Tj = Th + Ti; Tk = FMA(KP503537032, Tg, KP151805972 * Tj); Tn = Th - Ti; To = Te - Tf; Tp = FNMS(KP265966249, To, KP516520780 * Tn); } R0[0] = FMA(KP2_000000000, Tc, T1); { E TK, T1b, TV, T12, T16, T18, TO, T1a, Tr, T17, T11, T13; { E TU, T14, TM, TN; TK = KP1_732050807 * (TG + TJ); T1b = KP1_732050807 * (TS - TT); TU = TS + TT; TV = TR - TU; T12 = FMA(KP2_000000000, TU, TR); T14 = TG - TJ; T16 = FMS(KP2_000000000, T14, T15); T18 = T14 + T15; TM = FMA(KP774781170, To, KP531932498 * Tn); TN = FNMS(KP1_007074065, Tj, KP227708958 * Tg); TO = TM - TN; T1a = TM + TN; { E Tl, Tq, TZ, T10; Tl = Td - Tk; Tq = Tm - Tp; Tr = Tl - Tq; T17 = Tq + Tl; TZ = FMA(KP2_000000000, Tk, Td); T10 = FMA(KP2_000000000, Tp, Tm); T11 = TZ - T10; T13 = T10 + TZ; } } R1[WS(rs, 2)] = T11 - T12; R0[WS(rs, 6)] = T13 - T16; R1[0] = T13 + T16; R0[WS(rs, 4)] = T11 + T12; { E TL, TW, T19, T1c; TL = Tr - TK; TW = TO - TV; R1[WS(rs, 3)] = TL - TW; R0[WS(rs, 1)] = TL + TW; T19 = T17 - T18; T1c = T1a + T1b; R1[WS(rs, 1)] = T19 - T1c; R1[WS(rs, 4)] = T1c + T19; } { E T1d, T1e, TX, TY; T1d = T1a - T1b; T1e = T17 + T18; R0[WS(rs, 2)] = T1d + T1e; R0[WS(rs, 5)] = T1e - T1d; TX = Tr 
+ TK; TY = TO + TV; R0[WS(rs, 3)] = TX - TY; R1[WS(rs, 5)] = TX + TY; } } } }
/*
 * r2cb_13 -- machine-generated, fully unrolled 13-point codelet that turns a
 * half-spectrum (Cr/Ci) into 13 real outputs split across R0 and R1.  This
 * variant expresses the arithmetic almost entirely as chained
 * FMA / FNMS / FMS calls, each taking a single constant.
 *
 * Data flow per iteration (all visible in the body):
 *   reads   Cr[0], Cr[WS(csr, 1..6)] and Ci[WS(csi, 1..6)];
 *   writes  R0[0], R0[WS(rs, 1..6)] and R1[0], R1[WS(rs, 1..5)]
 *           (7 + 6 = 13 values).
 *
 * Loop parameters:
 *   v        number of transforms; the loop counts i = v down to 1.
 *   ivs      added to Cr and Ci after each transform (input side).
 *   ovs      added to R0 and R1 after each transform (output side).
 *   rs, csr, csi
 *            element strides, always applied through WS().
 *
 * NOTE(review): this signature uses `float *` while r2cf_64 in the same file
 * uses `R *`; that is only equivalent if R is single precision -- confirm.
 * NOTE(review): the name suggests the backward (complex-to-real) direction
 * with R0 holding even-indexed and R1 odd-indexed samples -- confirm against
 * the caller.  E, DK, FMA, FMS, FNMS, WS and MAKE_VOLATILE_STRIDE are defined
 * outside this chunk.  The T* identifiers are generator-assigned temporaries.
 */
static void r2cb_13(float *R0, float *R1, float *Cr, float *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) { DK(KP968287244, +0.968287244361984016049539446938120421179794516); DK(KP875502302, +0.875502302409147941146295545768755143177842006); DK(KP1_150281458, +1.150281458948006242736771094910906776922003215); DK(KP1_040057143, +1.040057143777729238234261000998465604986476278); DK(KP1_200954543, +1.200954543865330565851538506669526018704025697); DK(KP769338817, +0.769338817572980603471413688209101117038278899); DK(KP600925212, +0.600925212577331548853203544578415991041882762); DK(KP1_033041561, +1.033041561246979445681802577138034271410067244); DK(KP1_007074065, +1.007074065727533254493747707736933954186697125); DK(KP503537032, +0.503537032863766627246873853868466977093348562); DK(KP581704778, +0.581704778510515730456870384989698884939833902); DK(KP859542535, +0.859542535098774820163672132761689612766401925); DK(KP166666666, +0.166666666666666666666666666666666666666666667); DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); DK(KP301479260, +0.301479260047709873958013540496673347309208464); DK(KP226109445, +0.226109445035782405468510155372505010481906348); DK(KP686558370, +0.686558370781754340655719594850823015421401653); DK(KP514918778, +0.514918778086315755491789696138117261566051239); DK(KP957805992, +0.957805992594665126462521754605754580515587217); DK(KP522026385, +0.522026385161275033714027226654165028300441940); DK(KP853480001, +0.853480001859823990758994934970528322872359049); DK(KP038632954, +0.038632954644348171955506895830342264440241080); DK(KP612264650, +0.612264650376756543746494474777125408779395514); DK(KP302775637, +0.302775637731994646559610633735247973125648287); DK(KP866025403, +0.866025403784438646763723170752936183471402627); DK(KP500000000, +0.500000000000000000000000000000000000000000000); INT i; for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, 
MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) { E TW, T14, TS, TO, T18, T1e, TY, TX, TQ, Tq, TP, Tl, T1d, Tr; { E T1, TN, T16, TJ, TV, TG, TU, Tf, T2, T3, Tb, Ti, T4; { E Ts, TB, Tx, Ty, Tv, TE, Tt, Tu, Tz, TC; Ts = Ci[WS(csi, 5)]; Tt = Ci[WS(csi, 2)]; Tu = Ci[WS(csi, 6)]; TB = Ci[WS(csi, 1)]; Tx = Ci[WS(csi, 3)]; Ty = Ci[WS(csi, 4)]; Tv = Tt + Tu; TE = Tu - Tt; T1 = Cr[0]; Tz = Tx + Ty; TC = Tx - Ty; { E TL, Tw, T7, Ta; TL = Ts + Tv; Tw = FNMS(KP500000000, Tv, Ts); T7 = Cr[WS(csr, 5)]; { E TD, TM, TA, TH; TD = FNMS(KP500000000, TC, TB); TM = TB + TC; TA = FMA(KP866025403, Tz, Tw); TH = FNMS(KP866025403, Tz, Tw); TN = FMA(KP302775637, TM, TL); T16 = FNMS(KP302775637, TL, TM); { E TF, TI, T8, T9; TF = FMA(KP866025403, TE, TD); TI = FNMS(KP866025403, TE, TD); T8 = Cr[WS(csr, 2)]; T9 = Cr[WS(csr, 6)]; TJ = FNMS(KP612264650, TI, TH); TV = FMA(KP612264650, TH, TI); TG = FNMS(KP038632954, TF, TA); TU = FMA(KP038632954, TA, TF); Tf = T8 - T9; Ta = T8 + T9; } } T2 = Cr[WS(csr, 1)]; T3 = Cr[WS(csr, 3)]; Tb = T7 + Ta; Ti = FMS(KP500000000, Ta, T7); T4 = Cr[WS(csr, 4)]; } } { E T17, TK, T5, Te, Tk, Td; TW = FMA(KP853480001, TV, TU); T17 = FNMS(KP853480001, TV, TU); TK = FNMS(KP853480001, TJ, TG); T14 = FMA(KP853480001, TJ, TG); T5 = T3 + T4; Te = T3 - T4; { E Tn, Tg, Th, T6; TS = FNMS(KP522026385, TK, TN); TO = FMA(KP957805992, TN, TK); Tn = Te - Tf; Tg = Te + Tf; Th = FNMS(KP500000000, T5, T2); T6 = T2 + T5; T18 = FNMS(KP522026385, T17, T16); T1e = FMA(KP957805992, T16, T17); { E Tm, Tj, Tc, Tp, To; Tm = Th + Ti; Tj = Th - Ti; Tc = T6 + Tb; Tp = T6 - Tb; To = FNMS(KP514918778, Tn, Tm); TY = FMA(KP686558370, Tm, Tn); TX = FNMS(KP226109445, Tg, Tj); Tk = FMA(KP301479260, Tj, Tg); R0[0] = FMA(KP2_000000000, Tc, T1); Td = FNMS(KP166666666, Tc, T1); TQ = FNMS(KP859542535, To, Tp); Tq = FMA(KP581704778, Tp, To); } } TP = FNMS(KP503537032, Tk, Td); Tl = FMA(KP1_007074065, Tk, Td); } } T1d = FNMS(KP1_033041561, Tq, Tl); Tr = FMA(KP1_033041561, Tq, 
Tl); { E T13, TR, T19, TZ; T13 = FNMS(KP600925212, TQ, TP); TR = FMA(KP600925212, TQ, TP); T19 = FMA(KP769338817, TY, TX); TZ = FNMS(KP769338817, TY, TX); R0[WS(rs, 4)] = FMA(KP1_200954543, T1e, T1d); R1[WS(rs, 2)] = FNMS(KP1_200954543, T1e, T1d); R0[WS(rs, 6)] = FMA(KP1_200954543, TO, Tr); R1[0] = FNMS(KP1_200954543, TO, Tr); { E T1b, T15, T11, TT; T1b = FNMS(KP1_040057143, T14, T13); T15 = FMA(KP1_040057143, T14, T13); T11 = FMA(KP1_150281458, TS, TR); TT = FNMS(KP1_150281458, TS, TR); { E T1c, T1a, T12, T10; T1c = FMA(KP875502302, T19, T18); T1a = FNMS(KP875502302, T19, T18); T12 = FMA(KP968287244, TZ, TW); T10 = FNMS(KP968287244, TZ, TW); R1[WS(rs, 5)] = FMA(KP1_150281458, T1c, T1b); R0[WS(rs, 3)] = FNMS(KP1_150281458, T1c, T1b); R1[WS(rs, 3)] = FMA(KP1_150281458, T1a, T15); R0[WS(rs, 1)] = FNMS(KP1_150281458, T1a, T15); R0[WS(rs, 5)] = FMA(KP1_040057143, T12, T11); R0[WS(rs, 2)] = FNMS(KP1_040057143, T12, T11); R1[WS(rs, 4)] = FMA(KP1_040057143, T10, TT); R1[WS(rs, 1)] = FNMS(KP1_040057143, T10, TT); } } } } }
/*
 * t1bv_5 -- five-point in-place SIMD butterfly with input twiddles
 * (variant that finishes with VFMAI / VFNMSI).
 *
 * The data is addressed through `ii` (`ri` is not referenced).  For every
 * m in [mb, me) in steps of VL the routine
 *   1. loads the five points x[0], x[WS(rs, 1..4)];
 *   2. passes points 1..4 through BYTW with W[0], W[TWVL*2], W[TWVL*4]
 *      and W[TWVL*6] respectively;
 *   3. forms sums/differences of the pairs (1,4) and (2,3) and combines them
 *      with 1/4, 0.559016994 ~ sqrt(5)/4, 0.618033988 ~ (sqrt(5)-1)/2 and
 *      0.951056516 ~ sin(2*pi/5);
 *   4. stores the five results back to the same slots.
 * x then advances by VL * ms and W by TWVL * 8.
 *
 * NOTE(review): LD, ST, BYTW, VFMA, VFNMS, VFMAI, VFNMSI, LDK, WS and
 * MAKE_VOLATILE_STRIDE come from headers outside this chunk; VFMAI(b, c) /
 * VFNMSI(b, c) are presumably c + i*b / c - i*b -- confirm there.
 */
static void t1bv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP618033988, +0.618033988749894848204586834365638117720309180);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     INT m;
     R *x;

     x = ii;
     W = W + (mb * ((TWVL / VL) * 8));
     for (m = mb; m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(rs)) {
          V in0, raw1, raw2, raw3, raw4;
          V tw1, tw2, tw3, tw4;
          V sum14, dif14, sum23, dif23;
          V sumDiff, sumAll, base;
          V im1, im2, re1, re2;

          /* Fetch all five points before anything is written back. */
          in0 = LD(&(x[0]), ms, &(x[0]));
          raw1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
          raw3 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
          raw4 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
          raw2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));

          /* Apply the per-point twiddles; point 0 is used as loaded. */
          tw1 = BYTW(&(W[0]), raw1);
          tw3 = BYTW(&(W[TWVL * 4]), raw3);
          tw4 = BYTW(&(W[TWVL * 6]), raw4);
          tw2 = BYTW(&(W[TWVL * 2]), raw2);

          /* Pair up points (1,4) and (2,3). */
          sum14 = VADD(tw1, tw4);
          dif14 = VSUB(tw1, tw4);
          sum23 = VADD(tw2, tw3);
          dif23 = VSUB(tw2, tw3);

          sumDiff = VSUB(sum14, sum23);
          sumAll = VADD(sum14, sum23);

          /* Terms that end up multiplied by i in the outputs. */
          im2 = VMUL(LDK(KP951056516), VFNMS(LDK(KP618033988), dif14, dif23));
          im1 = VMUL(LDK(KP951056516), VFMA(LDK(KP618033988), dif23, dif14));

          base = VFNMS(LDK(KP250000000), sumAll, in0);
          ST(&(x[0]), VADD(in0, sumAll), ms, &(x[0]));

          re2 = VFNMS(LDK(KP559016994), sumDiff, base);
          re1 = VFMA(LDK(KP559016994), sumDiff, base);

          ST(&(x[WS(rs, 2)]), VFNMSI(im2, re2), ms, &(x[0]));
          ST(&(x[WS(rs, 3)]), VFMAI(im2, re2), ms, &(x[WS(rs, 1)]));
          ST(&(x[WS(rs, 4)]), VFNMSI(im1, re1), ms, &(x[0]));
          ST(&(x[WS(rs, 1)]), VFMAI(im1, re1), ms, &(x[WS(rs, 1)]));
     }
}
/*
 * t1bv_5 -- five-point in-place SIMD butterfly with input twiddles
 * (variant that builds the i-multiplied terms with VBYI and plain
 * VADD / VSUB).
 *
 * The data is addressed through `ii` (`ri` is not referenced).  For every
 * m in [mb, me) in steps of VL the routine
 *   1. loads x[0] and x[WS(rs, 1..4)], passing points 1..4 through BYTW with
 *      W[0], W[TWVL*2], W[TWVL*4] and W[TWVL*6] respectively;
 *   2. forms sums/differences of the pairs (1,4) and (2,3);
 *   3. combines them with 1/4, 0.559016994 ~ sqrt(5)/4,
 *      0.587785252 ~ sin(pi/5) and 0.951056516 ~ sin(2*pi/5);
 *   4. stores the five results back to the same slots.
 * x then advances by VL * ms and W by TWVL * 8.
 *
 * NOTE(review): LD, ST, BYTW, VBYI, VFMA, VFNMS, LDK, WS and
 * MAKE_VOLATILE_STRIDE come from headers outside this chunk; VBYI is
 * presumably "multiply by i" -- confirm there.
 */
static void t1bv_5(R *ri, R *ii, const R *W, stride rs, INT mb, INT me, INT ms)
{
     DVK(KP250000000, +0.250000000000000000000000000000000000000000000);
     DVK(KP559016994, +0.559016994374947424102293417182819058860154590);
     DVK(KP587785252, +0.587785252292473129168705954639072768597652438);
     DVK(KP951056516, +0.951056516295153572116439333379382143405698634);
     INT m;
     R *x;

     x = ii;
     W = W + (mb * ((TWVL / VL) * 8));
     for (m = mb; m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 8), MAKE_VOLATILE_STRIDE(rs)) {
          V in0;
          V raw1, raw2, raw3, raw4;
          V tw1, tw2, tw3, tw4;
          V dif14, dif23, sum14, sum23, sumAll;
          V rotA, rotB, scaled, base, reA, reB;

          in0 = LD(&(x[0]), ms, &(x[0]));

          /* Load and twiddle points 1, 3, 4, 2 (same order as generated). */
          raw1 = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
          tw1 = BYTW(&(W[0]), raw1);
          raw3 = LD(&(x[WS(rs, 3)]), ms, &(x[WS(rs, 1)]));
          tw3 = BYTW(&(W[TWVL * 4]), raw3);
          raw4 = LD(&(x[WS(rs, 4)]), ms, &(x[0]));
          tw4 = BYTW(&(W[TWVL * 6]), raw4);
          raw2 = LD(&(x[WS(rs, 2)]), ms, &(x[0]));
          tw2 = BYTW(&(W[TWVL * 2]), raw2);

          /* Pair up points (1,4) and (2,3). */
          dif14 = VSUB(tw1, tw4);
          dif23 = VSUB(tw2, tw3);
          sum14 = VADD(tw1, tw4);
          sum23 = VADD(tw2, tw3);
          sumAll = VADD(sum14, sum23);

          /* Output 0 is the plain sum of all five points. */
          ST(&(x[0]), VADD(in0, sumAll), ms, &(x[0]));

          /* Terms passed through VBYI. */
          rotA = VBYI(VFMA(LDK(KP951056516), dif14, VMUL(LDK(KP587785252), dif23)));
          rotB = VBYI(VFNMS(LDK(KP951056516), dif23, VMUL(LDK(KP587785252), dif14)));

          /* Terms shared by the output pairs (1,4) and (2,3). */
          scaled = VMUL(LDK(KP559016994), VSUB(sum14, sum23));
          base = VFNMS(LDK(KP250000000), sumAll, in0);
          reA = VADD(scaled, base);
          reB = VSUB(base, scaled);

          ST(&(x[WS(rs, 1)]), VADD(rotA, reA), ms, &(x[WS(rs, 1)]));
          ST(&(x[WS(rs, 3)]), VSUB(reB, rotB), ms, &(x[WS(rs, 1)]));
          ST(&(x[WS(rs, 4)]), VSUB(reA, rotA), ms, &(x[0]));
          ST(&(x[WS(rs, 2)]), VADD(rotB, reB), ms, &(x[0]));
     }
}
/*
 * r2cf_64 -- machine-generated, fully unrolled 64-point codelet that turns
 * 64 real inputs (split across R0 and R1) into a half-spectrum in Cr / Ci.
 *
 * Data flow per iteration (all visible in the body):
 *   reads   R0[WS(rs, 0..31)] and R1[WS(rs, 0..31)]        (64 values);
 *   writes  Cr[WS(csr, 0..32)] and Ci[WS(csi, 1..31)]      (33 + 31 values).
 *           Ci[0] and Ci[WS(csi, 32)] are never written here.
 *
 * Structure of the body:
 *   - eight leading blocks each load eight inputs and form sums and
 *     differences, some already rotated by the 1/sqrt(2) and
 *     cos/sin(pi/8) constants;
 *   - the following blocks combine those partial results and store the
 *     outputs, first the indices that are multiples of 8, then 4 mod 8,
 *     then 2 mod 4, and finally the odd indices.
 *
 * Loop parameters:
 *   v        number of transforms; the loop counts i = v down to 1.
 *   ivs      added to R0 and R1 after each transform (input side).
 *   ovs      added to Cr and Ci after each transform (output side).
 *   rs, csr, csi
 *            element strides, always applied through WS().
 *
 * Constants: KP707106781 ~ cos(pi/4); KP923879532 / KP382683432 ~
 * cos / sin(pi/8); KP980785280 / KP195090322 ~ cos / sin(pi/16);
 * KP831469612 / KP555570233 ~ cos / sin(3*pi/16); KP995184726 / KP098017140,
 * KP956940335 / KP290284677, KP881921264 / KP471396736 and
 * KP773010453 / KP634393284 ~ cos / sin of pi/32, 3*pi/32, 5*pi/32 and
 * 7*pi/32.
 *
 * NOTE(review): the name suggests a forward real-to-complex transform with
 * R0 holding even-indexed and R1 odd-indexed samples -- confirm against the
 * caller.  E, DK, FMA, FNMS, WS and MAKE_VOLATILE_STRIDE are defined outside
 * this chunk.  The T* identifiers are generator-assigned temporaries.
 */
static void r2cf_64(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) { DK(KP773010453, +0.773010453362736960810906609758469800971041293); DK(KP634393284, +0.634393284163645498215171613225493370675687095); DK(KP098017140, +0.098017140329560601994195563888641845861136673); DK(KP995184726, +0.995184726672196886244836953109479921575474869); DK(KP290284677, +0.290284677254462367636192375817395274691476278); DK(KP956940335, +0.956940335732208864935797886980269969482849206); DK(KP471396736, +0.471396736825997648556387625905254377657460319); DK(KP881921264, +0.881921264348355029712756863660388349508442621); DK(KP195090322, +0.195090322016128267848284868477022240927691618); DK(KP980785280, +0.980785280403230449126182236134239036973933731); DK(KP555570233, +0.555570233019602224742830813948532874374937191); DK(KP831469612, +0.831469612302545237078788377617905756738560812); DK(KP382683432, +0.382683432365089771728459984030398866761344562); DK(KP923879532, +0.923879532511286756128183189396788286822416626); DK(KP707106781, +0.707106781186547524400844362104849039284835938); INT i; for (i = v; i > 0; i = i - 1, R0 = R0 + ivs, R1 = R1 + ivs, Cr = Cr + ovs, Ci = Ci + ovs, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) { E T4l, T5a, T15, T3n, T2T, T3Q, T7, Te, Tf, T4A, T4L, T1X, T3B, T23, T3y; E T5I, T66, T4R, T52, T2j, T3F, T2H, T3I, T5P, T69, T1i, T3t, T1l, T3u, TZ; E T63, T4v, T58, T1r, T3r, T1u, T3q, TK, T62, T4s, T57, Tm, Tt, Tu, T4o; E T5b, T1c, T3R, T2Q, T3o, T1M, T3z, T5L, T67, T26, T3C, T4H, T4M, T2y, T3J; E T5S, T6a, T2C, T3G, T4Y, T53; { E T3, T11, Td, T13, T6, T2S, Ta, T12, T14, T2R; { E T1, T2, Tb, Tc; T1 = R0[0]; T2 = R0[WS(rs, 16)]; T3 = T1 + T2; T11 = T1 - T2; Tb = R0[WS(rs, 28)]; Tc = R0[WS(rs, 12)]; Td = Tb + Tc; T13 = Tb - Tc; } { E T4, T5, T8, T9; T4 = R0[WS(rs, 8)]; T5 = R0[WS(rs, 24)]; T6 = T4 + T5; T2S = T4 - T5; T8 = R0[WS(rs, 4)]; T9 = R0[WS(rs, 20)]; Ta = T8 + T9; T12 = T8 - T9; } T4l = 
T3 - T6; T5a = Td - Ta; T14 = KP707106781 * (T12 + T13); T15 = T11 + T14; T3n = T11 - T14; T2R = KP707106781 * (T13 - T12); T2T = T2R - T2S; T3Q = T2S + T2R; T7 = T3 + T6; Te = Ta + Td; Tf = T7 + Te; } { E T1P, T4J, T21, T4y, T1S, T4K, T1W, T4z; { E T1N, T1O, T1Z, T20; T1N = R1[WS(rs, 28)]; T1O = R1[WS(rs, 12)]; T1P = T1N - T1O; T4J = T1N + T1O; T1Z = R1[0]; T20 = R1[WS(rs, 16)]; T21 = T1Z - T20; T4y = T1Z + T20; } { E T1Q, T1R, T1U, T1V; T1Q = R1[WS(rs, 4)]; T1R = R1[WS(rs, 20)]; T1S = T1Q - T1R; T4K = T1Q + T1R; T1U = R1[WS(rs, 8)]; T1V = R1[WS(rs, 24)]; T1W = T1U - T1V; T4z = T1U + T1V; } T4A = T4y - T4z; T4L = T4J - T4K; { E T1T, T22, T5G, T5H; T1T = KP707106781 * (T1P - T1S); T1X = T1T - T1W; T3B = T1W + T1T; T22 = KP707106781 * (T1S + T1P); T23 = T21 + T22; T3y = T21 - T22; T5G = T4y + T4z; T5H = T4K + T4J; T5I = T5G + T5H; T66 = T5G - T5H; } } { E T2b, T4P, T2G, T4Q, T2e, T51, T2h, T50; { E T29, T2a, T2E, T2F; T29 = R1[WS(rs, 31)]; T2a = R1[WS(rs, 15)]; T2b = T29 - T2a; T4P = T29 + T2a; T2E = R1[WS(rs, 7)]; T2F = R1[WS(rs, 23)]; T2G = T2E - T2F; T4Q = T2E + T2F; } { E T2c, T2d, T2f, T2g; T2c = R1[WS(rs, 3)]; T2d = R1[WS(rs, 19)]; T2e = T2c - T2d; T51 = T2c + T2d; T2f = R1[WS(rs, 27)]; T2g = R1[WS(rs, 11)]; T2h = T2f - T2g; T50 = T2f + T2g; } T4R = T4P - T4Q; T52 = T50 - T51; { E T2i, T2D, T5N, T5O; T2i = KP707106781 * (T2e + T2h); T2j = T2b + T2i; T3F = T2b - T2i; T2D = KP707106781 * (T2h - T2e); T2H = T2D - T2G; T3I = T2G + T2D; T5N = T4P + T4Q; T5O = T51 + T50; T5P = T5N + T5O; T69 = T5N - T5O; } } { E TN, T1e, TX, T1g, TQ, T1k, TU, T1f, T1h, T1j; { E TL, TM, TV, TW; TL = R0[WS(rs, 31)]; TM = R0[WS(rs, 15)]; TN = TL + TM; T1e = TL - TM; TV = R0[WS(rs, 27)]; TW = R0[WS(rs, 11)]; TX = TV + TW; T1g = TV - TW; } { E TO, TP, TS, TT; TO = R0[WS(rs, 7)]; TP = R0[WS(rs, 23)]; TQ = TO + TP; T1k = TO - TP; TS = R0[WS(rs, 3)]; TT = R0[WS(rs, 19)]; TU = TS + TT; T1f = TS - TT; } T1h = KP707106781 * (T1f + T1g); T1i = T1e + T1h; T3t = T1e - T1h; T1j = KP707106781 * 
(T1g - T1f); T1l = T1j - T1k; T3u = T1k + T1j; { E TR, TY, T4t, T4u; TR = TN + TQ; TY = TU + TX; TZ = TR + TY; T63 = TR - TY; T4t = TN - TQ; T4u = TX - TU; T4v = FNMS(KP382683432, T4u, KP923879532 * T4t); T58 = FMA(KP382683432, T4t, KP923879532 * T4u); } } { E Ty, T1s, TI, T1n, TB, T1q, TF, T1o, T1p, T1t; { E Tw, Tx, TG, TH; Tw = R0[WS(rs, 1)]; Tx = R0[WS(rs, 17)]; Ty = Tw + Tx; T1s = Tw - Tx; TG = R0[WS(rs, 29)]; TH = R0[WS(rs, 13)]; TI = TG + TH; T1n = TG - TH; } { E Tz, TA, TD, TE; Tz = R0[WS(rs, 9)]; TA = R0[WS(rs, 25)]; TB = Tz + TA; T1q = Tz - TA; TD = R0[WS(rs, 5)]; TE = R0[WS(rs, 21)]; TF = TD + TE; T1o = TD - TE; } T1p = KP707106781 * (T1n - T1o); T1r = T1p - T1q; T3r = T1q + T1p; T1t = KP707106781 * (T1o + T1n); T1u = T1s + T1t; T3q = T1s - T1t; { E TC, TJ, T4q, T4r; TC = Ty + TB; TJ = TF + TI; TK = TC + TJ; T62 = TC - TJ; T4q = Ty - TB; T4r = TI - TF; T4s = FMA(KP923879532, T4q, KP382683432 * T4r); T57 = FNMS(KP382683432, T4q, KP923879532 * T4r); } } { E Ti, T16, Ts, T1a, Tl, T17, Tp, T19, T4m, T4n; { E Tg, Th, Tq, Tr; Tg = R0[WS(rs, 2)]; Th = R0[WS(rs, 18)]; Ti = Tg + Th; T16 = Tg - Th; Tq = R0[WS(rs, 6)]; Tr = R0[WS(rs, 22)]; Ts = Tq + Tr; T1a = Tq - Tr; } { E Tj, Tk, Tn, To; Tj = R0[WS(rs, 10)]; Tk = R0[WS(rs, 26)]; Tl = Tj + Tk; T17 = Tj - Tk; Tn = R0[WS(rs, 30)]; To = R0[WS(rs, 14)]; Tp = Tn + To; T19 = Tn - To; } Tm = Ti + Tl; Tt = Tp + Ts; Tu = Tm + Tt; T4m = Ti - Tl; T4n = Tp - Ts; T4o = KP707106781 * (T4m + T4n); T5b = KP707106781 * (T4n - T4m); { E T18, T1b, T2O, T2P; T18 = FNMS(KP382683432, T17, KP923879532 * T16); T1b = FMA(KP923879532, T19, KP382683432 * T1a); T1c = T18 + T1b; T3R = T1b - T18; T2O = FNMS(KP923879532, T1a, KP382683432 * T19); T2P = FMA(KP382683432, T16, KP923879532 * T17); T2Q = T2O - T2P; T3o = T2P + T2O; } } { E T1A, T4E, T1K, T4C, T1D, T4F, T1H, T4B; { E T1y, T1z, T1I, T1J; T1y = R1[WS(rs, 30)]; T1z = R1[WS(rs, 14)]; T1A = T1y - T1z; T4E = T1y + T1z; T1I = R1[WS(rs, 10)]; T1J = R1[WS(rs, 26)]; T1K = T1I - T1J; T4C = T1I + 
T1J; } { E T1B, T1C, T1F, T1G; T1B = R1[WS(rs, 6)]; T1C = R1[WS(rs, 22)]; T1D = T1B - T1C; T4F = T1B + T1C; T1F = R1[WS(rs, 2)]; T1G = R1[WS(rs, 18)]; T1H = T1F - T1G; T4B = T1F + T1G; } { E T1E, T1L, T5J, T5K; T1E = FNMS(KP923879532, T1D, KP382683432 * T1A); T1L = FMA(KP382683432, T1H, KP923879532 * T1K); T1M = T1E - T1L; T3z = T1L + T1E; T5J = T4B + T4C; T5K = T4E + T4F; T5L = T5J + T5K; T67 = T5K - T5J; } { E T24, T25, T4D, T4G; T24 = FNMS(KP382683432, T1K, KP923879532 * T1H); T25 = FMA(KP923879532, T1A, KP382683432 * T1D); T26 = T24 + T25; T3C = T25 - T24; T4D = T4B - T4C; T4G = T4E - T4F; T4H = KP707106781 * (T4D + T4G); T4M = KP707106781 * (T4G - T4D); } } { E T2m, T4S, T2w, T4W, T2p, T4T, T2t, T4V; { E T2k, T2l, T2u, T2v; T2k = R1[WS(rs, 1)]; T2l = R1[WS(rs, 17)]; T2m = T2k - T2l; T4S = T2k + T2l; T2u = R1[WS(rs, 5)]; T2v = R1[WS(rs, 21)]; T2w = T2u - T2v; T4W = T2u + T2v; } { E T2n, T2o, T2r, T2s; T2n = R1[WS(rs, 9)]; T2o = R1[WS(rs, 25)]; T2p = T2n - T2o; T4T = T2n + T2o; T2r = R1[WS(rs, 29)]; T2s = R1[WS(rs, 13)]; T2t = T2r - T2s; T4V = T2r + T2s; } { E T2q, T2x, T5Q, T5R; T2q = FNMS(KP382683432, T2p, KP923879532 * T2m); T2x = FMA(KP923879532, T2t, KP382683432 * T2w); T2y = T2q + T2x; T3J = T2x - T2q; T5Q = T4S + T4T; T5R = T4V + T4W; T5S = T5Q + T5R; T6a = T5R - T5Q; } { E T2A, T2B, T4U, T4X; T2A = FNMS(KP923879532, T2w, KP382683432 * T2t); T2B = FMA(KP382683432, T2m, KP923879532 * T2p); T2C = T2A - T2B; T3G = T2B + T2A; T4U = T4S - T4T; T4X = T4V - T4W; T4Y = KP707106781 * (T4U + T4X); T53 = KP707106781 * (T4X - T4U); } } { E Tv, T10, T5X, T5Y, T5Z, T60; Tv = Tf + Tu; T10 = TK + TZ; T5X = Tv + T10; T5Y = T5I + T5L; T5Z = T5P + T5S; T60 = T5Y + T5Z; Cr[WS(csr, 16)] = Tv - T10; Ci[WS(csi, 16)] = T5Z - T5Y; Cr[WS(csr, 32)] = T5X - T60; Cr[0] = T5X + T60; } { E T5F, T5V, T5U, T5W, T5M, T5T; T5F = Tf - Tu; T5V = TZ - TK; T5M = T5I - T5L; T5T = T5P - T5S; T5U = KP707106781 * (T5M + T5T); T5W = KP707106781 * (T5T - T5M); Cr[WS(csr, 24)] = T5F - T5U; Ci[WS(csi, 
24)] = T5W - T5V; Cr[WS(csr, 8)] = T5F + T5U; Ci[WS(csi, 8)] = T5V + T5W; } { E T65, T6l, T6k, T6m, T6c, T6g, T6f, T6h; { E T61, T64, T6i, T6j; T61 = T7 - Te; T64 = KP707106781 * (T62 + T63); T65 = T61 + T64; T6l = T61 - T64; T6i = FNMS(KP382683432, T66, KP923879532 * T67); T6j = FMA(KP382683432, T69, KP923879532 * T6a); T6k = T6i + T6j; T6m = T6j - T6i; } { E T68, T6b, T6d, T6e; T68 = FMA(KP923879532, T66, KP382683432 * T67); T6b = FNMS(KP382683432, T6a, KP923879532 * T69); T6c = T68 + T6b; T6g = T6b - T68; T6d = KP707106781 * (T63 - T62); T6e = Tt - Tm; T6f = T6d - T6e; T6h = T6e + T6d; } Cr[WS(csr, 28)] = T65 - T6c; Ci[WS(csi, 28)] = T6k - T6h; Cr[WS(csr, 4)] = T65 + T6c; Ci[WS(csi, 4)] = T6h + T6k; Ci[WS(csi, 12)] = T6f + T6g; Cr[WS(csr, 12)] = T6l + T6m; Ci[WS(csi, 20)] = T6g - T6f; Cr[WS(csr, 20)] = T6l - T6m; } { E T5n, T5D, T5x, T5z, T5q, T5A, T5t, T5B; { E T5l, T5m, T5v, T5w; T5l = T4l - T4o; T5m = T58 - T57; T5n = T5l + T5m; T5D = T5l - T5m; T5v = T4v - T4s; T5w = T5b - T5a; T5x = T5v - T5w; T5z = T5w + T5v; } { E T5o, T5p, T5r, T5s; T5o = T4A - T4H; T5p = T4M - T4L; T5q = FMA(KP831469612, T5o, KP555570233 * T5p); T5A = FNMS(KP555570233, T5o, KP831469612 * T5p); T5r = T4R - T4Y; T5s = T53 - T52; T5t = FNMS(KP555570233, T5s, KP831469612 * T5r); T5B = FMA(KP555570233, T5r, KP831469612 * T5s); } { E T5u, T5C, T5y, T5E; T5u = T5q + T5t; Cr[WS(csr, 26)] = T5n - T5u; Cr[WS(csr, 6)] = T5n + T5u; T5C = T5A + T5B; Ci[WS(csi, 6)] = T5z + T5C; Ci[WS(csi, 26)] = T5C - T5z; T5y = T5t - T5q; Ci[WS(csi, 10)] = T5x + T5y; Ci[WS(csi, 22)] = T5y - T5x; T5E = T5B - T5A; Cr[WS(csr, 22)] = T5D - T5E; Cr[WS(csr, 10)] = T5D + T5E; } } { E T4x, T5j, T5d, T5f, T4O, T5g, T55, T5h; { E T4p, T4w, T59, T5c; T4p = T4l + T4o; T4w = T4s + T4v; T4x = T4p + T4w; T5j = T4p - T4w; T59 = T57 + T58; T5c = T5a + T5b; T5d = T59 - T5c; T5f = T5c + T59; } { E T4I, T4N, T4Z, T54; T4I = T4A + T4H; T4N = T4L + T4M; T4O = FMA(KP980785280, T4I, KP195090322 * T4N); T5g = FNMS(KP195090322, T4I, 
KP980785280 * T4N); T4Z = T4R + T4Y; T54 = T52 + T53; T55 = FNMS(KP195090322, T54, KP980785280 * T4Z); T5h = FMA(KP195090322, T4Z, KP980785280 * T54); } { E T56, T5i, T5e, T5k; T56 = T4O + T55; Cr[WS(csr, 30)] = T4x - T56; Cr[WS(csr, 2)] = T4x + T56; T5i = T5g + T5h; Ci[WS(csi, 2)] = T5f + T5i; Ci[WS(csi, 30)] = T5i - T5f; T5e = T55 - T4O; Ci[WS(csi, 14)] = T5d + T5e; Ci[WS(csi, 18)] = T5e - T5d; T5k = T5h - T5g; Cr[WS(csr, 18)] = T5j - T5k; Cr[WS(csr, 14)] = T5j + T5k; } } { E T3p, T41, T4c, T3S, T3w, T4b, T49, T4h, T3P, T42, T3E, T3W, T46, T4g, T3L; E T3X; { E T3s, T3v, T3A, T3D; T3p = T3n + T3o; T41 = T3n - T3o; T4c = T3R - T3Q; T3S = T3Q + T3R; T3s = FMA(KP831469612, T3q, KP555570233 * T3r); T3v = FNMS(KP555570233, T3u, KP831469612 * T3t); T3w = T3s + T3v; T4b = T3v - T3s; { E T47, T48, T3N, T3O; T47 = T3F - T3G; T48 = T3J - T3I; T49 = FNMS(KP471396736, T48, KP881921264 * T47); T4h = FMA(KP471396736, T47, KP881921264 * T48); T3N = FNMS(KP555570233, T3q, KP831469612 * T3r); T3O = FMA(KP555570233, T3t, KP831469612 * T3u); T3P = T3N + T3O; T42 = T3O - T3N; } T3A = T3y + T3z; T3D = T3B + T3C; T3E = FMA(KP956940335, T3A, KP290284677 * T3D); T3W = FNMS(KP290284677, T3A, KP956940335 * T3D); { E T44, T45, T3H, T3K; T44 = T3y - T3z; T45 = T3C - T3B; T46 = FMA(KP881921264, T44, KP471396736 * T45); T4g = FNMS(KP471396736, T44, KP881921264 * T45); T3H = T3F + T3G; T3K = T3I + T3J; T3L = FNMS(KP290284677, T3K, KP956940335 * T3H); T3X = FMA(KP290284677, T3H, KP956940335 * T3K); } } { E T3x, T3M, T3V, T3Y; T3x = T3p + T3w; T3M = T3E + T3L; Cr[WS(csr, 29)] = T3x - T3M; Cr[WS(csr, 3)] = T3x + T3M; T3V = T3S + T3P; T3Y = T3W + T3X; Ci[WS(csi, 3)] = T3V + T3Y; Ci[WS(csi, 29)] = T3Y - T3V; } { E T3T, T3U, T3Z, T40; T3T = T3P - T3S; T3U = T3L - T3E; Ci[WS(csi, 13)] = T3T + T3U; Ci[WS(csi, 19)] = T3U - T3T; T3Z = T3p - T3w; T40 = T3X - T3W; Cr[WS(csr, 19)] = T3Z - T40; Cr[WS(csr, 13)] = T3Z + T40; } { E T43, T4a, T4f, T4i; T43 = T41 + T42; T4a = T46 + T49; Cr[WS(csr, 27)] = T43 - 
T4a; Cr[WS(csr, 5)] = T43 + T4a; T4f = T4c + T4b; T4i = T4g + T4h; Ci[WS(csi, 5)] = T4f + T4i; Ci[WS(csi, 27)] = T4i - T4f; } { E T4d, T4e, T4j, T4k; T4d = T4b - T4c; T4e = T49 - T46; Ci[WS(csi, 11)] = T4d + T4e; Ci[WS(csi, 21)] = T4e - T4d; T4j = T41 - T42; T4k = T4h - T4g; Cr[WS(csr, 21)] = T4j - T4k; Cr[WS(csr, 11)] = T4j + T4k; } } { E T1d, T33, T3e, T2U, T1w, T3d, T3b, T3j, T2N, T34, T28, T2Y, T38, T3i, T2J; E T2Z; { E T1m, T1v, T1Y, T27; T1d = T15 - T1c; T33 = T15 + T1c; T3e = T2T + T2Q; T2U = T2Q - T2T; T1m = FMA(KP195090322, T1i, KP980785280 * T1l); T1v = FNMS(KP195090322, T1u, KP980785280 * T1r); T1w = T1m - T1v; T3d = T1v + T1m; { E T39, T3a, T2L, T2M; T39 = T2j + T2y; T3a = T2H + T2C; T3b = FNMS(KP098017140, T3a, KP995184726 * T39); T3j = FMA(KP995184726, T3a, KP098017140 * T39); T2L = FNMS(KP195090322, T1l, KP980785280 * T1i); T2M = FMA(KP980785280, T1u, KP195090322 * T1r); T2N = T2L - T2M; T34 = T2M + T2L; } T1Y = T1M - T1X; T27 = T23 - T26; T28 = FMA(KP634393284, T1Y, KP773010453 * T27); T2Y = FNMS(KP634393284, T27, KP773010453 * T1Y); { E T36, T37, T2z, T2I; T36 = T1X + T1M; T37 = T23 + T26; T38 = FMA(KP098017140, T36, KP995184726 * T37); T3i = FNMS(KP098017140, T37, KP995184726 * T36); T2z = T2j - T2y; T2I = T2C - T2H; T2J = FNMS(KP634393284, T2I, KP773010453 * T2z); T2Z = FMA(KP773010453, T2I, KP634393284 * T2z); } } { E T1x, T2K, T2X, T30; T1x = T1d + T1w; T2K = T28 + T2J; Cr[WS(csr, 25)] = T1x - T2K; Cr[WS(csr, 7)] = T1x + T2K; T2X = T2U + T2N; T30 = T2Y + T2Z; Ci[WS(csi, 7)] = T2X + T30; Ci[WS(csi, 25)] = T30 - T2X; } { E T2V, T2W, T31, T32; T2V = T2N - T2U; T2W = T2J - T28; Ci[WS(csi, 9)] = T2V + T2W; Ci[WS(csi, 23)] = T2W - T2V; T31 = T1d - T1w; T32 = T2Z - T2Y; Cr[WS(csr, 23)] = T31 - T32; Cr[WS(csr, 9)] = T31 + T32; } { E T35, T3c, T3h, T3k; T35 = T33 + T34; T3c = T38 + T3b; Cr[WS(csr, 31)] = T35 - T3c; Cr[WS(csr, 1)] = T35 + T3c; T3h = T3e + T3d; T3k = T3i + T3j; Ci[WS(csi, 1)] = T3h + T3k; Ci[WS(csi, 31)] = T3k - T3h; } { E T3f, T3g, T3l, 
T3m; T3f = T3d - T3e; T3g = T3b - T38; Ci[WS(csi, 15)] = T3f + T3g; Ci[WS(csi, 17)] = T3g - T3f; T3l = T33 - T34; T3m = T3j - T3i; Cr[WS(csr, 17)] = T3l - T3m; Cr[WS(csr, 15)] = T3l + T3m; } } } }
static void r2cb_14(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) { DK(KP1_801937735, +1.801937735804838252472204639014890102331838324); DK(KP445041867, +0.445041867912628808577805128993589518932711138); DK(KP1_246979603, +1.246979603717467061050009768008479621264549462); DK(KP867767478, +0.867767478235116240951536665696717509219981456); DK(KP1_949855824, +1.949855824363647214036263365987862434465571601); DK(KP1_563662964, +1.563662964936059617416889053348115500464669037); DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); { INT i; for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(56, rs), MAKE_VOLATILE_STRIDE(56, csr), MAKE_VOLATILE_STRIDE(56, csi)) { E T3, Td, T6, Te, Tq, Tz, Tn, Ty, Tc, Tg, Tk, Tx, T9, Tf, T1; E T2; T1 = Cr[0]; T2 = Cr[WS(csr, 7)]; T3 = T1 - T2; Td = T1 + T2; { E T4, T5, To, Tp; T4 = Cr[WS(csr, 2)]; T5 = Cr[WS(csr, 5)]; T6 = T4 - T5; Te = T4 + T5; To = Ci[WS(csi, 2)]; Tp = Ci[WS(csi, 5)]; Tq = To - Tp; Tz = To + Tp; } { E Tl, Tm, Ta, Tb; Tl = Ci[WS(csi, 6)]; Tm = Ci[WS(csi, 1)]; Tn = Tl - Tm; Ty = Tl + Tm; Ta = Cr[WS(csr, 6)]; Tb = Cr[WS(csr, 1)]; Tc = Ta - Tb; Tg = Ta + Tb; } { E Ti, Tj, T7, T8; Ti = Ci[WS(csi, 4)]; Tj = Ci[WS(csi, 3)]; Tk = Ti - Tj; Tx = Ti + Tj; T7 = Cr[WS(csr, 4)]; T8 = Cr[WS(csr, 3)]; T9 = T7 - T8; Tf = T7 + T8; } R1[WS(rs, 3)] = FMA(KP2_000000000, T6 + T9 + Tc, T3); R0[0] = FMA(KP2_000000000, Te + Tf + Tg, Td); { E Tr, Th, TE, TD; Tr = FNMS(KP1_949855824, Tn, KP1_563662964 * Tk) - (KP867767478 * Tq); Th = FMA(KP1_246979603, Tf, Td) + FNMA(KP445041867, Tg, KP1_801937735 * Te); R0[WS(rs, 2)] = Th - Tr; R0[WS(rs, 5)] = Th + Tr; TE = FMA(KP867767478, Tx, KP1_563662964 * Ty) - (KP1_949855824 * Tz); TD = FMA(KP1_246979603, Tc, T3) + FNMA(KP1_801937735, T9, KP445041867 * T6); R1[WS(rs, 2)] = TD - TE; R1[WS(rs, 4)] = TD + TE; } { E Tt, Ts, TA, Tw; Tt = FMA(KP867767478, Tk, KP1_563662964 * Tn) - (KP1_949855824 * 
Tq); Ts = FMA(KP1_246979603, Tg, Td) + FNMA(KP1_801937735, Tf, KP445041867 * Te); R0[WS(rs, 6)] = Ts - Tt; R0[WS(rs, 1)] = Ts + Tt; TA = FNMS(KP1_949855824, Ty, KP1_563662964 * Tx) - (KP867767478 * Tz); Tw = FMA(KP1_246979603, T9, T3) + FNMA(KP445041867, Tc, KP1_801937735 * T6); R1[WS(rs, 5)] = Tw - TA; R1[WS(rs, 1)] = Tw + TA; } { E TC, TB, Tv, Tu; TC = FMA(KP1_563662964, Tz, KP1_949855824 * Tx) + (KP867767478 * Ty); TB = FMA(KP1_246979603, T6, T3) + FNMA(KP1_801937735, Tc, KP445041867 * T9); R1[0] = TB - TC; R1[WS(rs, 6)] = TB + TC; Tv = FMA(KP1_563662964, Tq, KP1_949855824 * Tk) + (KP867767478 * Tn); Tu = FMA(KP1_246979603, Te, Td) + FNMA(KP1_801937735, Tg, KP445041867 * Tf); R0[WS(rs, 4)] = Tu - Tv; R0[WS(rs, 3)] = Tu + Tv; } } } }
/*
 * n1fv_13 -- variant built from separate VMUL/VADD/VSUB steps plus
 * VFMA/VFNMS/VFMS.
 *
 * Presumably a length-13 forward complex DFT SIMD kernel
 * (NOTE(review): inferred from the name only; confirm with the generator).
 *
 * What the body itself shows:
 *  - Only ri and ro are used (through xi and xo); ii and io are never
 *    referenced.
 *  - i counts down from v in steps of VL; after every pass xi advances by
 *    VL * ivs and xo by VL * ovs.
 *  - Inputs are read with LD from xi[0] and xi[WS(is, k)], k = 1..12.
 *    Outputs are written with ST to xo[0] and xo[WS(os, k)], k = 1..12.
 *  - The last argument of LD/ST is &(x[0]) for even k and &(x[WS(.., 1)])
 *    for odd k.
 *  - xo[0] is the plain sum of all 13 inputs (TW + TX).
 *  - The other outputs are emitted as pairs (1,12), (5,8), (4,9), (3,10),
 *    (6,7), (2,11): one value that went through VBYI is added to /
 *    subtracted from one that did not.
 *  - VLEAVE() is called once after the loop.
 *
 * NOTE(review): a second function with the same name and signature follows
 * later in this file; presumably only one of the two is meant to be
 * compiled at a time -- verify how the file is built.
 */
static void n1fv_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) { DVK(KP2_000000000, +2.000000000000000000000000000000000000000000000); DVK(KP083333333, +0.083333333333333333333333333333333333333333333); DVK(KP075902986, +0.075902986037193865983102897245103540356428373); DVK(KP251768516, +0.251768516431883313623436926934233488546674281); DVK(KP132983124, +0.132983124607418643793760531921092974399165133); DVK(KP258260390, +0.258260390311744861420450644284508567852516811); DVK(KP1_732050807, +1.732050807568877293527446341505872366942805254); DVK(KP300238635, +0.300238635966332641462884626667381504676006424); DVK(KP011599105, +0.011599105605768290721655456654083252189827041); DVK(KP156891391, +0.156891391051584611046832726756003269660212636); DVK(KP256247671, +0.256247671582936600958684654061725059144125175); DVK(KP174138601, +0.174138601152135905005660794929264742616964676); DVK(KP575140729, +0.575140729474003121368385547455453388461001608); DVK(KP503537032, +0.503537032863766627246873853868466977093348562); DVK(KP113854479, +0.113854479055790798974654345867655310534642560); DVK(KP265966249, +0.265966249214837287587521063842185948798330267); DVK(KP387390585, +0.387390585467617292130675966426762851778775217); DVK(KP300462606, +0.300462606288665774426601772289207995520941381); DVK(KP866025403, +0.866025403784438646763723170752936183471402627); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); { INT i; const R *xi; R *xo; xi = ri; xo = ro; for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(26, is), MAKE_VOLATILE_STRIDE(26, os)) { V TW, Tb, Tm, Tu, TC, TR, TX, TK, TU, Tz, TB, TN, TT; TW = LD(&(xi[0]), ivs, &(xi[0])); { V T3, TH, Tl, Tw, Tp, Tg, Tv, To, T6, Tr, T9, Ts, Ta, TI, T1; V T2, Tq, Tt; T1 = LD(&(xi[WS(is, 8)]), ivs, &(xi[0])); T2 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); T3 = VSUB(T1, T2); TH = VADD(T1, T2); { V Th, Ti, Tj, Tk; Th = LD(&(xi[WS(is, 12)]), 
ivs, &(xi[0])); Ti = LD(&(xi[WS(is, 10)]), ivs, &(xi[0])); Tj = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); Tk = VADD(Ti, Tj); Tl = VADD(Th, Tk); Tw = VSUB(Ti, Tj); Tp = VFNMS(LDK(KP500000000), Tk, Th); } { V Tc, Td, Te, Tf; Tc = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); Td = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); Te = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); Tf = VADD(Td, Te); Tg = VADD(Tc, Tf); Tv = VSUB(Td, Te); To = VFNMS(LDK(KP500000000), Tf, Tc); } { V T4, T5, T7, T8; T4 = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)])); T5 = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); T6 = VSUB(T4, T5); Tr = VADD(T4, T5); T7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); T8 = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); T9 = VSUB(T7, T8); Ts = VADD(T7, T8); } Ta = VADD(T6, T9); TI = VADD(Tr, Ts); Tb = VADD(T3, Ta); Tm = VSUB(Tg, Tl); Tq = VSUB(To, Tp); Tt = VMUL(LDK(KP866025403), VSUB(Tr, Ts)); Tu = VADD(Tq, Tt); TC = VSUB(Tq, Tt); { V TP, TQ, TG, TJ; TP = VADD(Tg, Tl); TQ = VADD(TH, TI); TR = VMUL(LDK(KP300462606), VSUB(TP, TQ)); TX = VADD(TP, TQ); TG = VADD(To, Tp); TJ = VFNMS(LDK(KP500000000), TI, TH); TK = VSUB(TG, TJ); TU = VADD(TG, TJ); } { V Tx, Ty, TL, TM; Tx = VMUL(LDK(KP866025403), VSUB(Tv, Tw)); Ty = VFNMS(LDK(KP500000000), Ta, T3); Tz = VSUB(Tx, Ty); TB = VADD(Tx, Ty); TL = VADD(Tv, Tw); TM = VSUB(T6, T9); TN = VSUB(TL, TM); TT = VADD(TL, TM); } } /* TX is the sum of inputs 1..12, so xo[0] is the sum of all 13 inputs */ ST(&(xo[0]), VADD(TW, TX), ovs, &(xo[0])); { V T19, T1n, T14, T13, T1f, T1k, Tn, TE, T1e, T1j, TS, T1m, TZ, T1c, TA; V TD; { V T17, T18, T11, T12; T17 = VFMA(LDK(KP387390585), TN, VMUL(LDK(KP265966249), TK)); T18 = VFNMS(LDK(KP503537032), TU, VMUL(LDK(KP113854479), TT)); T19 = VSUB(T17, T18); T1n = VADD(T17, T18); T14 = VFMA(LDK(KP575140729), Tm, VMUL(LDK(KP174138601), Tb)); T11 = VFNMS(LDK(KP156891391), TB, VMUL(LDK(KP256247671), TC)); T12 = VFMA(LDK(KP011599105), Tz, VMUL(LDK(KP300238635), Tu)); T13 = VSUB(T11, T12); T1f = VADD(T14, T13); T1k = VMUL(LDK(KP1_732050807), VADD(T11, T12)); } Tn = VFNMS(LDK(KP174138601), Tm, 
VMUL(LDK(KP575140729), Tb)); TA = VFNMS(LDK(KP300238635), Tz, VMUL(LDK(KP011599105), Tu)); TD = VFMA(LDK(KP256247671), TB, VMUL(LDK(KP156891391), TC)); TE = VSUB(TA, TD); T1e = VMUL(LDK(KP1_732050807), VADD(TD, TA)); T1j = VSUB(Tn, TE); { V TO, T1b, TV, TY, T1a; TO = VFNMS(LDK(KP132983124), TN, VMUL(LDK(KP258260390), TK)); T1b = VSUB(TR, TO); TV = VFMA(LDK(KP251768516), TT, VMUL(LDK(KP075902986), TU)); TY = VFNMS(LDK(KP083333333), TX, TW); T1a = VSUB(TY, TV); TS = VFMA(LDK(KP2_000000000), TO, TR); T1m = VADD(T1b, T1a); TZ = VFMA(LDK(KP2_000000000), TV, TY); T1c = VSUB(T1a, T1b); } { V TF, T10, T1l, T1o; TF = VBYI(VFMA(LDK(KP2_000000000), TE, Tn)); T10 = VADD(TS, TZ); ST(&(xo[WS(os, 1)]), VADD(TF, T10), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 12)]), VSUB(T10, TF), ovs, &(xo[0])); { V T15, T16, T1p, T1q; T15 = VBYI(VFMS(LDK(KP2_000000000), T13, T14)); T16 = VSUB(TZ, TS); ST(&(xo[WS(os, 5)]), VADD(T15, T16), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 8)]), VSUB(T16, T15), ovs, &(xo[0])); T1p = VADD(T1n, T1m); T1q = VBYI(VADD(T1j, T1k)); ST(&(xo[WS(os, 4)]), VSUB(T1p, T1q), ovs, &(xo[0])); ST(&(xo[WS(os, 9)]), VADD(T1q, T1p), ovs, &(xo[WS(os, 1)])); } T1l = VBYI(VSUB(T1j, T1k)); T1o = VSUB(T1m, T1n); ST(&(xo[WS(os, 3)]), VADD(T1l, T1o), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 10)]), VSUB(T1o, T1l), ovs, &(xo[0])); { V T1h, T1i, T1d, T1g; T1h = VBYI(VSUB(T1e, T1f)); T1i = VSUB(T1c, T19); ST(&(xo[WS(os, 6)]), VADD(T1h, T1i), ovs, &(xo[0])); ST(&(xo[WS(os, 7)]), VSUB(T1i, T1h), ovs, &(xo[WS(os, 1)])); T1d = VADD(T19, T1c); T1g = VBYI(VADD(T1e, T1f)); ST(&(xo[WS(os, 2)]), VSUB(T1d, T1g), ovs, &(xo[0])); ST(&(xo[WS(os, 11)]), VADD(T1g, T1d), ovs, &(xo[WS(os, 1)])); } } } } } VLEAVE(); }
/*
 * r2cb_14 -- formulation made of nested FMA/FNMS chains.
 *
 * Same signature and same set of accessed elements as the r2cb_14 defined
 * just before it in this file: per batch it reads Cr[0], Cr[WS(csr, 1..7)]
 * and Ci[WS(csi, 1..6)], and writes R0[0], R0[WS(rs, 1..6)], R1[0],
 * R1[WS(rs, 1..6)].  The loop runs v times; R0/R1 advance by ovs and Cr/Ci
 * by ivs after each batch.  All loads happen before the first store.
 *
 * Structure:
 *  - Inputs are folded into sums/differences of the index pairs (0,7),
 *    (2,5), (6,1), (4,3), exactly as in the other variant.
 *  - Instead of a three-term dot product, each "cosine" part is a chain
 *    a0 - 1.8019..*(a1 - 0.6920..*(a2 - 0.3568..*a3)) and each "sine" part
 *    is 1.9498..*(b1 -/+ 0.8019..*(b2 -/+ 0.5549..*b3)).
 *  - Multiplying the chains out gives 1.8019.., 1.2469.., 0.4450.. and
 *    1.9498.., 1.5636.., 0.8677.. -- the coefficients the other variant
 *    uses directly (agreeing to the printed digits).
 *  - Every output pair is then FMA / FNMS of the same two operands, i.e.
 *    cos_part +/- 1.9498.. * sin_chain.
 *
 * NOTE(review): two static functions named r2cb_14 cannot both be compiled
 * in one translation unit; presumably they were alternatives selected by
 * the preprocessor -- verify how this file is built.
 */
static void r2cb_14(R *R0, R *R1, R *Cr, R *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) { DK(KP1_949855824, +1.949855824363647214036263365987862434465571601); DK(KP1_801937735, +1.801937735804838252472204639014890102331838324); DK(KP692021471, +0.692021471630095869627814897002069140197260599); DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); DK(KP356895867, +0.356895867892209443894399510021300583399127187); DK(KP801937735, +0.801937735804838252472204639014890102331838324); DK(KP554958132, +0.554958132087371191422194871006410481067288862); { INT i; for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(56, rs), MAKE_VOLATILE_STRIDE(56, csr), MAKE_VOLATILE_STRIDE(56, csi)) { E Te, TO, TT, TG, TJ, TD, TR, TE; { E T3, TK, To, TM, Tu, TL, Tr, TS, TA, TN, TX, TF, Tv, T7, Tf; E T6, Th, Tc, T8, T1, T2; T1 = Cr[0]; T2 = Cr[WS(csr, 7)]; { E Ts, Tt, Tp, Tq, Tm, Tn; Tm = Ci[WS(csi, 4)]; Tn = Ci[WS(csi, 3)]; Ts = Ci[WS(csi, 6)]; Te = T1 + T2; T3 = T1 - T2; TK = Tm + Tn; To = Tm - Tn; Tt = Ci[WS(csi, 1)]; Tp = Ci[WS(csi, 2)]; Tq = Ci[WS(csi, 5)]; { E T4, T5, Ta, Tb; T4 = Cr[WS(csr, 2)]; TM = Ts + Tt; Tu = Ts - Tt; TL = Tp + Tq; Tr = Tp - Tq; TS = FMA(KP554958132, TK, TM); TA = FMA(KP554958132, To, Tu); TN = FMA(KP554958132, TM, TL); TX = FNMS(KP554958132, TL, TK); TF = FNMS(KP554958132, Tr, To); Tv = FMA(KP554958132, Tu, Tr); T5 = Cr[WS(csr, 5)]; Ta = Cr[WS(csr, 6)]; Tb = Cr[WS(csr, 1)]; T7 = Cr[WS(csr, 4)]; Tf = T4 + T5; T6 = T4 - T5; Th = Ta + Tb; Tc = Ta - Tb; T8 = Cr[WS(csr, 3)]; } } { E Tw, Tx, TP, Tg, T9, TY, TC, TI, TQ; Tw = FMA(KP801937735, Tv, To); Tx = FNMS(KP356895867, Tf, Th); TP = FNMS(KP356895867, T6, Tc); Tg = T7 + T8; T9 = T7 - T8; TY = FNMS(KP801937735, TX, TM); { E TB, TH, TV, Ty, Tl, Ti, TW, Tz; TB = FNMS(KP801937735, TA, Tr); Ti = Tf + Tg + Th; TC = FNMS(KP356895867, Th, Tg); { E Tj, Td, TU, Tk; Tj = FNMS(KP356895867, Tg, Tf); Td = T6 + T9 + Tc; TH = 
FNMS(KP356895867, T9, T6); TU = FNMS(KP356895867, Tc, T9); /* Ti = sum of the three Cr pair sums, Te = Cr[0] + Cr[7] */ R0[0] = FMA(KP2_000000000, Ti, Te); Tk = FNMS(KP692021471, Tj, Th); /* Td = sum of the three Cr pair differences, T3 = Cr[0] - Cr[7] */ R1[WS(rs, 3)] = FMA(KP2_000000000, Td, T3); TV = FNMS(KP692021471, TU, T6); Ty = FNMS(KP692021471, Tx, Tg); Tl = FNMS(KP1_801937735, Tk, Te); } TO = FMA(KP801937735, TN, TK); TW = FNMS(KP1_801937735, TV, T3); Tz = FNMS(KP1_801937735, Ty, Te); R0[WS(rs, 3)] = FMA(KP1_949855824, Tw, Tl); R0[WS(rs, 4)] = FNMS(KP1_949855824, Tw, Tl); R1[WS(rs, 5)] = FMA(KP1_949855824, TY, TW); R1[WS(rs, 1)] = FNMS(KP1_949855824, TY, TW); R0[WS(rs, 6)] = FMA(KP1_949855824, TB, Tz); R0[WS(rs, 1)] = FNMS(KP1_949855824, TB, Tz); TI = FNMS(KP692021471, TH, Tc); } TT = FNMS(KP801937735, TS, TL); TQ = FNMS(KP692021471, TP, T9); TG = FNMS(KP801937735, TF, Tu); TJ = FNMS(KP1_801937735, TI, T3); TD = FNMS(KP692021471, TC, Tf); TR = FNMS(KP1_801937735, TQ, T3); } } R1[WS(rs, 6)] = FMA(KP1_949855824, TO, TJ); R1[0] = FNMS(KP1_949855824, TO, TJ); TE = FNMS(KP1_801937735, TD, Te); R1[WS(rs, 2)] = FMA(KP1_949855824, TT, TR); R1[WS(rs, 4)] = FNMS(KP1_949855824, TT, TR); R0[WS(rs, 2)] = FMA(KP1_949855824, TG, TE); R0[WS(rs, 5)] = FNMS(KP1_949855824, TG, TE); } } }
/*
 * n1fv_13 -- variant scheduled as chains of VFMA/VFNMS/VFMS with
 * VFMAI/VFNMSI for the final combinations.
 *
 * Presumably a length-13 forward complex DFT SIMD kernel
 * (NOTE(review): inferred from the name only; confirm with the generator).
 *
 * What the body itself shows:
 *  - Only ri and ro are used (through xi and xo); ii and io are never
 *    referenced.
 *  - i counts down from v in steps of VL; after every pass xi advances by
 *    VL * ivs and xo by VL * ovs.
 *  - All 13 inputs (xi[0], xi[WS(is, 1..12)]) are loaded with LD before any
 *    ST is issued.
 *  - The last argument of LD/ST is &(x[0]) for even k and &(x[WS(.., 1)])
 *    for odd k.
 *  - xo[0] = T1 + To, where To is the sum of inputs 1..12.
 *  - The other outputs are written as pairs (12,1), (8,5), (9,4), (10,3),
 *    (7,6), (11,2); each pair applies VFMAI and VFNMSI to the same two
 *    operands.
 *  - VLEAVE() is called once after the loop.
 *
 * Reading aid: Tx is built with VFNMS but Ty with VFMS.  Assuming
 * VFNMS(a, b, c) = c - a*b and VFMS(a, b, c) = a*b - c, Ty carries the
 * opposite sign of the analogous quantity, so TK = Tx + Ty is effectively a
 * difference and Tz = Tx - Ty a sum.  This lines up with the other n1fv_13
 * in this file (its To - Tp and To + Tp), so the asymmetry looks
 * intentional rather than a typo.
 *
 * NOTE(review): this is the second function named n1fv_13 with this
 * signature in the file; presumably only one is meant to be compiled --
 * verify how the file is built.
 */
static void n1fv_13(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) { DVK(KP904176221, +0.904176221990848204433795481776887926501523162); DVK(KP575140729, +0.575140729474003121368385547455453388461001608); DVK(KP300462606, +0.300462606288665774426601772289207995520941381); DVK(KP516520780, +0.516520780623489722840901288569017135705033622); DVK(KP522026385, +0.522026385161275033714027226654165028300441940); DVK(KP957805992, +0.957805992594665126462521754605754580515587217); DVK(KP600477271, +0.600477271932665282925769253334763009352012849); DVK(KP251768516, +0.251768516431883313623436926934233488546674281); DVK(KP503537032, +0.503537032863766627246873853868466977093348562); DVK(KP769338817, +0.769338817572980603471413688209101117038278899); DVK(KP859542535, +0.859542535098774820163672132761689612766401925); DVK(KP581704778, +0.581704778510515730456870384989698884939833902); DVK(KP853480001, +0.853480001859823990758994934970528322872359049); DVK(KP083333333, +0.083333333333333333333333333333333333333333333); DVK(KP226109445, +0.226109445035782405468510155372505010481906348); DVK(KP301479260, +0.301479260047709873958013540496673347309208464); DVK(KP686558370, +0.686558370781754340655719594850823015421401653); DVK(KP514918778, +0.514918778086315755491789696138117261566051239); DVK(KP038632954, +0.038632954644348171955506895830342264440241080); DVK(KP612264650, +0.612264650376756543746494474777125408779395514); DVK(KP302775637, +0.302775637731994646559610633735247973125648287); DVK(KP866025403, +0.866025403784438646763723170752936183471402627); DVK(KP500000000, +0.500000000000000000000000000000000000000000000); { INT i; const R *xi; R *xo; xi = ri; xo = ro; for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(26, is), MAKE_VOLATILE_STRIDE(26, os)) { V T1, T7, T2, Tg, Tf, TN, Th, Tq, Ta, Tj, T5, Tr, Tk; T1 = LD(&(xi[0]), ivs, &(xi[0])); { V Td, Te, T8, T9, T3, T4; Td = LD(&(xi[WS(is, 8)]), 
ivs, &(xi[0])); Te = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); T7 = LD(&(xi[WS(is, 12)]), ivs, &(xi[0])); T8 = LD(&(xi[WS(is, 10)]), ivs, &(xi[0])); T9 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); T2 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); T3 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); T4 = LD(&(xi[WS(is, 9)]), ivs, &(xi[WS(is, 1)])); Tg = LD(&(xi[WS(is, 11)]), ivs, &(xi[WS(is, 1)])); Tf = VADD(Td, Te); TN = VSUB(Td, Te); Th = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); Tq = VSUB(T8, T9); Ta = VADD(T8, T9); Tj = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); T5 = VADD(T3, T4); Tr = VSUB(T4, T3); Tk = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); } { V Tt, Ti, Ty, Tb, Ts, TQ, Tx, T6, Tu, Tl; Tt = VSUB(Tg, Th); Ti = VADD(Tg, Th); Ty = VFMS(LDK(KP500000000), Ta, T7); Tb = VADD(T7, Ta); Ts = VSUB(Tq, Tr); TQ = VADD(Tr, Tq); Tx = VFNMS(LDK(KP500000000), T5, T2); T6 = VADD(T2, T5); Tu = VSUB(Tj, Tk); Tl = VADD(Tj, Tk); { V TK, Tz, Tc, TX, Tv, TO, TL, Tm; TK = VADD(Tx, Ty); Tz = VSUB(Tx, Ty); Tc = VADD(T6, Tb); TX = VSUB(T6, Tb); Tv = VSUB(Tt, Tu); TO = VADD(Tt, Tu); TL = VSUB(Ti, Tl); Tm = VADD(Ti, Tl); { V TF, Tw, TP, TY, TT, TM, TA, Tn; TF = VSUB(Ts, Tv); Tw = VADD(Ts, Tv); TP = VFNMS(LDK(KP500000000), TO, TN); TY = VADD(TN, TO); TT = VFNMS(LDK(KP866025403), TL, TK); TM = VFMA(LDK(KP866025403), TL, TK); TA = VFNMS(LDK(KP500000000), Tm, Tf); Tn = VADD(Tf, Tm); { V T1f, T1n, TI, T18, T1k, T1c, TD, T17, T10, T1m, T16, T1e, TU, TR; TU = VFNMS(LDK(KP866025403), TQ, TP); TR = VFMA(LDK(KP866025403), TQ, TP); { V TZ, T15, TE, TB; TZ = VFMA(LDK(KP302775637), TY, TX); T15 = VFNMS(LDK(KP302775637), TX, TY); TE = VSUB(Tz, TA); TB = VADD(Tz, TA); { V TH, To, TV, T13; TH = VSUB(Tc, Tn); To = VADD(Tc, Tn); TV = VFNMS(LDK(KP612264650), TU, TT); T13 = VFMA(LDK(KP612264650), TT, TU); { V TS, T12, TG, T1b; TS = VFNMS(LDK(KP038632954), TR, TM); T12 = VFMA(LDK(KP038632954), TM, TR); TG = VFNMS(LDK(KP514918778), TF, TE); T1b = VFMA(LDK(KP686558370), TE, TF); { V TC, T1a, Tp, TW, T14; TC = 
VFMA(LDK(KP301479260), TB, Tw); T1a = VFNMS(LDK(KP226109445), Tw, TB); Tp = VFNMS(LDK(KP083333333), To, T1); /* To is the sum of inputs 1..12, so xo[0] is the sum of all 13 inputs */ ST(&(xo[0]), VADD(T1, To), ovs, &(xo[0])); T1f = VFMA(LDK(KP853480001), TV, TS); TW = VFNMS(LDK(KP853480001), TV, TS); T1n = VFMA(LDK(KP853480001), T13, T12); T14 = VFNMS(LDK(KP853480001), T13, T12); TI = VFMA(LDK(KP581704778), TH, TG); T18 = VFNMS(LDK(KP859542535), TG, TH); T1k = VFMA(LDK(KP769338817), T1b, T1a); T1c = VFNMS(LDK(KP769338817), T1b, T1a); TD = VFMA(LDK(KP503537032), TC, Tp); T17 = VFNMS(LDK(KP251768516), TC, Tp); T10 = VMUL(LDK(KP600477271), VFMA(LDK(KP957805992), TZ, TW)); T1m = VFNMS(LDK(KP522026385), TW, TZ); T16 = VMUL(LDK(KP600477271), VFMA(LDK(KP957805992), T15, T14)); T1e = VFNMS(LDK(KP522026385), T14, T15); } } } } { V T1o, T1q, T1g, T1i, T1d, T1h, T1l, T1p; { V T11, TJ, T19, T1j; T11 = VFMA(LDK(KP516520780), TI, TD); TJ = VFNMS(LDK(KP516520780), TI, TD); T19 = VFMA(LDK(KP300462606), T18, T17); T1j = VFNMS(LDK(KP300462606), T18, T17); T1o = VMUL(LDK(KP575140729), VFNMS(LDK(KP904176221), T1n, T1m)); T1q = VMUL(LDK(KP575140729), VFMA(LDK(KP904176221), T1n, T1m)); T1g = VMUL(LDK(KP575140729), VFMA(LDK(KP904176221), T1f, T1e)); T1i = VMUL(LDK(KP575140729), VFNMS(LDK(KP904176221), T1f, T1e)); ST(&(xo[WS(os, 12)]), VFNMSI(T16, T11), ovs, &(xo[0])); ST(&(xo[WS(os, 1)]), VFMAI(T16, T11), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 8)]), VFMAI(T10, TJ), ovs, &(xo[0])); ST(&(xo[WS(os, 5)]), VFNMSI(T10, TJ), ovs, &(xo[WS(os, 1)])); T1d = VFNMS(LDK(KP503537032), T1c, T19); T1h = VFMA(LDK(KP503537032), T1c, T19); T1l = VFNMS(LDK(KP503537032), T1k, T1j); T1p = VFMA(LDK(KP503537032), T1k, T1j); } ST(&(xo[WS(os, 9)]), VFMAI(T1g, T1d), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 4)]), VFNMSI(T1g, T1d), ovs, &(xo[0])); ST(&(xo[WS(os, 10)]), VFNMSI(T1i, T1h), ovs, &(xo[0])); ST(&(xo[WS(os, 3)]), VFMAI(T1i, T1h), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 7)]), VFMAI(T1o, T1l), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 6)]), VFNMSI(T1o, T1l), ovs, &(xo[0])); 
ST(&(xo[WS(os, 11)]), VFMAI(T1q, T1p), ovs, &(xo[WS(os, 1)])); ST(&(xo[WS(os, 2)]), VFNMSI(T1q, T1p), ovs, &(xo[0])); } } } } } } } VLEAVE(); }
/*
 * hc2cbdft_8 -- in-place kernel over four arrays Rp, Ip, Rm, Im with
 * per-iteration twiddle factors taken from W.
 * (NOTE(review): the name suggests a backward halfcomplex-to-complex step
 * of size 8; that reading is name-based -- confirm with the caller.)
 *
 * Iteration:
 *  - m runs from mb while m < me.  W starts at W + (mb - 1) * 14 and moves
 *    forward by 14 per iteration, so each iteration consumes W[0..13].
 *  - Rp and Ip advance by +ms; Rm and Im advance by -ms.
 *
 * Body:
 *  - Reads Rp, Ip, Rm, Im at offsets 0 and WS(rs, 1..3) (16 values); every
 *    read happens before the first write, so updating in place is safe.
 *  - W is used as seven pairs (W[2j], W[2j+1]).  A pair (a, b) is applied
 *    to two intermediates (x, y) as a*x - b*y and b*x + a*y (complex
 *    multiply pattern).
 *  - Each output index k = 0..3 is written as
 *        Rp[k] = A - B,  Rm[k] = A + B,  Ip[k] = C + D,  Im[k] = D - C.
 *    For k = 0 the (A, C) part is used without a twiddle multiply (Tf, TQ).
 *  - KP707106781 is numerically 1/sqrt(2).
 */
static void hc2cbdft_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) { DK(KP707106781, +0.707106781186547524400844362104849039284835938); { INT m; for (m = mb, W = W + ((mb - 1) * 14); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 14, MAKE_VOLATILE_STRIDE(rs)) { E T1m, T1r, T1i, T1u, T1o, T1v, T1n, T1w, T1s; { E T1k, Tl, T1p, TE, TP, T1g, TM, T1b, T1f, T1a, TU, Tf, T1l, TH, Tw; E T1q; { E TA, T3, TN, Tk, Th, T6, TO, TD, Tb, Tm, Ta, TK, Tp, Tc, Ts; E Tt; { E T4, T5, TB, TC; { E T1, T2, Ti, Tj; T1 = Rp[0]; T2 = Rm[WS(rs, 3)]; Ti = Ip[0]; Tj = Im[WS(rs, 3)]; T4 = Rp[WS(rs, 2)]; TA = T1 - T2; T3 = T1 + T2; TN = Ti - Tj; Tk = Ti + Tj; T5 = Rm[WS(rs, 1)]; TB = Ip[WS(rs, 2)]; TC = Im[WS(rs, 1)]; } { E T8, T9, Tn, To; T8 = Rp[WS(rs, 1)]; Th = T4 - T5; T6 = T4 + T5; TO = TB - TC; TD = TB + TC; T9 = Rm[WS(rs, 2)]; Tn = Ip[WS(rs, 1)]; To = Im[WS(rs, 2)]; Tb = Rm[0]; Tm = T8 - T9; Ta = T8 + T9; TK = Tn - To; Tp = Tn + To; Tc = Rp[WS(rs, 3)]; Ts = Im[0]; Tt = Ip[WS(rs, 3)]; } } { E Tr, Td, Tu, TL, Te, T7; T1k = Tk - Th; Tl = Th + Tk; Tr = Tb - Tc; Td = Tb + Tc; TL = Tt - Ts; Tu = Ts + Tt; T1p = TA + TD; TE = TA - TD; TP = TN + TO; T1g = TN - TO; TM = TK + TL; T1b = TL - TK; T1f = Ta - Td; Te = Ta + Td; T1a = T3 - T6; T7 = T3 + T6; { E Tq, TF, TG, Tv; Tq = Tm + Tp; TF = Tm - Tp; TG = Tr - Tu; Tv = Tr + Tu; TU = T7 - Te; Tf = T7 + Te; T1l = TF - TG; TH = TF + TG; Tw = Tq - Tv; T1q = Tq + Tv; } } } { E TX, T10, T1c, T13, T1h, T1E, T1H, T1C, T1K, T1G, T1L, T1F; { E TQ, Tx, T1y, TI, Tg, Tz; TX = TP - TM; TQ = TM + TP; Tx = FMA(KP707106781, Tw, Tl); T10 = FNMS(KP707106781, Tw, Tl); T1c = T1a + T1b; T1y = T1a - T1b; T13 = FNMS(KP707106781, TH, TE); TI = FMA(KP707106781, TH, TE); Tg = W[0]; Tz = W[1]; { E T1B, T1A, T1x, T1J, T1z, T1D; { E TR, Ty, TS, TJ; T1B = T1g - T1f; T1h = T1f + T1g; T1A = W[11]; TR = Tg * TI; Ty = Tg * Tx; T1x = W[10]; T1J = T1A * T1y; TS = FNMS(Tz, Tx, TR); TJ = FMA(Tz, TI, Ty); T1z = T1x * T1y; 
T1m = FMA(KP707106781, T1l, T1k); T1E = FNMS(KP707106781, T1l, T1k); /* index 0: Tf and TQ enter untwiddled; (TJ, TS) carry the W[0], W[1] product */ Im[0] = TS - TQ; Ip[0] = TQ + TS; Rm[0] = Tf + TJ; Rp[0] = Tf - TJ; T1H = FMA(KP707106781, T1q, T1p); T1r = FNMS(KP707106781, T1q, T1p); T1D = W[12]; } T1C = FNMS(T1A, T1B, T1z); T1K = FMA(T1x, T1B, T1J); T1G = W[13]; T1L = T1D * T1H; T1F = T1D * T1E; } } { E TY, T16, T12, T17, T11; { E TW, TT, T15, TV, TZ, T1M, T1I; TW = W[7]; T1M = FNMS(T1G, T1E, T1L); T1I = FMA(T1G, T1H, T1F); TT = W[6]; T15 = TW * TU; Im[WS(rs, 3)] = T1M - T1K; Ip[WS(rs, 3)] = T1K + T1M; Rm[WS(rs, 3)] = T1C + T1I; Rp[WS(rs, 3)] = T1C - T1I; TV = TT * TU; TZ = W[8]; TY = FNMS(TW, TX, TV); T16 = FMA(TT, TX, T15); T12 = W[9]; T17 = TZ * T13; T11 = TZ * T10; } { E T1e, T19, T1t, T1d, T1j, T18, T14; T1e = W[3]; T18 = FNMS(T12, T10, T17); T14 = FMA(T12, T13, T11); T19 = W[2]; T1t = T1e * T1c; Im[WS(rs, 2)] = T18 - T16; Ip[WS(rs, 2)] = T16 + T18; Rm[WS(rs, 2)] = TY + T14; Rp[WS(rs, 2)] = TY - T14; T1d = T19 * T1c; T1j = W[4]; T1i = FNMS(T1e, T1h, T1d); T1u = FMA(T19, T1h, T1t); T1o = W[5]; T1v = T1j * T1r; T1n = T1j * T1m; } } } } T1w = FNMS(T1o, T1m, T1v); T1s = FMA(T1o, T1r, T1n); Im[WS(rs, 1)] = T1w - T1u; Ip[WS(rs, 1)] = T1u + T1w; Rm[WS(rs, 1)] = T1i + T1s; Rp[WS(rs, 1)] = T1i - T1s; } } }
/*
 * hb_10 -- in-place 10-point kernel over two arrays addressed in opposite
 * directions, with twiddle multiplication of the results.
 * (NOTE(review): "hb" presumably stands for a halfcomplex backward step;
 * that reading is name-based -- confirm with the caller.)
 *
 * Iteration:
 *  - i starts at m - 2 and decreases by 2 while i > 0.
 *  - After each pass rio advances by +dist, iio by -dist and W by 18.
 *  - The advanced W pointer is returned.
 *  - m and dist are plain int here, unlike the INT parameters used by the
 *    neighbouring kernels in this file.
 *
 * Body:
 *  - Reads rio[WS(ios, k)] and iio[-WS(ios, k)] for k = 0..9; every read
 *    happens before the first write.
 *  - Slot 0 is written without a twiddle: rio[0] and iio[-WS(ios, 9)].
 *  - For k = 1..9 the pair (rio[WS(ios, k)], iio[-WS(ios, 9 - k)]) receives
 *    (a, b) multiplied by (W[2k-2], W[2k-1]) as
 *        re = W[2k-2]*a - W[2k-1]*b,   im = W[2k-2]*b + W[2k-1]*a.
 *  - Constants: 0.5590.. is numerically sqrt(5)/4, 0.9510.. = sin(2*pi/5),
 *    0.5877.. = sin(pi/5), plus 1/4.
 */
static const R *hb_10(R *rio, R *iio, const R *W, stride ios, int m, int dist) { DK(KP250000000, +0.250000000000000000000000000000000000000000000); DK(KP951056516, +0.951056516295153572116439333379382143405698634); DK(KP587785252, +0.587785252292473129168705954639072768597652438); DK(KP559016994, +0.559016994374947424102293417182819058860154590); int i; for (i = m - 2; i > 0; i = i - 2, rio = rio + dist, iio = iio - dist, W = W + 18) { E T3, Tk, Tw, T1w, TO, TP, T1E, T1D, Tr, TX, Ti, T1l, TZ, T10, T1s; E T1p, T1z, T1B, TL, TS; { E T1, T2, Tu, Tv; T1 = rio[0]; T2 = iio[-WS(ios, 5)]; T3 = T1 + T2; Tk = T1 - T2; Tu = iio[0]; Tv = rio[WS(ios, 5)]; Tw = Tu + Tv; T1w = Tu - Tv; } { E T6, Tl, Tg, Tp, T9, Tm, Td, To; { E T4, T5, Te, Tf; T4 = rio[WS(ios, 2)]; T5 = iio[-WS(ios, 7)]; T6 = T4 + T5; Tl = T4 - T5; Te = iio[-WS(ios, 6)]; Tf = rio[WS(ios, 1)]; Tg = Te + Tf; Tp = Te - Tf; } { E T7, T8, Tb, Tc; T7 = iio[-WS(ios, 8)]; T8 = rio[WS(ios, 3)]; T9 = T7 + T8; Tm = T7 - T8; Tb = rio[WS(ios, 4)]; Tc = iio[-WS(ios, 9)]; Td = Tb + Tc; To = Tb - Tc; } TO = Tl - Tm; TP = To - Tp; T1E = Td - Tg; T1D = T6 - T9; { E Tn, Tq, Ta, Th; Tn = Tl + Tm; Tq = To + Tp; Tr = Tn + Tq; TX = KP559016994 * (Tn - Tq); Ta = T6 + T9; Th = Td + Tg; Ti = Ta + Th; T1l = KP559016994 * (Ta - Th); } } { E Tz, T1n, TJ, T1r, TC, T1o, TG, T1q; { E Tx, Ty, TH, TI; Tx = iio[-WS(ios, 2)]; Ty = rio[WS(ios, 7)]; Tz = Tx + Ty; T1n = Tx - Ty; TH = rio[WS(ios, 6)]; TI = iio[-WS(ios, 1)]; TJ = TH + TI; T1r = TI - TH; } { E TA, TB, TE, TF; TA = rio[WS(ios, 8)]; TB = iio[-WS(ios, 3)]; TC = TA + TB; T1o = TB - TA; TE = iio[-WS(ios, 4)]; TF = rio[WS(ios, 9)]; TG = TE + TF; T1q = TE - TF; } TZ = Tz + TC; T10 = TG + TJ; T1s = T1q - T1r; T1p = T1n - T1o; { E T1x, T1y, TD, TK; T1x = T1n + T1o; T1y = T1q + T1r; T1z = T1x + T1y; T1B = KP559016994 * (T1x - T1y); TD = Tz - TC; TK = TG - TJ; TL = TD + TK; TS = KP559016994 * (TD - TK); } } /* slot 0 is stored without a twiddle multiply */ rio[0] = T3 + Ti; iio[-WS(ios, 9)] = T1w + T1z; { E Ts, TM, Tj, Tt; Ts = Tk + Tr; TM = Tw + 
TL; Tj = W[8]; Tt = W[9]; rio[WS(ios, 5)] = FNMS(Tt, TM, Tj * Ts); iio[-WS(ios, 4)] = FMA(Tt, Ts, Tj * TM); } { E T1t, T1F, T1Q, T1N, T1C, T1R, T1m, T1M, T1A, T1k; T1t = FNMS(KP951056516, T1s, KP587785252 * T1p); T1F = FNMS(KP951056516, T1E, KP587785252 * T1D); T1Q = FMA(KP951056516, T1D, KP587785252 * T1E); T1N = FMA(KP951056516, T1p, KP587785252 * T1s); T1A = FNMS(KP250000000, T1z, T1w); T1C = T1A - T1B; T1R = T1B + T1A; T1k = FNMS(KP250000000, Ti, T3); T1m = T1k - T1l; T1M = T1l + T1k; { E T1u, T1G, T1j, T1v; T1u = T1m + T1t; T1G = T1C - T1F; T1j = W[14]; T1v = W[15]; rio[WS(ios, 8)] = FNMS(T1v, T1G, T1j * T1u); iio[-WS(ios, 1)] = FMA(T1v, T1u, T1j * T1G); } { E T1U, T1W, T1T, T1V; T1U = T1M + T1N; T1W = T1R - T1Q; T1T = W[6]; T1V = W[7]; rio[WS(ios, 4)] = FNMS(T1V, T1W, T1T * T1U); iio[-WS(ios, 5)] = FMA(T1V, T1U, T1T * T1W); } { E T1I, T1K, T1H, T1J; T1I = T1m - T1t; T1K = T1F + T1C; T1H = W[2]; T1J = W[3]; rio[WS(ios, 2)] = FNMS(T1J, T1K, T1H * T1I); iio[-WS(ios, 7)] = FMA(T1J, T1I, T1H * T1K); } { E T1O, T1S, T1L, T1P; T1O = T1M - T1N; T1S = T1Q + T1R; T1L = W[10]; T1P = W[11]; rio[WS(ios, 6)] = FNMS(T1P, T1S, T1L * T1O); iio[-WS(ios, 3)] = FMA(T1P, T1O, T1L * T1S); } } { E TQ, T11, T1c, T19, TY, T18, TT, T1d, TW, TR; TQ = FNMS(KP951056516, TP, KP587785252 * TO); T11 = FNMS(KP951056516, T10, KP587785252 * TZ); T1c = FMA(KP951056516, TO, KP587785252 * TP); T19 = FMA(KP951056516, TZ, KP587785252 * T10); TW = FNMS(KP250000000, Tr, Tk); TY = TW - TX; T18 = TX + TW; TR = FNMS(KP250000000, TL, Tw); TT = TR - TS; T1d = TS + TR; { E TU, T12, TN, TV; TU = TQ + TT; T12 = TY - T11; TN = W[12]; TV = W[13]; iio[-WS(ios, 2)] = FMA(TN, TU, TV * T12); rio[WS(ios, 7)] = FNMS(TV, TU, TN * T12); } { E T1g, T1i, T1f, T1h; T1g = T1d - T1c; T1i = T18 + T19; T1f = W[16]; T1h = W[17]; iio[0] = FMA(T1f, T1g, T1h * T1i); rio[WS(ios, 9)] = FNMS(T1h, T1g, T1f * T1i); } { E T14, T16, T13, T15; T14 = TY + T11; T16 = TT - TQ; T13 = W[4]; T15 = W[5]; rio[WS(ios, 3)] = FNMS(T15, T16, T13 * 
T14); iio[-WS(ios, 6)] = FMA(T13, T16, T15 * T14); } { E T1a, T1e, T17, T1b; T1a = T18 - T19; T1e = T1c + T1d; T17 = W[0]; T1b = W[1]; rio[WS(ios, 1)] = FNMS(T1b, T1e, T17 * T1a); iio[-WS(ios, 8)] = FMA(T17, T1e, T1b * T1a); } } } return W; }
/*
 * q1_2 -- in-place 2x2 kernel: a two-point butterfly along rs for each of
 * the two vs rows, with the results stored transposed (rs and vs swapped).
 *
 * Iteration: m runs from mb while m < me.  W starts at W + mb * 2 and
 * advances by 2 per iteration; rio and iio advance by ms.
 *
 * Per iteration, with x[v][r] = rio/iio[WS(vs, v) + WS(rs, r)]:
 *   - the sum   x[0][0] + x[0][1] goes to slot [0][0];
 *   - the sum   x[1][0] + x[1][1] goes to slot [0][1] (offset WS(rs, 1));
 *   - the diff  x[0][0] - x[0][1], twiddled, goes to slot [1][0];
 *   - the diff  x[1][0] - x[1][1], twiddled, goes to slot [1][1].
 * The twiddle is re' = W[0]*re + W[1]*im, im' = W[0]*im - W[1]*re, i.e. a
 * multiplication by the conjugate of (W[0] + i*W[1]).
 *
 * All eight inputs are read before anything is written.
 */
static void q1_2(float *rio, float *iio, const float *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     INT m;
     for (m = mb, W = W + (mb * 2); m < me; m = m + 1, rio = rio + ms, iio = iio + ms, W = W + 2, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(vs)) {
          E re00, re01, im00, im01;     /* row vs = 0 */
          E re10, re11, im10, im11;     /* row vs = 1 */
          E dre0, dim0, dre1, dim1;     /* per-row differences */
          E wr, wi;                     /* current twiddle pair */

          /* ---- load and form the differences ---- */
          re00 = rio[0];
          re01 = rio[WS(rs, 1)];
          dre0 = re00 - re01;
          im00 = iio[0];
          im01 = iio[WS(rs, 1)];
          dim0 = im00 - im01;

          re10 = rio[WS(vs, 1)];
          re11 = rio[WS(vs, 1) + WS(rs, 1)];
          dre1 = re10 - re11;
          im10 = iio[WS(vs, 1)];
          im11 = iio[WS(vs, 1) + WS(rs, 1)];
          dim1 = im10 - im11;

          /* ---- untwiddled sums; row 1's sum lands in the rs = 1 slot ---- */
          rio[0] = re00 + re01;
          iio[0] = im00 + im01;
          rio[WS(rs, 1)] = re10 + re11;
          iio[WS(rs, 1)] = im10 + im11;

          /* ---- twiddled difference of row 1 ---- */
          wr = W[0];
          wi = W[1];
          rio[WS(vs, 1) + WS(rs, 1)] = FMA(wr, dre1, wi * dim1);
          iio[WS(vs, 1) + WS(rs, 1)] = FNMS(wi, dre1, wr * dim1);

          /* ---- twiddled difference of row 0 ----
           * W is deliberately read again here, after the stores above,
           * exactly as the original sequence did. */
          wr = W[0];
          wi = W[1];
          rio[WS(vs, 1)] = FMA(wr, dre0, wi * dim0);
          iio[WS(vs, 1)] = FNMS(wi, dre0, wr * dim0);
     }
}
/*
 * hb_20 -- in-place, twiddled, 20-point butterfly over the paired arrays
 * cr/ci, written as straight-line generated-style code.
 * NOTE(review): the "hb" prefix suggests a half-complex backward pass; that
 * cannot be confirmed from this function alone -- check the caller/generator.
 *
 * Parameters, as used by the code below:
 *   cr, ci  data arrays.  Each iteration first reads cr[WS(rs, k)] and
 *           ci[WS(rs, k)] for all k = 0..19 and only then overwrites the
 *           same 40 locations.  After each iteration cr moves forward by ms
 *           while ci moves backward by ms.
 *   W       twiddle table; 38 entries (W[0]..W[37]) are used per iteration.
 *           The pointer is offset by (mb - 1) * 38 before the first pass and
 *           by 38 after every pass.
 *   rs      stride passed to WS() to address element k.
 *   mb, me  iteration range: m = mb, mb + 1, ..., me - 1.
 *   ms      per-iteration step (+ms for cr, -ms for ci).
 *
 * Output layout (assuming the usual FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b):
 *   cr[0] and ci[0] are stored without a twiddle factor.  For k = 1..19 the
 *   pair (a, b) computed for slot k is stored as
 *       cr[WS(rs, k)] = W[2k-2] * a - W[2k-1] * b
 *       ci[WS(rs, k)] = W[2k-1] * a + W[2k-2] * b
 *
 * Constants: KP951056516 ~ sin(2*pi/5), KP559016994 ~ sqrt(5)/4,
 * KP618033988 ~ (sqrt(5) - 1)/2, KP250000000 = 1/4 -- the constants of a
 * 5-point DFT.  This variant folds the sin(pi/5)/sin(2*pi/5) ratio into
 * KP618033988 so every rotation is a chain of FMA/FNMS calls.
 *
 * NOTE(review): this file holds a second definition of hb_20 with the same
 * signature (the variant using KP587785252).  Term-by-term the two compute
 * the same values, but both cannot live in one translation unit -- confirm
 * how the file is built.
 */
static void hb_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms) { DK(KP951056516, +0.951056516295153572116439333379382143405698634); DK(KP559016994, +0.559016994374947424102293417182819058860154590); DK(KP618033988, +0.618033988749894848204586834365638117720309180); DK(KP250000000, +0.250000000000000000000000000000000000000000000); { INT m; for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(rs)) { E T1T, T1Q, T1P; { E T2W, T4e, T7, TE, T3z, T4z, T1t, T2l, T3a, T3G, T13, T33, T3H, T1i, T2g; E T4H, T4G, T2d, T1B, T4u, T4B, T4A, T4r, T1A, T2s, T3l, T2t, T3s, T2o, T2q; E T1w, T1y, TC, T29, T3E, T3C, T4n, T4l, TN, TL; { E T4, T2U, T3, T2V, T1s, T5, T1n, T1o; { E T1, T2, T1q, T1r; T1 = cr[0]; T2 = ci[WS(rs, 9)]; T1q = ci[WS(rs, 14)]; T1r = cr[WS(rs, 15)]; T4 = cr[WS(rs, 5)]; T2U = T1 - T2; T3 = T1 + T2; T2V = T1q + T1r; T1s = T1q - T1r; T5 = ci[WS(rs, 4)]; T1n = ci[WS(rs, 19)]; T1o = cr[WS(rs, 10)]; } { E T3y, T6, T3x, T1p; T2W = T2U + T2V; T4e = T2U - T2V; T3y = T4 - T5; T6 = T4 + T5; T3x = T1n + T1o; T1p = T1n - T1o; T7 = T3 + T6; TE = T3 - T6; T3z = T3x - T3y; T4z = T3y + T3x; T1t = T1p - T1s; T2l = T1p + T1s; } } { E T2Z, T4f, Te, TF, T3o, T4p, T1a, T2b, TJ, TA, T4t, T3k, T4j, T39, T2f; E T12, T32, T4g, Tl, TG, T3r, T4q, T1h, T2c, T36, T4i, Tt, TI, T3h, T4s; E TV, T2e; { E Tb, T2X, Ta, T2Y, T19, Tc, T14, T15; { E T8, T9, T17, T18; T8 = cr[WS(rs, 4)]; T9 = ci[WS(rs, 5)]; T17 = ci[WS(rs, 10)]; T18 = cr[WS(rs, 19)]; Tb = cr[WS(rs, 9)]; T2X = T8 - T9; Ta = T8 + T9; T2Y = T17 + T18; T19 = T17 - T18; Tc = ci[0]; T14 = ci[WS(rs, 15)]; T15 = cr[WS(rs, 14)]; } { E T3n, Td, T3m, T16; T2Z = T2X + T2Y; T4f = T2X - T2Y; T3n = Tb - Tc; Td = Tb + Tc; T3m = T14 + T15; T16 = T14 - T15; Te = Ta + Td; TF = Ta - Td; T3o = T3m - T3n; T4p = T3n + T3m; T1a = T16 - T19; T2b = T16 + T19; } } { E TW, T37, Tw, T3i, Tz, TX, TZ, T10; { E Tu, Tv, Tx, Ty; Tu = ci[WS(rs, 7)]; Tv = cr[WS(rs, 2)]; Tx = ci[WS(rs, 2)]; Ty
= cr[WS(rs, 7)]; TW = ci[WS(rs, 17)]; T37 = Tu - Tv; Tw = Tu + Tv; T3i = Tx - Ty; Tz = Tx + Ty; TX = cr[WS(rs, 12)]; TZ = ci[WS(rs, 12)]; T10 = cr[WS(rs, 17)]; } { E TY, T38, T11, T3j; TJ = Tw - Tz; TA = Tw + Tz; T3j = TW + TX; TY = TW - TX; T38 = TZ + T10; T11 = TZ - T10; T4t = T3i - T3j; T3k = T3i + T3j; T4j = T37 + T38; T39 = T37 - T38; T2f = TY + T11; T12 = TY - T11; } } { E Ti, T30, Th, T31, T1g, Tj, T1b, T1c; { E Tf, Tg, T1e, T1f; Tf = ci[WS(rs, 3)]; Tg = cr[WS(rs, 6)]; T1e = ci[WS(rs, 18)]; T1f = cr[WS(rs, 11)]; Ti = cr[WS(rs, 1)]; T30 = Tf - Tg; Th = Tf + Tg; T31 = T1e + T1f; T1g = T1e - T1f; Tj = ci[WS(rs, 8)]; T1b = ci[WS(rs, 13)]; T1c = cr[WS(rs, 16)]; } { E T3p, Tk, T3q, T1d; T32 = T30 + T31; T4g = T30 - T31; T3p = Ti - Tj; Tk = Ti + Tj; T3q = T1b + T1c; T1d = T1b - T1c; Tl = Th + Tk; TG = Th - Tk; T3r = T3p + T3q; T4q = T3p - T3q; T1h = T1d - T1g; T2c = T1d + T1g; } } { E Tq, T34, Tp, T35, TU, Tr, TP, TQ; { E Tn, To, TS, TT; Tn = cr[WS(rs, 8)]; To = ci[WS(rs, 1)]; TS = ci[WS(rs, 16)]; TT = cr[WS(rs, 13)]; Tq = ci[WS(rs, 6)]; T34 = Tn - To; Tp = Tn + To; T35 = TS + TT; TU = TS - TT; Tr = cr[WS(rs, 3)]; TP = ci[WS(rs, 11)]; TQ = cr[WS(rs, 18)]; } { E T3g, Ts, T3f, TR; T36 = T34 - T35; T4i = T34 + T35; T3g = Tq - Tr; Ts = Tq + Tr; T3f = TP + TQ; TR = TP - TQ; Tt = Tp + Ts; TI = Tp - Ts; T3h = T3f - T3g; T4s = T3g + T3f; TV = TR - TU; T2e = TR + TU; } } { E T1v, T1u, T2n, T4k, T4h, T2m, TH, TK; T3a = T36 + T39; T3G = T36 - T39; T13 = TV - T12; T1v = TV + T12; T33 = T2Z + T32; T3H = T2Z - T32; T1i = T1a - T1h; T1u = T1a + T1h; T2n = T2e + T2f; T2g = T2e - T2f; T4H = T4i - T4j; T4k = T4i + T4j; T4h = T4f + T4g; T4G = T4f - T4g; T2d = T2b - T2c; T2m = T2b + T2c; TH = TF + TG; T1B = TF - TG; T4u = T4s - T4t; T4B = T4s + T4t; T4A = T4p + T4q; T4r = T4p - T4q; T1A = TI - TJ; TK = TI + TJ; { E Tm, T3B, TB, T3A; Tm = Te + Tl; T2s = Te - Tl; T3l = T3h + T3k; T3B = T3h - T3k; TB = Tt + TA; T2t = Tt - TA; T3s = T3o + T3r; T3A = T3o - T3r; T2o = T2m + T2n; T2q = T2m -
T2n; T1w = T1u + T1v; T1y = T1u - T1v; TC = Tm + TB; T29 = Tm - TB; T3E = T3A - T3B; T3C = T3A + T3B; T4n = T4h - T4k; T4l = T4h + T4k; TN = TH - TK; TL = TH + TK; } } } { E T3d, T3b, T4E, T1x, TM, T4m, T58, T5b, T4D, T5a, T5c, T59, T4C; cr[0] = T7 + TC; T3d = T33 - T3a; T3b = T33 + T3a; T4E = T4A - T4B; T4C = T4A + T4B; ci[0] = T2l + T2o; { E T25, T22, T21, T24, T23, T26, T57; T1x = FNMS(KP250000000, T1w, T1t); T25 = T1t + T1w; T22 = TE + TL; TM = FNMS(KP250000000, TL, TE); T21 = W[18]; T24 = W[19]; T4m = FNMS(KP250000000, T4l, T4e); T58 = T4e + T4l; T5b = T4z + T4C; T4D = FNMS(KP250000000, T4C, T4z); T23 = T21 * T22; T26 = T24 * T22; T57 = W[8]; T5a = W[9]; cr[WS(rs, 10)] = FNMS(T24, T25, T23); ci[WS(rs, 10)] = FMA(T21, T25, T26); T5c = T57 * T5b; T59 = T57 * T58; } { E T3U, T3Z, T3W, T40, T3V; { E T3c, T48, T4b, T3D, T47, T4a; T3c = FNMS(KP250000000, T3b, T2W); T48 = T2W + T3b; T4b = T3z + T3C; T3D = FNMS(KP250000000, T3C, T3z); ci[WS(rs, 5)] = FMA(T5a, T58, T5c); cr[WS(rs, 5)] = FNMS(T5a, T5b, T59); T47 = W[28]; T4a = W[29]; { E T3I, T3Y, T42, T3u, T3M, T3X, T3F; { E T3T, T3t, T4c, T49, T3e, T3S; T3T = FMA(KP618033988, T3l, T3s); T3t = FNMS(KP618033988, T3s, T3l); T4c = T47 * T4b; T49 = T47 * T48; T3I = FNMS(KP618033988, T3H, T3G); T3Y = FMA(KP618033988, T3G, T3H); ci[WS(rs, 15)] = FMA(T4a, T48, T4c); cr[WS(rs, 15)] = FNMS(T4a, T4b, T49); T3e = FNMS(KP559016994, T3d, T3c); T3S = FMA(KP559016994, T3d, T3c); T42 = FMA(KP951056516, T3T, T3S); T3U = FNMS(KP951056516, T3T, T3S); T3u = FNMS(KP951056516, T3t, T3e); T3M = FMA(KP951056516, T3t, T3e); T3X = FMA(KP559016994, T3E, T3D); T3F = FNMS(KP559016994, T3E, T3D); } { E T3P, T45, T44, T46, T43; { E T3w, T3J, T3v, T3K, T2T, T41; T2T = W[4]; T3w = W[5]; T3J = FMA(KP951056516, T3I, T3F); T3P = FNMS(KP951056516, T3I, T3F); T45 = FNMS(KP951056516, T3Y, T3X); T3Z = FMA(KP951056516, T3Y, T3X); T3v = T2T * T3u; T3K = T2T * T3J; T41 = W[36]; T44 = W[37]; cr[WS(rs, 3)] = FNMS(T3w, T3J, T3v); ci[WS(rs, 3)] = FMA(T3w, T3u,
T3K); T46 = T41 * T45; T43 = T41 * T42; } { E T3O, T3Q, T3N, T3L, T3R; T3L = W[12]; T3O = W[13]; ci[WS(rs, 19)] = FMA(T44, T42, T46); cr[WS(rs, 19)] = FNMS(T44, T45, T43); T3Q = T3L * T3P; T3N = T3L * T3M; T3R = W[20]; T3W = W[21]; ci[WS(rs, 7)] = FMA(T3O, T3M, T3Q); cr[WS(rs, 7)] = FNMS(T3O, T3P, T3N); T40 = T3R * T3Z; T3V = T3R * T3U; } } } } { E T4U, T4Z, T4W, T50, T4V, T2L, T2I, T2H; { E T4T, T4v, T4I, T4Y, T4o, T4S; T4T = FNMS(KP618033988, T4r, T4u); T4v = FMA(KP618033988, T4u, T4r); ci[WS(rs, 11)] = FMA(T3W, T3U, T40); cr[WS(rs, 11)] = FNMS(T3W, T3Z, T3V); T4I = FMA(KP618033988, T4H, T4G); T4Y = FNMS(KP618033988, T4G, T4H); T4o = FMA(KP559016994, T4n, T4m); T4S = FNMS(KP559016994, T4n, T4m); { E T52, T4M, T55, T4P, T54, T56, T53; { E T4d, T4w, T4J, T4x, T4y, T4X, T4F, T51, T4K; T4d = W[0]; T4X = FNMS(KP559016994, T4E, T4D); T4F = FMA(KP559016994, T4E, T4D); T4U = FNMS(KP951056516, T4T, T4S); T52 = FMA(KP951056516, T4T, T4S); T4M = FMA(KP951056516, T4v, T4o); T4w = FNMS(KP951056516, T4v, T4o); T4Z = FMA(KP951056516, T4Y, T4X); T55 = FNMS(KP951056516, T4Y, T4X); T4P = FNMS(KP951056516, T4I, T4F); T4J = FMA(KP951056516, T4I, T4F); T4x = T4d * T4w; T4y = W[1]; T51 = W[32]; T4K = T4d * T4J; T54 = W[33]; cr[WS(rs, 1)] = FNMS(T4y, T4J, T4x); T56 = T51 * T55; T53 = T51 * T52; ci[WS(rs, 1)] = FMA(T4y, T4w, T4K); } { E T4O, T4Q, T4N, T4L, T4R; T4L = W[16]; ci[WS(rs, 17)] = FMA(T54, T52, T56); cr[WS(rs, 17)] = FNMS(T54, T55, T53); T4O = W[17]; T4Q = T4L * T4P; T4N = T4L * T4M; T4R = W[24]; T4W = W[25]; ci[WS(rs, 9)] = FMA(T4O, T4M, T4Q); cr[WS(rs, 9)] = FNMS(T4O, T4P, T4N); T50 = T4R * T4Z; T4V = T4R * T4U; } } } { E T2K, T2u, T2F, T2h, T28, T2J, T2r, T2p; T2K = FNMS(KP618033988, T2s, T2t); T2u = FMA(KP618033988, T2t, T2s); ci[WS(rs, 13)] = FMA(T4W, T4U, T50); cr[WS(rs, 13)] = FNMS(T4W, T4Z, T4V); T2p = FNMS(KP250000000, T2o, T2l); T2F = FNMS(KP618033988, T2d, T2g); T2h = FMA(KP618033988, T2g, T2d); T28 = FNMS(KP250000000, TC, T7); T2J = FNMS(KP559016994, T2q, T2p); T2r
= FMA(KP559016994, T2q, T2p); { E T2B, T2G, T2y, T2R, T2Q, T2P, T2A, T2x; { E T2k, T2v, T27, T2O, T2i, T2a, T2E; T2k = W[7]; T2a = FMA(KP559016994, T29, T28); T2E = FNMS(KP559016994, T29, T28); T2B = FMA(KP951056516, T2u, T2r); T2v = FNMS(KP951056516, T2u, T2r); T27 = W[6]; T2O = FMA(KP951056516, T2F, T2E); T2G = FNMS(KP951056516, T2F, T2E); T2i = FMA(KP951056516, T2h, T2a); T2y = FNMS(KP951056516, T2h, T2a); { E T2N, T2j, T2w, T2S; T2L = FMA(KP951056516, T2K, T2J); T2R = FNMS(KP951056516, T2K, T2J); T2Q = W[23]; T2N = W[22]; T2j = T27 * T2i; T2w = T2k * T2i; T2S = T2Q * T2O; T2P = T2N * T2O; cr[WS(rs, 4)] = FNMS(T2k, T2v, T2j); ci[WS(rs, 4)] = FMA(T27, T2v, T2w); ci[WS(rs, 12)] = FMA(T2N, T2R, T2S); } } cr[WS(rs, 12)] = FNMS(T2Q, T2R, T2P); T2A = W[31]; T2x = W[30]; { E T2D, T2M, T2C, T2z; T2I = W[15]; T2C = T2A * T2y; T2z = T2x * T2y; T2D = W[14]; T2M = T2I * T2G; ci[WS(rs, 16)] = FMA(T2x, T2B, T2C); cr[WS(rs, 16)] = FNMS(T2A, T2B, T2z); T2H = T2D * T2G; ci[WS(rs, 8)] = FMA(T2D, T2L, T2M); } } } { E T1S, T1C, T1j, T1N, T1z, T1R; T1S = FMA(KP618033988, T1A, T1B); T1C = FNMS(KP618033988, T1B, T1A); cr[WS(rs, 8)] = FNMS(T2I, T2L, T2H); T1j = FNMS(KP618033988, T1i, T13); T1N = FMA(KP618033988, T13, T1i); T1z = FNMS(KP559016994, T1y, T1x); T1R = FMA(KP559016994, T1y, T1x); { E T1J, T1O, T1G, T1Z, T1Y, T1X, T1I, T1F; { E T1m, T1D, TD, T1W, T1k, T1M, TO; T1m = W[3]; T1M = FMA(KP559016994, TN, TM); TO = FNMS(KP559016994, TN, TM); T1D = FNMS(KP951056516, T1C, T1z); T1J = FMA(KP951056516, T1C, T1z); TD = W[2]; T1O = FNMS(KP951056516, T1N, T1M); T1W = FMA(KP951056516, T1N, T1M); T1G = FNMS(KP951056516, T1j, TO); T1k = FMA(KP951056516, T1j, TO); { E T1V, T1l, T1E, T20; T1Z = FNMS(KP951056516, T1S, T1R); T1T = FMA(KP951056516, T1S, T1R); T1Y = W[27]; T1V = W[26]; T1l = TD * T1k; T1E = T1m * T1k; T20 = T1Y * T1W; T1X = T1V * T1W; cr[WS(rs, 2)] = FNMS(T1m, T1D, T1l); ci[WS(rs, 2)] = FMA(TD, T1D, T1E); ci[WS(rs, 14)] = FMA(T1V, T1Z, T20); } } cr[WS(rs, 14)] = FNMS(T1Y, T1Z,
T1X); T1I = W[35]; T1F = W[34]; { E T1L, T1U, T1K, T1H; T1Q = W[11]; T1K = T1I * T1G; T1H = T1F * T1G; T1L = W[10]; T1U = T1Q * T1O; ci[WS(rs, 18)] = FMA(T1F, T1J, T1K); cr[WS(rs, 18)] = FNMS(T1I, T1J, T1H); T1P = T1L * T1O; ci[WS(rs, 6)] = FMA(T1L, T1T, T1U); } } } } } } } cr[WS(rs, 6)] = FNMS(T1Q, T1T, T1P); } } }
static const R *hb_6(R *rio, R *iio, const R *W, stride ios, INT m, INT dist) { DK(KP500000000, +0.500000000000000000000000000000000000000000000); DK(KP866025403, +0.866025403784438646763723170752936183471402627); INT i; for (i = m - 2; i > 0; i = i - 2, rio = rio + dist, iio = iio - dist, W = W + 10, MAKE_VOLATILE_STRIDE(ios)) { E T3, Ty, Tp, TE, Ta, TO, Tm, TB, Tj, TL, Tq, TH; { E T1, T2, Tn, To; T1 = rio[0]; T2 = iio[-WS(ios, 3)]; T3 = T1 + T2; Ty = T1 - T2; Tn = iio[0]; To = rio[WS(ios, 3)]; Tp = Tn - To; TE = Tn + To; } { E T6, Tz, T9, TA; { E T4, T5, T7, T8; T4 = rio[WS(ios, 2)]; T5 = iio[-WS(ios, 5)]; T6 = T4 + T5; Tz = T4 - T5; T7 = iio[-WS(ios, 4)]; T8 = rio[WS(ios, 1)]; T9 = T7 + T8; TA = T7 - T8; } Ta = T6 + T9; TO = KP866025403 * (Tz - TA); Tm = KP866025403 * (T6 - T9); TB = Tz + TA; } { E Tf, TF, Ti, TG; { E Td, Te, Tg, Th; Td = iio[-WS(ios, 1)]; Te = rio[WS(ios, 4)]; Tf = Td - Te; TF = Te + Td; Tg = iio[-WS(ios, 2)]; Th = rio[WS(ios, 5)]; Ti = Tg - Th; TG = Tg + Th; } Tj = KP866025403 * (Tf - Ti); TL = KP866025403 * (TF + TG); Tq = Tf + Ti; TH = TF - TG; } rio[0] = T3 + Ta; iio[-WS(ios, 5)] = Tp + Tq; { E TC, TI, Tx, TD; TC = Ty + TB; TI = TE - TH; Tx = W[4]; TD = W[5]; rio[WS(ios, 3)] = FNMS(TD, TI, Tx * TC); iio[-WS(ios, 2)] = FMA(TD, TC, Tx * TI); } { E Tk, Tu, Ts, Tw, Tc, Tr; Tc = FNMS(KP500000000, Ta, T3); Tk = Tc + Tj; Tu = Tc - Tj; Tr = FNMS(KP500000000, Tq, Tp); Ts = Tm + Tr; Tw = Tr - Tm; { E Tb, Tl, Tt, Tv; Tb = W[6]; Tl = W[7]; rio[WS(ios, 4)] = FNMS(Tl, Ts, Tb * Tk); iio[-WS(ios, 1)] = FMA(Tl, Tk, Tb * Ts); Tt = W[2]; Tv = W[3]; rio[WS(ios, 2)] = FNMS(Tv, Tw, Tt * Tu); iio[-WS(ios, 3)] = FMA(Tv, Tu, Tt * Tw); } } { E TM, TU, TQ, TS, TK, TP; TK = FNMS(KP500000000, TB, Ty); TM = TK - TL; TU = TK + TL; TP = FMA(KP500000000, TH, TE); TQ = TO + TP; TS = TP - TO; { E TJ, TN, TR, TT; TJ = W[0]; TN = W[1]; rio[WS(ios, 1)] = FNMS(TN, TQ, TJ * TM); iio[-WS(ios, 4)] = FMA(TJ, TQ, TN * TM); TR = W[8]; TT = W[9]; iio[0] = FMA(TR, TS, TT * TU); 
rio[WS(ios, 5)] = FNMS(TT, TS, TR * TU); } } } return W; }
/*
 * hb_20 -- in-place, twiddled, 20-point butterfly over the paired arrays
 * cr/ci, written as straight-line generated-style code.
 * NOTE(review): the "hb" prefix suggests a half-complex backward pass; that
 * cannot be confirmed from this function alone -- check the caller/generator.
 *
 * Parameters, as used by the code below:
 *   cr, ci  data arrays.  Each iteration first reads cr[WS(rs, k)] and
 *           ci[WS(rs, k)] for all k = 0..19 and only then overwrites the
 *           same 40 locations.  After each iteration cr moves forward by ms
 *           while ci moves backward by ms.
 *   W       twiddle table; 38 entries (W[0]..W[37]) are used per iteration.
 *           The pointer is offset by (mb - 1) * 38 before the first pass and
 *           by 38 after every pass.
 *   rs      stride passed to WS() to address element k.
 *   mb, me  iteration range: m = mb, mb + 1, ..., me - 1.
 *   ms      per-iteration step (+ms for cr, -ms for ci).
 *
 * Layout of one iteration:
 *   - five load blocks, each combining four cr/ci pairs into sums and
 *     differences;
 *   - one long run of cross-block sums/differences;
 *   - cr[0], ci[0] stored directly, then slots 10, 5 and 15, then four
 *     groups of four slots: {4, 12, 16, 8}, {1, 17, 9, 13}, {2, 14, 18, 6}
 *     and {3, 19, 7, 11}.
 *
 * For k = 1..19 the pair (a, b) computed for slot k is stored as
 *       cr[WS(rs, k)] = W[2k-2] * a - W[2k-1] * b
 *       ci[WS(rs, k)] = W[2k-1] * a + W[2k-2] * b
 * (assuming the usual FMA(a,b,c) = a*b + c and FNMS(a,b,c) = c - a*b).
 *
 * Constants: KP951056516 ~ sin(2*pi/5), KP587785252 ~ sin(pi/5),
 * KP559016994 ~ sqrt(5)/4, KP250000000 = 1/4 -- the constants of a 5-point
 * DFT, applied here with explicit multiplications.
 *
 * NOTE(review): this file holds another definition of hb_20 with the same
 * signature (the variant using KP618033988).  Term-by-term the two compute
 * the same values, but both cannot live in one translation unit -- confirm
 * how the file is built.
 */
static void hb_20(R *cr, R *ci, const R *W, stride rs, INT mb, INT me, INT ms) { DK(KP250000000, +0.250000000000000000000000000000000000000000000); DK(KP559016994, +0.559016994374947424102293417182819058860154590); DK(KP587785252, +0.587785252292473129168705954639072768597652438); DK(KP951056516, +0.951056516295153572116439333379382143405698634); { INT m; for (m = mb, W = W + ((mb - 1) * 38); m < me; m = m + 1, cr = cr + ms, ci = ci - ms, W = W + 38, MAKE_VOLATILE_STRIDE(rs)) { E T7, T3T, T49, TE, T1v, T2T, T3g, T2d, T13, T3n, T3o, T1i, T26, T4e, T4d; E T23, T1n, T42, T3Z, T1m, T2h, T2I, T2i, T2P, T30, T37, T38, Tm, TB, TC; E T46, T47, T4a, T2a, T2b, T2e, T1w, T1x, T1y, T3O, T3R, T3U, T3h, T3i, T3j; E TH, TK, TL; { E T3, T2R, T1u, T2S, T6, T3f, T1r, T3e; { E T1, T2, T1s, T1t; T1 = cr[0]; T2 = ci[WS(rs, 9)]; T3 = T1 + T2; T2R = T1 - T2; T1s = ci[WS(rs, 14)]; T1t = cr[WS(rs, 15)]; T1u = T1s - T1t; T2S = T1s + T1t; } { E T4, T5, T1p, T1q; T4 = cr[WS(rs, 5)]; T5 = ci[WS(rs, 4)]; T6 = T4 + T5; T3f = T4 - T5; T1p = ci[WS(rs, 19)]; T1q = cr[WS(rs, 10)]; T1r = T1p - T1q; T3e = T1p + T1q; } T7 = T3 + T6; T3T = T2R - T2S; T49 = T3f + T3e; TE = T3 - T6; T1v = T1r - T1u; T2T = T2R + T2S; T3g = T3e - T3f; T2d = T1r + T1u; } { E Te, T3M, T3X, TF, TV, T2E, T2W, T21, TA, T3Q, T41, TJ, T1h, T2O, T36; E T25, Tl, T3N, T3Y, TG, T12, T2H, T2Z, T22, Tt, T3P, T40, TI, T1a, T2L; E T33, T24; { E Ta, T2U, TU, T2V, Td, T2D, TR, T2C; { E T8, T9, TS, TT; T8 = cr[WS(rs, 4)]; T9 = ci[WS(rs, 5)]; Ta = T8 + T9; T2U = T8 - T9; TS = ci[WS(rs, 10)]; TT = cr[WS(rs, 19)]; TU = TS - TT; T2V = TS + TT; } { E Tb, Tc, TP, TQ; Tb = cr[WS(rs, 9)]; Tc = ci[0]; Td = Tb + Tc; T2D = Tb - Tc; TP = ci[WS(rs, 15)]; TQ = cr[WS(rs, 14)]; TR = TP - TQ; T2C = TP + TQ; } Te = Ta + Td; T3M = T2U - T2V; T3X = T2D + T2C; TF = Ta - Td; TV = TR - TU; T2E = T2C - T2D; T2W = T2U + T2V; T21 = TR + TU; } { E Tw, T34, Tz, T2M, T1d, T2N, T1g, T35; { E Tu, Tv, Tx, Ty; Tu = ci[WS(rs, 7)]; Tv = cr[WS(rs, 2)]; Tw = Tu + Tv; T34 = Tu -
Tv; Tx = ci[WS(rs, 2)]; Ty = cr[WS(rs, 7)]; Tz = Tx + Ty; T2M = Tx - Ty; } { E T1b, T1c, T1e, T1f; T1b = ci[WS(rs, 17)]; T1c = cr[WS(rs, 12)]; T1d = T1b - T1c; T2N = T1b + T1c; T1e = ci[WS(rs, 12)]; T1f = cr[WS(rs, 17)]; T1g = T1e - T1f; T35 = T1e + T1f; } TA = Tw + Tz; T3Q = T34 + T35; T41 = T2M - T2N; TJ = Tw - Tz; T1h = T1d - T1g; T2O = T2M + T2N; T36 = T34 - T35; T25 = T1d + T1g; } { E Th, T2X, T11, T2Y, Tk, T2F, TY, T2G; { E Tf, Tg, TZ, T10; Tf = ci[WS(rs, 3)]; Tg = cr[WS(rs, 6)]; Th = Tf + Tg; T2X = Tf - Tg; TZ = ci[WS(rs, 18)]; T10 = cr[WS(rs, 11)]; T11 = TZ - T10; T2Y = TZ + T10; } { E Ti, Tj, TW, TX; Ti = cr[WS(rs, 1)]; Tj = ci[WS(rs, 8)]; Tk = Ti + Tj; T2F = Ti - Tj; TW = ci[WS(rs, 13)]; TX = cr[WS(rs, 16)]; TY = TW - TX; T2G = TW + TX; } Tl = Th + Tk; T3N = T2X - T2Y; T3Y = T2F - T2G; TG = Th - Tk; T12 = TY - T11; T2H = T2F + T2G; T2Z = T2X + T2Y; T22 = TY + T11; } { E Tp, T31, T19, T32, Ts, T2K, T16, T2J; { E Tn, To, T17, T18; Tn = cr[WS(rs, 8)]; To = ci[WS(rs, 1)]; Tp = Tn + To; T31 = Tn - To; T17 = ci[WS(rs, 16)]; T18 = cr[WS(rs, 13)]; T19 = T17 - T18; T32 = T17 + T18; } { E Tq, Tr, T14, T15; Tq = ci[WS(rs, 6)]; Tr = cr[WS(rs, 3)]; Ts = Tq + Tr; T2K = Tq - Tr; T14 = ci[WS(rs, 11)]; T15 = cr[WS(rs, 18)]; T16 = T14 - T15; T2J = T14 + T15; } Tt = Tp + Ts; T3P = T31 + T32; T40 = T2K + T2J; TI = Tp - Ts; T1a = T16 - T19; T2L = T2J - T2K; T33 = T31 - T32; T24 = T16 + T19; } T13 = TV - T12; T3n = T2W - T2Z; T3o = T33 - T36; T1i = T1a - T1h; T26 = T24 - T25; T4e = T3P - T3Q; T4d = T3M - T3N; T23 = T21 - T22; T1n = TI - TJ; T42 = T40 - T41; T3Z = T3X - T3Y; T1m = TF - TG; T2h = Te - Tl; T2I = T2E + T2H; T2i = Tt - TA; T2P = T2L + T2O; T30 = T2W + T2Z; T37 = T33 + T36; T38 = T30 + T37; Tm = Te + Tl; TB = Tt + TA; TC = Tm + TB; T46 = T3X + T3Y; T47 = T40 + T41; T4a = T46 + T47; T2a = T21 + T22; T2b = T24 + T25; T2e = T2a + T2b; T1w = TV + T12; T1x = T1a + T1h; T1y = T1w + T1x; T3O = T3M + T3N; T3R = T3P + T3Q; T3U = T3O + T3R; T3h = T2E - T2H; T3i = T2L - T2O;
T3j = T3h + T3i; TH = TF + TG; TK = TI + TJ; TL = TH + TK; } cr[0] = T7 + TC; ci[0] = T2d + T2e; { E T1U, T1W, T1T, T1V; T1U = TE + TL; T1W = T1v + T1y; T1T = W[18]; T1V = W[19]; cr[WS(rs, 10)] = FNMS(T1V, T1W, T1T * T1U); ci[WS(rs, 10)] = FMA(T1V, T1U, T1T * T1W); } { E T4y, T4A, T4x, T4z; T4y = T3T + T3U; T4A = T49 + T4a; T4x = W[8]; T4z = W[9]; cr[WS(rs, 5)] = FNMS(T4z, T4A, T4x * T4y); ci[WS(rs, 5)] = FMA(T4x, T4A, T4z * T4y); } { E T3I, T3K, T3H, T3J; T3I = T2T + T38; T3K = T3g + T3j; T3H = W[28]; T3J = W[29]; cr[WS(rs, 15)] = FNMS(T3J, T3K, T3H * T3I); ci[WS(rs, 15)] = FMA(T3H, T3K, T3J * T3I); } { E T27, T2j, T2v, T2r, T2g, T2u, T20, T2q; T27 = FMA(KP951056516, T23, KP587785252 * T26); T2j = FMA(KP951056516, T2h, KP587785252 * T2i); T2v = FNMS(KP951056516, T2i, KP587785252 * T2h); T2r = FNMS(KP951056516, T26, KP587785252 * T23); { E T2c, T2f, T1Y, T1Z; T2c = KP559016994 * (T2a - T2b); T2f = FNMS(KP250000000, T2e, T2d); T2g = T2c + T2f; T2u = T2f - T2c; T1Y = KP559016994 * (Tm - TB); T1Z = FNMS(KP250000000, TC, T7); T20 = T1Y + T1Z; T2q = T1Z - T1Y; } { E T28, T2k, T1X, T29; T28 = T20 + T27; T2k = T2g - T2j; T1X = W[6]; T29 = W[7]; cr[WS(rs, 4)] = FNMS(T29, T2k, T1X * T28); ci[WS(rs, 4)] = FMA(T29, T28, T1X * T2k); } { E T2y, T2A, T2x, T2z; T2y = T2q - T2r; T2A = T2v + T2u; T2x = W[22]; T2z = W[23]; cr[WS(rs, 12)] = FNMS(T2z, T2A, T2x * T2y); ci[WS(rs, 12)] = FMA(T2z, T2y, T2x * T2A); } { E T2m, T2o, T2l, T2n; T2m = T20 - T27; T2o = T2j + T2g; T2l = W[30]; T2n = W[31]; cr[WS(rs, 16)] = FNMS(T2n, T2o, T2l * T2m); ci[WS(rs, 16)] = FMA(T2n, T2m, T2l * T2o); } { E T2s, T2w, T2p, T2t; T2s = T2q + T2r; T2w = T2u - T2v; T2p = W[14]; T2t = W[15]; cr[WS(rs, 8)] = FNMS(T2t, T2w, T2p * T2s); ci[WS(rs, 8)] = FMA(T2t, T2s, T2p * T2w); } } { E T43, T4f, T4r, T4m, T4c, T4q, T3W, T4n; T43 = FMA(KP951056516, T3Z, KP587785252 * T42); T4f = FMA(KP951056516, T4d, KP587785252 * T4e); T4r = FNMS(KP951056516, T4e, KP587785252 * T4d); T4m = FNMS(KP951056516, T42, KP587785252 * T3Z);
{ E T48, T4b, T3S, T3V; T48 = KP559016994 * (T46 - T47); T4b = FNMS(KP250000000, T4a, T49); T4c = T48 + T4b; T4q = T4b - T48; T3S = KP559016994 * (T3O - T3R); T3V = FNMS(KP250000000, T3U, T3T); T3W = T3S + T3V; T4n = T3V - T3S; } { E T44, T4g, T3L, T45; T44 = T3W - T43; T4g = T4c + T4f; T3L = W[0]; T45 = W[1]; cr[WS(rs, 1)] = FNMS(T45, T4g, T3L * T44); ci[WS(rs, 1)] = FMA(T3L, T4g, T45 * T44); } { E T4u, T4w, T4t, T4v; T4u = T4n - T4m; T4w = T4q + T4r; T4t = W[32]; T4v = W[33]; cr[WS(rs, 17)] = FNMS(T4v, T4w, T4t * T4u); ci[WS(rs, 17)] = FMA(T4t, T4w, T4v * T4u); } { E T4i, T4k, T4h, T4j; T4i = T43 + T3W; T4k = T4c - T4f; T4h = W[16]; T4j = W[17]; cr[WS(rs, 9)] = FNMS(T4j, T4k, T4h * T4i); ci[WS(rs, 9)] = FMA(T4h, T4k, T4j * T4i); } { E T4o, T4s, T4l, T4p; T4o = T4m + T4n; T4s = T4q - T4r; T4l = W[24]; T4p = W[25]; cr[WS(rs, 13)] = FNMS(T4p, T4s, T4l * T4o); ci[WS(rs, 13)] = FMA(T4l, T4s, T4p * T4o); } } { E T1j, T1o, T1M, T1J, T1B, T1N, TO, T1I; T1j = FNMS(KP951056516, T1i, KP587785252 * T13); T1o = FNMS(KP951056516, T1n, KP587785252 * T1m); T1M = FMA(KP951056516, T1m, KP587785252 * T1n); T1J = FMA(KP951056516, T13, KP587785252 * T1i); { E T1z, T1A, TM, TN; T1z = FNMS(KP250000000, T1y, T1v); T1A = KP559016994 * (T1w - T1x); T1B = T1z - T1A; T1N = T1A + T1z; TM = FNMS(KP250000000, TL, TE); TN = KP559016994 * (TH - TK); TO = TM - TN; T1I = TN + TM; } { E T1k, T1C, TD, T1l; T1k = TO - T1j; T1C = T1o + T1B; TD = W[2]; T1l = W[3]; cr[WS(rs, 2)] = FNMS(T1l, T1C, TD * T1k); ci[WS(rs, 2)] = FMA(T1l, T1k, TD * T1C); } { E T1Q, T1S, T1P, T1R; T1Q = T1I + T1J; T1S = T1N - T1M; T1P = W[26]; T1R = W[27]; cr[WS(rs, 14)] = FNMS(T1R, T1S, T1P * T1Q); ci[WS(rs, 14)] = FMA(T1R, T1Q, T1P * T1S); } { E T1E, T1G, T1D, T1F; T1E = TO + T1j; T1G = T1B - T1o; T1D = W[34]; T1F = W[35]; cr[WS(rs, 18)] = FNMS(T1F, T1G, T1D * T1E); ci[WS(rs, 18)] = FMA(T1F, T1E, T1D * T1G); } { E T1K, T1O, T1H, T1L; T1K = T1I - T1J; T1O = T1M + T1N; T1H = W[10]; T1L = W[11]; cr[WS(rs, 6)] = FNMS(T1L, T1O, T1H
* T1K); ci[WS(rs, 6)] = FMA(T1L, T1K, T1H * T1O); } } { E T2Q, T3p, T3B, T3x, T3m, T3A, T3b, T3w; T2Q = FNMS(KP951056516, T2P, KP587785252 * T2I); T3p = FNMS(KP951056516, T3o, KP587785252 * T3n); T3B = FMA(KP951056516, T3n, KP587785252 * T3o); T3x = FMA(KP951056516, T2I, KP587785252 * T2P); { E T3k, T3l, T39, T3a; T3k = FNMS(KP250000000, T3j, T3g); T3l = KP559016994 * (T3h - T3i); T3m = T3k - T3l; T3A = T3l + T3k; T39 = FNMS(KP250000000, T38, T2T); T3a = KP559016994 * (T30 - T37); T3b = T39 - T3a; T3w = T3a + T39; } { E T3c, T3q, T2B, T3d; T3c = T2Q + T3b; T3q = T3m - T3p; T2B = W[4]; T3d = W[5]; cr[WS(rs, 3)] = FNMS(T3d, T3q, T2B * T3c); ci[WS(rs, 3)] = FMA(T2B, T3q, T3d * T3c); } { E T3E, T3G, T3D, T3F; T3E = T3x + T3w; T3G = T3A - T3B; T3D = W[36]; T3F = W[37]; cr[WS(rs, 19)] = FNMS(T3F, T3G, T3D * T3E); ci[WS(rs, 19)] = FMA(T3D, T3G, T3F * T3E); } { E T3s, T3u, T3r, T3t; T3s = T3b - T2Q; T3u = T3m + T3p; T3r = W[12]; T3t = W[13]; cr[WS(rs, 7)] = FNMS(T3t, T3u, T3r * T3s); ci[WS(rs, 7)] = FMA(T3r, T3u, T3t * T3s); } { E T3y, T3C, T3v, T3z; T3y = T3w - T3x; T3C = T3A + T3B; T3v = W[20]; T3z = W[21]; cr[WS(rs, 11)] = FNMS(T3z, T3C, T3v * T3y); ci[WS(rs, 11)] = FMA(T3v, T3C, T3z * T3y); } } } } }
/*
 * q1bv_2 -- SIMD in-place 2x2 twiddle kernel.
 *
 * Works on the array passed as ii (ri is not referenced).  For each step of
 * VL in [mb, me) it loads four vectors addressed by the strides rs and vs:
 *
 *     A = x[0]          B = x[WS(rs, 1)]
 *     C = x[WS(vs, 1)]  D = x[WS(vs, 1) + WS(rs, 1)]
 *
 * and stores
 *
 *     x[WS(vs, 1)]              = BYTW(&W[0], A - B)
 *     x[WS(vs, 1) + WS(rs, 1)]  = BYTW(&W[0], C - D)
 *     x[0]                      = A + B
 *     x[WS(rs, 1)]              = C + D
 *
 * x advances by VL * ms and W by TWVL * 2 per step; W is first offset by
 * mb * ((TWVL / VL) * 2).  VLEAVE() is invoked once after the loop.
 * NOTE(review): BYTW presumably multiplies by the twiddle stored at W --
 * confirm against the SIMD header in use.
 */
static void q1bv_2(R *ri, R *ii, const R *W, stride rs, stride vs, INT mb, INT me, INT ms)
{
     INT m;
     R *x = ii;

     for (m = mb, W = W + (mb * ((TWVL / VL) * 2)); m < me; m = m + VL, x = x + (VL * ms), W = W + (TWVL * 2), MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(vs)) {
          V row0_a, row0_b, row0_tw;
          V row1_a, row1_b, row1_tw;

          /* vs offset 0: load the pair and twiddle its difference. */
          row0_a = LD(&(x[0]), ms, &(x[0]));
          row0_b = LD(&(x[WS(rs, 1)]), ms, &(x[WS(rs, 1)]));
          row0_tw = BYTW(&(W[0]), VSUB(row0_a, row0_b));

          /* vs offset 1: same treatment, same twiddle. */
          row1_a = LD(&(x[WS(vs, 1)]), ms, &(x[WS(vs, 1)]));
          row1_b = LD(&(x[WS(vs, 1) + WS(rs, 1)]), ms, &(x[WS(vs, 1) + WS(rs, 1)]));
          row1_tw = BYTW(&(W[0]), VSUB(row1_a, row1_b));

          /* Twiddled differences go to the vs-offset-1 slots ... */
          ST(&(x[WS(vs, 1)]), row0_tw, ms, &(x[WS(vs, 1)]));
          ST(&(x[WS(vs, 1) + WS(rs, 1)]), row1_tw, ms, &(x[WS(vs, 1) + WS(rs, 1)]));

          /* ... and the plain sums to the vs-offset-0 slots. */
          ST(&(x[0]), VADD(row0_a, row0_b), ms, &(x[0]));
          ST(&(x[WS(rs, 1)]), VADD(row1_a, row1_b), ms, &(x[WS(rs, 1)]));
     }
     VLEAVE();
}
/*
 * n1_9 -- out-of-place 9-point complex DFT, straight-line generated-style
 * code (variant using explicit sine/cosine multiplications).
 *
 * Parameters, as used by the code below:
 *   ri, ii  input real/imaginary arrays, read at [WS(is, n)], n = 0..8.
 *   ro, io  output real/imaginary arrays, written at [WS(os, k)], k = 0..8.
 *   is, os  strides passed to WS() for input and output elements.
 *   v       number of transforms; the loop counts i = v down to 1.
 *   ivs     step added to ri and ii after each transform.
 *   ovs     step added to ro and io after each transform.
 *
 * Working the arithmetic through (with the usual FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b), each pass computes
 *     ro[k] + i*io[k] = sum_{n=0..8} (ri[n] + i*ii[n]) * exp(-2*pi*i*n*k/9)
 * as three 3-point transforms over inputs {0,3,6}, {1,4,7}, {2,5,8}, a
 * rotation of the second and third groups, and three further 3-point
 * transforms producing outputs {0,3,6}, {1,4,7} and {2,5,8}.
 *
 * Constants: KP866025403 ~ sin(60 deg); KP766044443 / KP642787609 ~
 * cos/sin(40 deg); KP173648177 / KP984807753 ~ cos/sin(80 deg);
 * KP939692620 / KP342020143 ~ cos/sin(20 deg).
 *
 * NOTE(review): this file holds another definition of n1_9 with the same
 * signature (the variant using KP176326980 etc.); both cannot live in one
 * translation unit -- confirm how the file is built.
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) { DK(KP939692620, +0.939692620785908384054109277324731469936208134); DK(KP342020143, +0.342020143325668733044099614682259580763083368); DK(KP984807753, +0.984807753012208059366743024589523013670643252); DK(KP173648177, +0.173648177666930348851716626769314796000375677); DK(KP642787609, +0.642787609686539326322643409907263432907559884); DK(KP766044443, +0.766044443118978035202392650555416673935832457); DK(KP500000000, +0.500000000000000000000000000000000000000000000); DK(KP866025403, +0.866025403784438646763723170752936183471402627); INT i; for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) { E T5, TO, Th, Tk, T1g, TR, Ta, T1c, Tq, TW, Tv, TX, Tf, T1d, TB; E T10, TG, TZ; { E T1, T2, T3, T4; T1 = ri[0]; T2 = ri[WS(is, 3)]; T3 = ri[WS(is, 6)]; T4 = T2 + T3; T5 = T1 + T4; TO = KP866025403 * (T3 - T2); Th = FNMS(KP500000000, T4, T1); } { E TP, Ti, Tj, TQ; TP = ii[0]; Ti = ii[WS(is, 3)]; Tj = ii[WS(is, 6)]; TQ = Ti + Tj; Tk = KP866025403 * (Ti - Tj); T1g = TP + TQ; TR = FNMS(KP500000000, TQ, TP); } { E T6, Ts, T9, Tr, Tp, Tt, Tm, Tu; T6 = ri[WS(is, 1)]; Ts = ii[WS(is, 1)]; { E T7, T8, Tn, To; T7 = ri[WS(is, 4)]; T8 = ri[WS(is, 7)]; T9 = T7 + T8; Tr = KP866025403 * (T8 - T7); Tn = ii[WS(is, 4)]; To = ii[WS(is, 7)]; Tp = KP866025403 * (Tn - To); Tt = Tn + To; } Ta = T6 + T9; T1c = Ts + Tt; Tm = FNMS(KP500000000, T9, T6); Tq = Tm + Tp; TW = Tm - Tp; Tu = FNMS(KP500000000, Tt, Ts); Tv = Tr + Tu; TX = Tu - Tr; } { E Tb, TD, Te, TC, TA, TE, Tx, TF; Tb = ri[WS(is, 2)]; TD = ii[WS(is, 2)]; { E Tc, Td, Ty, Tz; Tc = ri[WS(is, 5)]; Td = ri[WS(is, 8)]; Te = Tc + Td; TC = KP866025403 * (Td - Tc); Ty = ii[WS(is, 5)]; Tz = ii[WS(is, 8)]; TA = KP866025403 * (Ty - Tz); TE = Ty + Tz; } Tf = Tb + Te; T1d = TD + TE; Tx = FNMS(KP500000000, Te, Tb); TB = Tx + TA; T10 = Tx - TA; TF = FNMS(KP500000000,
TE, TD); TG = TC + TF; TZ = TF - TC; } { E T1e, Tg, T1b, T1f, T1h, T1i; T1e = KP866025403 * (T1c - T1d); Tg = Ta + Tf; T1b = FNMS(KP500000000, Tg, T5); ro[0] = T5 + Tg; ro[WS(os, 3)] = T1b + T1e; ro[WS(os, 6)] = T1b - T1e; T1f = KP866025403 * (Tf - Ta); T1h = T1c + T1d; T1i = FNMS(KP500000000, T1h, T1g); io[WS(os, 3)] = T1f + T1i; io[0] = T1g + T1h; io[WS(os, 6)] = T1i - T1f; } { E Tl, TS, TI, TN, TM, TT, TJ, TU; Tl = Th + Tk; TS = TO + TR; { E Tw, TH, TK, TL; Tw = FMA(KP766044443, Tq, KP642787609 * Tv); TH = FMA(KP173648177, TB, KP984807753 * TG); TI = Tw + TH; TN = KP866025403 * (TH - Tw); TK = FNMS(KP642787609, Tq, KP766044443 * Tv); TL = FNMS(KP984807753, TB, KP173648177 * TG); TM = KP866025403 * (TK - TL); TT = TK + TL; } ro[WS(os, 1)] = Tl + TI; io[WS(os, 1)] = TS + TT; TJ = FNMS(KP500000000, TI, Tl); ro[WS(os, 7)] = TJ - TM; ro[WS(os, 4)] = TJ + TM; TU = FNMS(KP500000000, TT, TS); io[WS(os, 4)] = TN + TU; io[WS(os, 7)] = TU - TN; } { E TV, T14, T12, T13, T17, T1a, T18, T19; TV = Th - Tk; T14 = TR - TO; { E TY, T11, T15, T16; TY = FMA(KP173648177, TW, KP984807753 * TX); T11 = FNMS(KP939692620, T10, KP342020143 * TZ); T12 = TY + T11; T13 = KP866025403 * (T11 - TY); T15 = FNMS(KP984807753, TW, KP173648177 * TX); T16 = FMA(KP342020143, T10, KP939692620 * TZ); T17 = T15 - T16; T1a = KP866025403 * (T15 + T16); } ro[WS(os, 2)] = TV + T12; io[WS(os, 2)] = T14 + T17; T18 = FNMS(KP500000000, T17, T14); io[WS(os, 5)] = T13 + T18; io[WS(os, 8)] = T18 - T13; T19 = FNMS(KP500000000, T12, TV); ro[WS(os, 8)] = T19 - T1a; ro[WS(os, 5)] = T19 + T1a; } } }
/*
 * r2cbIII_20 -- out-of-place kernel turning 10 complex coefficients
 * (Cr, Ci) into 20 real samples split across R0 and R1, straight-line
 * generated-style code.
 *
 * Parameters, as used by the code below:
 *   R0, R1    outputs, written at [WS(rs, j)], j = 0..9.
 *   Cr, Ci    inputs, read at Cr[WS(csr, k)] and Ci[WS(csi, k)], k = 0..9.
 *   rs        stride passed to WS() for the outputs.
 *   csr, csi  strides passed to WS() for Cr and Ci respectively.
 *   v         number of transforms; the loop counts i = v down to 1.
 *   ivs       step added to Cr and Ci after each transform.
 *   ovs       step added to R0 and R1 after each transform.
 *
 * Working the arithmetic through (with the usual FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b), each pass computes, for n = 0..19,
 *     x[n] = 2 * sum_{k=0..9} ( Cr[k] * cos(pi*(2k+1)*n/20)
 *                             - Ci[k] * sin(pi*(2k+1)*n/20) )
 * and stores R0[j] = x[2j], R1[j] = x[2j+1].  Even samples carry the factor
 * KP2_000000000; odd samples are formed from sums/differences of the same
 * intermediates and carry KP1_414213562 (~ sqrt(2)) instead.
 *
 * The inputs are processed in four groups of five:
 *   Cr {2; 9,5; 6,1}   Ci {2; 5,9; 6,1}
 *   Cr {7; 0,4; 3,8}   Ci {7; 4,0; 3,8}
 * each reduced with the 5-point constants KP559016994 ~ sqrt(5)/4,
 * KP951056516 ~ sin(2*pi/5), KP587785252 ~ sin(pi/5), KP250000000 = 1/4.
 * Note that TL uses the operands in the opposite roles from TI
 * (KP951056516 * TC - KP587785252 * Tz); this is intentional and matches
 * the sign pattern of the Ci {4, 0, 3, 8} terms.
 */
static void r2cbIII_20(float *R0, float *R1, float *Cr, float *Ci, stride rs, stride csr, stride csi, INT v, INT ivs, INT ovs) { DK(KP1_414213562, +1.414213562373095048801688724209698078569671875); DK(KP2_000000000, +2.000000000000000000000000000000000000000000000); DK(KP250000000, +0.250000000000000000000000000000000000000000000); DK(KP951056516, +0.951056516295153572116439333379382143405698634); DK(KP587785252, +0.587785252292473129168705954639072768597652438); DK(KP559016994, +0.559016994374947424102293417182819058860154590); INT i; for (i = v; i > 0; i = i - 1, R0 = R0 + ovs, R1 = R1 + ovs, Cr = Cr + ivs, Ci = Ci + ivs, MAKE_VOLATILE_STRIDE(rs), MAKE_VOLATILE_STRIDE(csr), MAKE_VOLATILE_STRIDE(csi)) { E T1, Tj, T1k, T13, T8, Tk, T17, Ts, T16, TI, T18, T19, Ta, Tu, T1i; E TS, Th, Tv, TX, TD, TV, TL, TW, TY; { E T7, T12, T4, T11; T1 = Cr[WS(csr, 2)]; { E T5, T6, T2, T3; T5 = Cr[WS(csr, 9)]; T6 = Cr[WS(csr, 5)]; T7 = T5 + T6; T12 = T5 - T6; T2 = Cr[WS(csr, 6)]; T3 = Cr[WS(csr, 1)]; T4 = T2 + T3; T11 = T2 - T3; } Tj = KP559016994 * (T4 - T7); T1k = FNMS(KP951056516, T12, KP587785252 * T11); T13 = FMA(KP951056516, T11, KP587785252 * T12); T8 = T4 + T7; Tk = FNMS(KP250000000, T8, T1); } { E Tr, T15, To, T14; T17 = Ci[WS(csi, 2)]; { E Tp, Tq, Tm, Tn; Tp = Ci[WS(csi, 5)]; Tq = Ci[WS(csi, 9)]; Tr = Tp - Tq; T15 = Tp + Tq; Tm = Ci[WS(csi, 6)]; Tn = Ci[WS(csi, 1)]; To = Tm + Tn; T14 = Tm - Tn; } Ts = FMA(KP951056516, To, KP587785252 * Tr); T16 = KP559016994 * (T14 + T15); TI = FNMS(KP951056516, Tr, KP587785252 * To); T18 = T14 - T15; T19 = FNMS(KP250000000, T18, T17); } { E Tg, TR, Td, TQ; Ta = Cr[WS(csr, 7)]; { E Te, Tf, Tb, Tc; Te = Cr[0]; Tf = Cr[WS(csr, 4)]; Tg = Te + Tf; TR = Te - Tf; Tb = Cr[WS(csr, 3)]; Tc = Cr[WS(csr, 8)]; Td = Tb + Tc; TQ = Tb - Tc; } Tu = KP559016994 * (Td - Tg); T1i = FNMS(KP951056516, TR, KP587785252 * TQ); TS = FMA(KP951056516, TQ, KP587785252 * TR); Th = Td + Tg; Tv = FNMS(KP250000000, Th, Ta); } { E TC, TU, Tz, TT; TX = Ci[WS(csi, 7)]; { E
TA, TB, Tx, Ty; TA = Ci[WS(csi, 4)]; TB = Ci[0]; TC = TA - TB; TU = TB + TA; Tx = Ci[WS(csi, 3)]; Ty = Ci[WS(csi, 8)]; Tz = Tx + Ty; TT = Ty - Tx; } TD = FMA(KP951056516, Tz, KP587785252 * TC); TV = KP559016994 * (TT - TU); TL = FNMS(KP587785252, Tz, KP951056516 * TC); TW = TT + TU; TY = FMA(KP250000000, TW, TX); } { E T9, Ti, T1w, T1t, T1u, T1v; T9 = T1 + T8; Ti = Ta + Th; T1w = T9 - Ti; T1t = T18 + T17; T1u = TX - TW; T1v = T1t + T1u; R0[0] = KP2_000000000 * (T9 + Ti); R0[WS(rs, 5)] = KP2_000000000 * (T1u - T1t); R1[WS(rs, 2)] = KP1_414213562 * (T1v - T1w); R1[WS(rs, 7)] = KP1_414213562 * (T1w + T1v); } { E TJ, TO, T1m, T1q, TM, TN, T1j, T1r; { E TH, T1l, TK, T1h; TH = Tk - Tj; TJ = TH + TI; TO = TH - TI; T1l = T19 - T16; T1m = T1k + T1l; T1q = T1l - T1k; TK = Tv - Tu; TM = TK + TL; TN = TL - TK; T1h = TV + TY; T1j = T1h - T1i; T1r = T1i + T1h; } R0[WS(rs, 4)] = KP2_000000000 * (TJ + TM); R0[WS(rs, 6)] = KP2_000000000 * (TN - TO); R0[WS(rs, 9)] = KP2_000000000 * (T1r - T1q); R0[WS(rs, 1)] = KP2_000000000 * (T1j - T1m); { E T1p, T1s, T1n, T1o; T1p = TM - TJ; T1s = T1q + T1r; R1[WS(rs, 1)] = KP1_414213562 * (T1p - T1s); R1[WS(rs, 6)] = KP1_414213562 * (T1p + T1s); T1n = TO + TN; T1o = T1m + T1j; R1[WS(rs, 8)] = KP1_414213562 * (T1n - T1o); R1[WS(rs, 3)] = KP1_414213562 * (T1n + T1o); } } { E Tt, TG, T1b, T1f, TE, TF, T10, T1e; { E Tl, T1a, Tw, TZ; Tl = Tj + Tk; Tt = Tl - Ts; TG = Tl + Ts; T1a = T16 + T19; T1b = T13 + T1a; T1f = T1a - T13; Tw = Tu + Tv; TE = Tw + TD; TF = TD - Tw; TZ = TV - TY; T10 = TS + TZ; T1e = TZ - TS; } R0[WS(rs, 8)] = KP2_000000000 * (Tt + TE); R0[WS(rs, 2)] = KP2_000000000 * (TF - TG); R0[WS(rs, 7)] = KP2_000000000 * (T1f + T1e); R0[WS(rs, 3)] = KP2_000000000 * (T1b + T10); { E T1d, T1g, TP, T1c; T1d = TG + TF; T1g = T1e - T1f; R1[WS(rs, 4)] = KP1_414213562 * (T1d + T1g); R1[WS(rs, 9)] = KP1_414213562 * (T1g - T1d); TP = Tt - TE; T1c = T10 - T1b; R1[0] = KP1_414213562 * (TP + T1c); R1[WS(rs, 5)] = KP1_414213562 * (T1c - TP); } } } }
/*
 * n1_9 -- out-of-place 9-point complex DFT, straight-line generated-style
 * code (variant in which every rotation is a chain of FMA/FNMS calls with
 * tangent-style constants).
 *
 * Parameters, as used by the code below:
 *   ri, ii  input real/imaginary arrays, read at [WS(is, n)], n = 0..8.
 *   ro, io  output real/imaginary arrays, written at [WS(os, k)], k = 0..8.
 *   is, os  strides passed to WS() for input and output elements.
 *   v       number of transforms; the loop counts i = v down to 1.
 *   ivs     step added to ri and ii after each transform.
 *   ovs     step added to ro and io after each transform.
 *
 * Working the arithmetic through (with the usual FMA(a,b,c) = a*b + c and
 * FNMS(a,b,c) = c - a*b), each pass computes
 *     ro[k] + i*io[k] = sum_{n=0..8} (ri[n] + i*ii[n]) * exp(-2*pi*i*n*k/9)
 * Outputs 0, 3, 6 are written first (inside the innermost block), then
 * 1, 4, 7, and finally 2, 5, 8 in the trailing block.
 *
 * Constants: KP866025403 ~ sin(60 deg); KP176326980 ~ tan(10 deg);
 * KP839099631 ~ tan(40 deg); KP363970234 ~ tan(20 deg);
 * KP984807753 ~ sin(80 deg) and KP492403876 is half of it;
 * KP777861913 ~ cos(40 deg)/sin(80 deg); KP954188894 ~ cos(20 deg)/sin(80 deg);
 * KP852868531 ~ sin(60 deg) * sin(80 deg).
 *
 * NOTE(review): this file holds another definition of n1_9 with the same
 * signature (the variant using KP766044443 etc.); both cannot live in one
 * translation unit -- confirm how the file is built.
 */
static void n1_9(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) { DK(KP954188894, +0.954188894138671133499268364187245676532219158); DK(KP363970234, +0.363970234266202361351047882776834043890471784); DK(KP852868531, +0.852868531952443209628250963940074071936020296); DK(KP984807753, +0.984807753012208059366743024589523013670643252); DK(KP492403876, +0.492403876506104029683371512294761506835321626); DK(KP777861913, +0.777861913430206160028177977318626690410586096); DK(KP839099631, +0.839099631177280011763127298123181364687434283); DK(KP176326980, +0.176326980708464973471090386868618986121633062); DK(KP866025403, +0.866025403784438646763723170752936183471402627); DK(KP500000000, +0.500000000000000000000000000000000000000000000); INT i; for (i = v; i > 0; i = i - 1, ri = ri + ivs, ii = ii + ivs, ro = ro + ovs, io = io + ovs, MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) { E T17, TV, T14, TY, T11, T15; { E Tm, TM, TL, T5, Tl, T1f, Tb, Tt, Ta, T1c, TI, TX, TF, TW, Tc; E Td, Tp, Tq; { E T1, Th, Ti, Tj, T4, T2, T3; T1 = ri[0]; T2 = ri[WS(is, 3)]; T3 = ri[WS(is, 6)]; Th = ii[0]; Ti = ii[WS(is, 3)]; Tj = ii[WS(is, 6)]; T4 = T2 + T3; Tm = T3 - T2; { E T6, Tz, T7, T8, TA, TB, Tk; T6 = ri[WS(is, 1)]; TM = Ti - Tj; Tk = Ti + Tj; TL = FNMS(KP500000000, T4, T1); T5 = T1 + T4; Tz = ii[WS(is, 1)]; Tl = FNMS(KP500000000, Tk, Th); T1f = Th + Tk; T7 = ri[WS(is, 4)]; T8 = ri[WS(is, 7)]; TA = ii[WS(is, 4)]; TB = ii[WS(is, 7)]; { E TE, T9, TH, TC, TG, TD; Tb = ri[WS(is, 2)]; TE = T7 - T8; T9 = T7 + T8; TH = TB - TA; TC = TA + TB; Tt = ii[WS(is, 2)]; Ta = T6 + T9; TG = FNMS(KP500000000, T9, T6); T1c = Tz + TC; TD = FNMS(KP500000000, TC, Tz); TI = FNMS(KP866025403, TH, TG); TX = FMA(KP866025403, TH, TG); TF = FNMS(KP866025403, TE, TD); TW = FMA(KP866025403, TE, TD); Tc = ri[WS(is, 5)]; Td = ri[WS(is, 8)]; Tp = ii[WS(is, 5)]; Tq = ii[WS(is, 8)]; } } } { E Tn, TN, TZ, T10, TO, Ty, TJ, TP; { E Tw, Te, Tu, Tr; T17 = FNMS(KP866025403, Tm, Tl); Tn =
FMA(KP866025403, Tm, Tl); Tw = Td - Tc; Te = Tc + Td; Tu = Tp + Tq; Tr = Tp - Tq; TN = FMA(KP866025403, TM, TL); TV = FNMS(KP866025403, TM, TL); { E Tf, To, T1d, Tv; Tf = Tb + Te; To = FNMS(KP500000000, Te, Tb); T1d = Tt + Tu; Tv = FNMS(KP500000000, Tu, Tt); { E Ts, Tg, T1i, Tx; Ts = FMA(KP866025403, Tr, To); TZ = FNMS(KP866025403, Tr, To); Tg = Ta + Tf; T1i = Tf - Ta; Tx = FMA(KP866025403, Tw, Tv); T10 = FNMS(KP866025403, Tw, Tv); { E T1e, T1g, T1b, T1h; T1e = T1c - T1d; T1g = T1c + T1d; ro[0] = T5 + Tg; T1b = FNMS(KP500000000, Tg, T5); io[0] = T1f + T1g; T1h = FNMS(KP500000000, T1g, T1f); TO = FMA(KP176326980, Ts, Tx); Ty = FNMS(KP176326980, Tx, Ts); ro[WS(os, 6)] = FNMS(KP866025403, T1e, T1b); ro[WS(os, 3)] = FMA(KP866025403, T1e, T1b); io[WS(os, 6)] = FNMS(KP866025403, T1i, T1h); io[WS(os, 3)] = FMA(KP866025403, T1i, T1h); TJ = FNMS(KP839099631, TI, TF); TP = FMA(KP839099631, TF, TI); } } } } { E TS, TK, TU, TQ, TT, TR; TS = FMA(KP777861913, TJ, Ty); TK = FNMS(KP777861913, TJ, Ty); TU = FNMS(KP777861913, TP, TO); TQ = FMA(KP777861913, TP, TO); TT = FMA(KP492403876, TK, Tn); io[WS(os, 1)] = FNMS(KP984807753, TK, Tn); TR = FNMS(KP492403876, TQ, TN); ro[WS(os, 1)] = FMA(KP984807753, TQ, TN); io[WS(os, 4)] = FMA(KP852868531, TU, TT); io[WS(os, 7)] = FNMS(KP852868531, TU, TT); ro[WS(os, 7)] = FNMS(KP852868531, TS, TR); ro[WS(os, 4)] = FMA(KP852868531, TS, TR); T14 = FNMS(KP176326980, TW, TX); TY = FMA(KP176326980, TX, TW); T11 = FNMS(KP363970234, T10, TZ); T15 = FMA(KP363970234, TZ, T10); } } } { E T12, T1a, T16, T18, T13, T19; T12 = FNMS(KP954188894, T11, TY); T1a = FMA(KP954188894, T11, TY); T16 = FNMS(KP954188894, T15, T14); T18 = FMA(KP954188894, T15, T14); T13 = FNMS(KP492403876, T12, TV); ro[WS(os, 2)] = FMA(KP984807753, T12, TV); T19 = FMA(KP492403876, T18, T17); io[WS(os, 2)] = FNMS(KP984807753, T18, T17); ro[WS(os, 8)] = FMA(KP852868531, T16, T13); ro[WS(os, 5)] = FNMS(KP852868531, T16, T13); io[WS(os, 8)] = FMA(KP852868531, T1a, T19); io[WS(os, 5)] =
FNMS(KP852868531, T1a, T19); } } }
static void n2fv_8(const R *ri, const R *ii, R *ro, R *io, stride is, stride os, INT v, INT ivs, INT ovs) { DVK(KP707106781, +0.707106781186547524400844362104849039284835938); { INT i; const R *xi; R *xo; xi = ri; xo = ro; for (i = v; i > 0; i = i - VL, xi = xi + (VL * ivs), xo = xo + (VL * ovs), MAKE_VOLATILE_STRIDE(is), MAKE_VOLATILE_STRIDE(os)) { V T1, T2, Tc, Td, T4, T5, T7, T8; T1 = LD(&(xi[0]), ivs, &(xi[0])); T2 = LD(&(xi[WS(is, 4)]), ivs, &(xi[0])); Tc = LD(&(xi[WS(is, 2)]), ivs, &(xi[0])); Td = LD(&(xi[WS(is, 6)]), ivs, &(xi[0])); T4 = LD(&(xi[WS(is, 1)]), ivs, &(xi[WS(is, 1)])); T5 = LD(&(xi[WS(is, 5)]), ivs, &(xi[WS(is, 1)])); T7 = LD(&(xi[WS(is, 7)]), ivs, &(xi[WS(is, 1)])); T8 = LD(&(xi[WS(is, 3)]), ivs, &(xi[WS(is, 1)])); { V T3, Tj, Te, Tk, T6, Tm, T9, Tn, Tp, Tl; T3 = VSUB(T1, T2); Tj = VADD(T1, T2); Te = VSUB(Tc, Td); Tk = VADD(Tc, Td); T6 = VSUB(T4, T5); Tm = VADD(T4, T5); T9 = VSUB(T7, T8); Tn = VADD(T7, T8); Tp = VSUB(Tj, Tk); Tl = VADD(Tj, Tk); { V Tq, To, Ta, Tf; Tq = VSUB(Tn, Tm); To = VADD(Tm, Tn); Ta = VADD(T6, T9); Tf = VSUB(T9, T6); { V Tr, Ts, Tt, Tu, Tg, Ti, Tb, Th; Tr = VADD(Tl, To); STM2(&(xo[0]), Tr, ovs, &(xo[0])); Ts = VSUB(Tl, To); STM2(&(xo[8]), Ts, ovs, &(xo[0])); Tt = VFMAI(Tq, Tp); STM2(&(xo[4]), Tt, ovs, &(xo[0])); Tu = VFNMSI(Tq, Tp); STM2(&(xo[12]), Tu, ovs, &(xo[0])); Tg = VFNMS(LDK(KP707106781), Tf, Te); Ti = VFMA(LDK(KP707106781), Tf, Te); Tb = VFMA(LDK(KP707106781), Ta, T3); Th = VFNMS(LDK(KP707106781), Ta, T3); { V Tv, Tw, Tx, Ty; Tv = VFMAI(Ti, Th); STM2(&(xo[6]), Tv, ovs, &(xo[2])); STN2(&(xo[4]), Tt, Tv, ovs); Tw = VFNMSI(Ti, Th); STM2(&(xo[10]), Tw, ovs, &(xo[2])); STN2(&(xo[8]), Ts, Tw, ovs); Tx = VFMAI(Tg, Tb); STM2(&(xo[14]), Tx, ovs, &(xo[2])); STN2(&(xo[12]), Tu, Tx, ovs); Ty = VFNMSI(Tg, Tb); STM2(&(xo[2]), Ty, ovs, &(xo[2])); STN2(&(xo[0]), Tr, Ty, ovs); } } } } } } VLEAVE(); }
static void hc2cb2_8(R *Rp, R *Ip, R *Rm, R *Im, const R *W, stride rs, INT mb, INT me, INT ms) { DK(KP707106781, +0.707106781186547524400844362104849039284835938); INT m; for (m = mb, W = W + ((mb - 1) * 6); m < me; m = m + 1, Rp = Rp + ms, Ip = Ip + ms, Rm = Rm - ms, Im = Im - ms, W = W + 6, MAKE_VOLATILE_STRIDE(rs)) { E Tf, Ti, TK, Tq, TH, TT, TX, TW, TY, TU, TI; { E Tg, Tl, Tp, Th, T1n, T1t, Tj; Tf = W[0]; Tg = W[2]; Tl = W[4]; Tp = W[5]; Ti = W[1]; Th = Tf * Tg; T1n = Tf * Tl; T1t = Tf * Tp; Tj = W[3]; { E T1o, T1u, Tk, T1b, To, T1e, T13, TP, T1p, T7, T1h, T1v, TZ, Tv, T1i; E TB, TA, TQ, Te, T1w, TE, T1j; { E Tr, T3, Ts, T1f, TO, TL, T6, Tt; { E TM, TN, T4, T5; { E T1, Tn, T2, TJ, Tm; T1 = Rp[0]; T1o = FMA(Ti, Tp, T1n); T1u = FNMS(Ti, Tl, T1t); Tk = FMA(Ti, Tj, Th); T1b = FNMS(Ti, Tj, Th); Tn = Tf * Tj; T2 = Rm[WS(rs, 3)]; TM = Ip[0]; TJ = Tk * Tp; Tm = Tk * Tl; To = FNMS(Ti, Tg, Tn); T1e = FMA(Ti, Tg, Tn); Tr = T1 - T2; T3 = T1 + T2; TK = FNMS(To, Tl, TJ); Tq = FMA(To, Tp, Tm); TN = Im[WS(rs, 3)]; } T4 = Rp[WS(rs, 2)]; T5 = Rm[WS(rs, 1)]; Ts = Ip[WS(rs, 2)]; T1f = TM - TN; TO = TM + TN; TL = T4 - T5; T6 = T4 + T5; Tt = Im[WS(rs, 1)]; } { E Tw, Ta, TC, Tz, Td, TD; { E Tx, Ty, Tb, Tc; { E T8, T1g, Tu, T9; T8 = Rp[WS(rs, 1)]; T13 = TO - TL; TP = TL + TO; T1p = T3 - T6; T7 = T3 + T6; T1g = Ts - Tt; Tu = Ts + Tt; T9 = Rm[WS(rs, 2)]; Tx = Ip[WS(rs, 1)]; T1h = T1f + T1g; T1v = T1f - T1g; TZ = Tr + Tu; Tv = Tr - Tu; Tw = T8 - T9; Ta = T8 + T9; Ty = Im[WS(rs, 2)]; } Tb = Rm[0]; Tc = Rp[WS(rs, 3)]; TC = Ip[WS(rs, 3)]; T1i = Tx - Ty; Tz = Tx + Ty; TB = Tb - Tc; Td = Tb + Tc; TD = Im[0]; } TA = Tw - Tz; TQ = Tw + Tz; Te = Ta + Td; T1w = Ta - Td; TE = TC + TD; T1j = TC - TD; } } { E T1x, T1k, T1r, TG, TS, T19, T15, T17, T11, T16, T12; { E T1B, T1z, T10, T1A, T1C; T1x = T1v - T1w; T1B = T1w + T1v; Rp[0] = T7 + Te; { E T1q, TR, TF, T14; T1k = T1i + T1j; T1q = T1j - T1i; TR = TB + TE; TF = TB - TE; T1r = T1p - T1q; T1z = T1p + T1q; Rm[0] = T1h + T1k; TG = TA + TF; T14 = TA - 
TF; TS = TQ - TR; T10 = TQ + TR; T1A = Tk * T1z; T19 = FNMS(KP707106781, T14, T13); T15 = FMA(KP707106781, T14, T13); T1C = Tk * T1B; } T17 = FMA(KP707106781, T10, TZ); T11 = FNMS(KP707106781, T10, TZ); Rp[WS(rs, 1)] = FNMS(To, T1B, T1A); T16 = Tg * T15; Rm[WS(rs, 1)] = FMA(To, T1z, T1C); } T12 = Tg * T11; { E T1l, T1a, T1c, T18; Im[WS(rs, 1)] = FMA(Tj, T11, T16); Ip[WS(rs, 1)] = FNMS(Tj, T15, T12); T18 = Tl * T17; T1l = T1h - T1k; T1a = Tl * T19; T1c = T7 - Te; Ip[WS(rs, 3)] = FNMS(Tp, T19, T18); { E T1s, T1m, T1d, T1y, TV; Im[WS(rs, 3)] = FMA(Tp, T17, T1a); T1m = T1e * T1c; T1d = T1b * T1c; T1s = T1o * T1r; Rm[WS(rs, 2)] = FMA(T1b, T1l, T1m); Rp[WS(rs, 2)] = FNMS(T1e, T1l, T1d); Rp[WS(rs, 3)] = FNMS(T1u, T1x, T1s); T1y = T1o * T1x; TV = FMA(KP707106781, TG, Tv); TH = FNMS(KP707106781, TG, Tv); TT = FNMS(KP707106781, TS, TP); TX = FMA(KP707106781, TS, TP); Rm[WS(rs, 3)] = FMA(T1u, T1r, T1y); TW = Tf * TV; TY = Ti * TV; } } } } } Ip[0] = FNMS(Ti, TX, TW); Im[0] = FMA(Tf, TX, TY); TU = TK * TH; TI = Tq * TH; Im[WS(rs, 2)] = FMA(Tq, TT, TU); Ip[WS(rs, 2)] = FNMS(TK, TT, TI); } }