/*
 * Arbitrary M,K,N L0-blocked matrix matrix multiply (double precision).
 *
 * Traversal, as far as this function itself shows:
 *   - C is visited as M rows of N elements, consecutive rows being Cstride
 *     elements apart, in 2x2 tiles.
 *   - A is visited as M rows of K elements, consecutive rows being Astride
 *     elements apart, two K elements per kernel call.
 *   - B is entered at B + (column offset) and handed to the kernels together
 *     with Bstride.  This function never advances the B cursor along K, so
 *     the mul_md* macros presumably step it themselves
 *     (NOTE(review): confirm against the macro definitions, which are not
 *     part of this block).
 *
 * Every tile is bracketed by a LOADrxc / STORErxc pair on the C tile; what
 * exactly is loaded and stored (and hence whether the product is added to or
 * replaces C) is decided by those macros.
 *
 * Odd M, K and N are handled with dedicated edge kernels:
 *   pass 1: paired rows    x paired columns  (2x2 tiles)
 *   pass 2: paired rows    x last odd column (2x1 tiles)
 *   pass 3: last odd row   x paired columns  (1x2 tiles), then the 1x1 corner
 *
 * All loops terminate on pointer equality with an end pointer derived from
 * the even part of the corresponding dimension, so callers are presumably
 * expected to pass non-negative dimensions.
 */
static void mul_mdmd_md_l1_arb_all(const int M, const int K, const int N, const double *const A, const double *const B, double *const C, const int Astride, const int Bstride, const int Cstride)
{
    /* Split each dimension into an even part plus a 0/1 leftover. */
    const int k_odd = K & 1;
    const int k_even = K - k_odd;
    const int m_odd = M & 1;
    const int m_even = M - m_odd;
    const int n_odd = N & 1;
    const int n_even = N - n_odd;

    /* Distance, in elements, between successive pairs of rows. */
    const int a_pair_step = Astride * 2;
    const int c_pair_step = Cstride * 2;

    /* First row of C that is not part of a complete row pair. */
    double *const c_rows_end = C + m_even * Cstride;

    const double *a_row;        /* start of the current row (pair) of A  */
    const double *b_col;        /* start of the current column (pair) of B */
    double *c_row;              /* start of the current row (pair) of C  */
    const double *a_top;        /* K cursor in the upper row of A        */
    const double *a_bot;        /* K cursor in the lower row of A        */
    const double *b_cur;        /* B cursor handed to the kernels        */
    double *c_tile;             /* top-left element of the current C tile */
    register double acc00, acc01, acc10, acc11;

    /* Pass 1: paired rows x paired columns. */
    c_row = C;
    a_row = A;
    while (c_row != c_rows_end) {
        const double *const a_k_end = a_row + k_even;
        double *const c_cols_end = c_row + n_even;

        b_col = B;
        c_tile = c_row;
        while (c_tile != c_cols_end) {
            a_top = a_row;
            a_bot = a_top + Astride;
            b_cur = b_col;
            LOAD2x2(acc00, acc01, acc10, acc11, c_tile, Cstride);
            while (a_top != a_k_end) {
                mul_md2x2md2x2_md2x2(acc00, acc01, acc10, acc11, a_top, a_bot, b_cur, Bstride);
                a_top += 2;
                a_bot += 2;
            }
            if (k_odd) {
                /* One K element is left over. */
                mul_md2x1md1x2_md2x2(acc00, acc01, acc10, acc11, a_top, a_bot, b_cur, Bstride);
            }
            STORE2x2(acc00, acc01, acc10, acc11, c_tile, Cstride);
            b_col += 2;
            c_tile += 2;
        }
        c_row += c_pair_step;
        a_row += a_pair_step;
    }

    /*
     * Pass 2: paired rows x the single trailing column (if N is odd).
     * The row loop runs even when N is even: it is what leaves a_row/c_row
     * on the first unpaired row for pass 3.
     */
    c_row = C;
    a_row = A;
    while (c_row != c_rows_end) {
        const double *const a_k_end = a_row + k_even;

        b_col = B + n_even;
        c_tile = c_row + n_even;
        if (n_odd) {
            a_top = a_row;
            a_bot = a_top + Astride;
            b_cur = b_col;
            LOAD2x1(acc00, acc10, c_tile, Cstride);
            while (a_top != a_k_end) {
                mul_md2x2md2x1_md2x1(acc00, acc10, a_top, a_bot, b_cur, Bstride);
                a_top += 2;
                a_bot += 2;
            }
            if (k_odd) {
                mul_md2x1md1x1_md2x1(acc00, acc10, a_top, a_bot, b_cur, Bstride);
            }
            STORE2x1(acc00, acc10, c_tile, Cstride);
        }
        c_row += c_pair_step;
        a_row += a_pair_step;
    }

    /*
     * Pass 3: the single trailing row (if M is odd).  a_row and c_row are
     * used exactly as the loop above left them, i.e. c_row == c_rows_end.
     */
    if (m_odd) {
        const double *const a_k_end = a_row + k_even;
        double *const c_cols_end = c_row + n_even;

        b_col = B;
        c_tile = c_row;
        while (c_tile != c_cols_end) {
            a_top = a_row;
            b_cur = b_col;
            LOAD1x2(acc00, acc01, c_tile, Cstride);
            while (a_top != a_k_end) {
                mul_md1x2md2x2_md1x2(acc00, acc01, a_top, b_cur, Bstride);
                a_top += 2;
            }
            if (k_odd) {
                mul_md1x1md1x2_md1x2(acc00, acc01, a_top, b_cur, Bstride);
            }
            STORE1x2(acc00, acc01, c_tile, Cstride);
            b_col += 2;
            c_tile += 2;
        }

        /* Bottom-right 1x1 corner; b_col/c_tile sit just past the paired columns. */
        if (n_odd) {
            a_top = a_row;
            b_cur = b_col;
            LOAD1x1(acc00, c_tile, Cstride);
            while (a_top != a_k_end) {
                mul_md1x2md2x1_md1x1(acc00, a_top, b_cur, Bstride);
                a_top += 2;
            }
            if (k_odd) {
                mul_md1x1md1x1_md1x1(acc00, a_top, b_cur, Bstride);
            }
            STORE1x1(acc00, c_tile, Cstride);
        }
    }
}
/*
 * Arbitrary M,K,N L0-blocked matrix matrix multiply (single precision).
 *
 * Traversal, as far as this function itself shows:
 *   - C is visited as M rows of N elements (rows Cstride elements apart),
 *     one row at a time, in strips of 10 columns.
 *   - A is visited as M rows of K elements (rows Astride elements apart),
 *     two K elements per kernel call.
 *   - B is visited as N rows of K elements (rows Bstride elements apart):
 *     column j of C is paired with row j of B, and the B cursors advance
 *     along K in step with the A cursor.  That matches a transposed second
 *     operand, which the "mft" in the name presumably stands for.
 *
 * Every strip is bracketed by a LOAD1xn / STORE1xn pair on the C strip; what
 * those macros actually load and store (the "beta0" in the name suggests C's
 * previous contents are not meant to contribute) is decided by their
 * definitions, which are not part of this block -- NOTE(review): confirm.
 *
 * The N % 10 leftover columns are handled by decomposing the remainder into
 * strips of 8, 4, 2 and 1 columns; an odd K gets one extra single-element
 * kernel call per strip.
 *
 * All loops terminate on pointer equality with an end pointer derived from
 * the blocked part of the corresponding dimension, so callers are presumably
 * expected to pass non-negative dimensions.
 */
static void mul_mfmft_mf_beta0_l1_arb_all(const int M, const int K, const int N, const float *const A, const float *const B, float *const C, const int Astride, const int Bstride, const int Cstride)
{
    /* K is consumed two at a time, N ten at a time; M has no blocking. */
    const int k_odd = K & 1;
    const int k_even = K - k_odd;
    const int n_tail = N % 10;
    const int n_full = N - n_tail;

    /* Distance, in elements, between successive 10-row groups of B. */
    const int b_block_step = Bstride * 10;

    /* One past the last row of C. */
    float *const c_rows_end = C + M * Cstride;

    const float *a_row;         /* start of the current row of A          */
    const float *b_blk;         /* first B row of the current strip       */
    float *c_row;               /* start of the current row of C          */
    const float *a_cur;         /* K cursor in the current row of A       */
    const float *brow0, *brow1, *brow2, *brow3, *brow4;
    const float *brow5, *brow6, *brow7, *brow8, *brow9;   /* K cursors, one per B row */
    float *c_cur;               /* first element of the current C strip   */
    register float acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8, acc9;

    /* Pass 1: every row of C, full strips of 10 columns. */
    c_row = C;
    a_row = A;
    while (c_row != c_rows_end) {
        const float *const a_k_end = a_row + k_even;
        float *const c_full_end = c_row + n_full;

        b_blk = B;
        c_cur = c_row;
        while (c_cur != c_full_end) {
            a_cur = a_row;
            brow0 = b_blk;
            brow1 = brow0 + Bstride;
            brow2 = brow1 + Bstride;
            brow3 = brow2 + Bstride;
            brow4 = brow3 + Bstride;
            brow5 = brow4 + Bstride;
            brow6 = brow5 + Bstride;
            brow7 = brow6 + Bstride;
            brow8 = brow7 + Bstride;
            brow9 = brow8 + Bstride;
            LOAD1x10(acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8, acc9, c_cur, Cstride);
            while (a_cur != a_k_end) {
                mul_mf1x2tmf2x10_mf1x10(acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8, acc9,
                                        a_cur, brow0, brow1, brow2, brow3, brow4,
                                        brow5, brow6, brow7, brow8, brow9);
                a_cur += 2;
                brow0 += 2;
                brow1 += 2;
                brow2 += 2;
                brow3 += 2;
                brow4 += 2;
                brow5 += 2;
                brow6 += 2;
                brow7 += 2;
                brow8 += 2;
                brow9 += 2;
            }
            if (k_odd) {
                /* One K element is left over. */
                mul_mf1x1tmf1x10_mf1x10(acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8, acc9,
                                        a_cur, brow0, brow1, brow2, brow3, brow4,
                                        brow5, brow6, brow7, brow8, brow9);
            }
            STORE1x10(acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8, acc9, c_cur, Cstride);
            b_blk += b_block_step;
            c_cur += 10;
        }
        c_row += Cstride;
        a_row += Astride;
    }

    /*
     * Pass 2: every row of C, the N % 10 leftover columns.  The remainder is
     * consumed as 8 + 4 + 2 + 1 according to its set bits; b_blk and c_cur
     * move past each strip as it is finished.
     */
    c_row = C;
    a_row = A;
    while (c_row != c_rows_end) {
        const float *const a_k_end = a_row + k_even;

        b_blk = B + n_full * Bstride;
        c_cur = c_row + n_full;

        if (n_tail & 0x8) {
            a_cur = a_row;
            brow0 = b_blk;
            brow1 = brow0 + Bstride;
            brow2 = brow1 + Bstride;
            brow3 = brow2 + Bstride;
            brow4 = brow3 + Bstride;
            brow5 = brow4 + Bstride;
            brow6 = brow5 + Bstride;
            brow7 = brow6 + Bstride;
            LOAD1x8(acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, c_cur, Cstride);
            while (a_cur != a_k_end) {
                mul_mf1x2tmf2x8_mf1x8(acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7,
                                      a_cur, brow0, brow1, brow2, brow3, brow4, brow5, brow6, brow7);
                a_cur += 2;
                brow0 += 2;
                brow1 += 2;
                brow2 += 2;
                brow3 += 2;
                brow4 += 2;
                brow5 += 2;
                brow6 += 2;
                brow7 += 2;
            }
            if (k_odd) {
                mul_mf1x1tmf1x8_mf1x8(acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7,
                                      a_cur, brow0, brow1, brow2, brow3, brow4, brow5, brow6, brow7);
            }
            STORE1x8(acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, c_cur, Cstride);
            b_blk += Bstride * 8;
            c_cur += 8;
        }

        if (n_tail & 0x4) {
            a_cur = a_row;
            brow0 = b_blk;
            brow1 = brow0 + Bstride;
            brow2 = brow1 + Bstride;
            brow3 = brow2 + Bstride;
            LOAD1x4(acc0, acc1, acc2, acc3, c_cur, Cstride);
            while (a_cur != a_k_end) {
                mul_mf1x2tmf2x4_mf1x4(acc0, acc1, acc2, acc3, a_cur, brow0, brow1, brow2, brow3);
                a_cur += 2;
                brow0 += 2;
                brow1 += 2;
                brow2 += 2;
                brow3 += 2;
            }
            if (k_odd) {
                mul_mf1x1tmf1x4_mf1x4(acc0, acc1, acc2, acc3, a_cur, brow0, brow1, brow2, brow3);
            }
            STORE1x4(acc0, acc1, acc2, acc3, c_cur, Cstride);
            b_blk += Bstride * 4;
            c_cur += 4;
        }

        if (n_tail & 0x2) {
            a_cur = a_row;
            brow0 = b_blk;
            brow1 = brow0 + Bstride;
            LOAD1x2(acc0, acc1, c_cur, Cstride);
            while (a_cur != a_k_end) {
                mul_mf1x2tmf2x2_mf1x2(acc0, acc1, a_cur, brow0, brow1);
                a_cur += 2;
                brow0 += 2;
                brow1 += 2;
            }
            if (k_odd) {
                mul_mf1x1tmf1x2_mf1x2(acc0, acc1, a_cur, brow0, brow1);
            }
            STORE1x2(acc0, acc1, c_cur, Cstride);
            b_blk += Bstride * 2;
            c_cur += 2;
        }

        if (n_tail & 0x1) {
            a_cur = a_row;
            brow0 = b_blk;
            LOAD1x1(acc0, c_cur, Cstride);
            while (a_cur != a_k_end) {
                mul_mf1x2tmf2x1_mf1x1(acc0, a_cur, brow0);
                a_cur += 2;
                brow0 += 2;
            }
            if (k_odd) {
                mul_mf1x1tmf1x1_mf1x1(acc0, a_cur, brow0);
            }
            STORE1x1(acc0, c_cur, Cstride);
        }

        c_row += Cstride;
        a_row += Astride;
    }
}