/*
 * Double-precision tile driver for an M x K matrix A, a matrix B with N
 * columns and an M x N matrix C, where M, K and N may each be odd.
 *
 * C is walked in two-row panels and two-column tiles, and K is consumed two
 * elements at a time; the odd leftovers of each dimension are handled by the
 * narrower macro variants.  Rows of A and C are Astride / Cstride elements
 * apart, and consecutive column tiles of B are adjacent in memory.
 *
 * The LOAD*, mul_* and STORE* macros are defined elsewhere.  Judging by their
 * names they load a tile of C, accumulate a small A-by-B product into it and
 * write it back -- confirm against their definitions.  This function never
 * advances the B cursor inside the K loops, so the multiply macros are
 * presumably the ones stepping it by Bstride.
 */
static void
mul_mdmd_md_l1_arb_all(const int M, const int K, const int N, const double *const A, const double *const B, double *const C, const int Astride, const int Bstride, const int Cstride)
{
   /* Split every dimension into an even part and a 0/1 leftover. */
   const int k_odd = K & 1;
   const int k_even = K - k_odd;
   const int m_odd = M & 1;
   const int m_even = M - m_odd;
   const int n_odd = N & 1;
   const int n_even = N - n_odd;
   /* Distance between successive two-row panels of A and of C. */
   const int a_panel_step = Astride*2;
   const int c_panel_step = Cstride*2;
   /* First row of C that lies beyond the even-M region. */
   double *const c_panels_end = C + m_even*Cstride;
   const double *a_panel, *b_col;
   double *c_panel;
   const double *a_top, *a_bot;
   const double *b_cur;
   double *c_tile;
   register double acc00, acc01, acc10, acc11;

   /* Pass 1: every full 2x2 tile of C. */
   a_panel = A;
   for (c_panel = C; c_panel != c_panels_end; c_panel += c_panel_step) {
      const double *const a_k_end = a_panel + k_even;
      double *const c_tiles_end = c_panel + n_even;

      b_col = B;
      for (c_tile = c_panel; c_tile != c_tiles_end; c_tile += 2) {
         a_top = a_panel;
         a_bot = a_top + Astride;
         b_cur = b_col;
         LOAD2x2(acc00,acc01,acc10,acc11,c_tile,Cstride);
         for (; a_top != a_k_end; a_top += 2, a_bot += 2) {
            mul_md2x2md2x2_md2x2(acc00,acc01,acc10,acc11,a_top,a_bot,b_cur,Bstride);
         }
         if (k_odd) {
            /* One trailing element of K remains. */
            mul_md2x1md1x2_md2x2(acc00,acc01,acc10,acc11,a_top,a_bot,b_cur,Bstride);
         }
         STORE2x2(acc00,acc01,acc10,acc11,c_tile,Cstride);
         b_col += 2;
      }
      a_panel += a_panel_step;
   }

   /*
    * Pass 2: the single leftover column of every two-row panel.  When N is
    * even nothing is done here; pass 1 has already left a_panel and c_panel
    * at the end of the even-M region, which is where the last-row code below
    * expects them.
    */
   if (n_odd) {
      a_panel = A;
      for (c_panel = C; c_panel != c_panels_end; c_panel += c_panel_step) {
         const double *const a_k_end = a_panel + k_even;

         c_tile = c_panel + n_even;
         a_top = a_panel;
         a_bot = a_top + Astride;
         b_cur = B + n_even;
         LOAD2x1(acc00,acc10,c_tile,Cstride);
         for (; a_top != a_k_end; a_top += 2, a_bot += 2) {
            mul_md2x2md2x1_md2x1(acc00,acc10,a_top,a_bot,b_cur,Bstride);
         }
         if (k_odd) {
            mul_md2x1md1x1_md2x1(acc00,acc10,a_top,a_bot,b_cur,Bstride);
         }
         STORE2x1(acc00,acc10,c_tile,Cstride);
         a_panel += a_panel_step;
      }
   }

   /* Pass 3: the single leftover row, if M is odd. */
   if (m_odd) {
      const double *const a_k_end = a_panel + k_even;
      double *const c_tiles_end = c_panel + n_even;

      /* 1x2 tiles along the last row. */
      b_col = B;
      for (c_tile = c_panel; c_tile != c_tiles_end; c_tile += 2) {
         a_top = a_panel;
         b_cur = b_col;
         LOAD1x2(acc00,acc01,c_tile,Cstride);
         for (; a_top != a_k_end; a_top += 2) {
            mul_md1x2md2x2_md1x2(acc00,acc01,a_top,b_cur,Bstride);
         }
         if (k_odd) {
            mul_md1x1md1x2_md1x2(acc00,acc01,a_top,b_cur,Bstride);
         }
         STORE1x2(acc00,acc01,c_tile,Cstride);
         b_col += 2;
      }

      /* Bottom-right corner element; b_col and c_tile now sit at column n_even. */
      if (n_odd) {
         a_top = a_panel;
         b_cur = b_col;
         LOAD1x1(acc00,c_tile,Cstride);
         for (; a_top != a_k_end; a_top += 2) {
            mul_md1x2md2x1_md1x1(acc00,a_top,b_cur,Bstride);
         }
         if (k_odd) {
            mul_md1x1md1x1_md1x1(acc00,a_top,b_cur,Bstride);
         }
         STORE1x1(acc00,c_tile,Cstride);
      }
   }
}
/* Example #2 */
/* 0 */
/*
 * Single-precision tile driver for an M x K matrix A and an M x N matrix C,
 * with B read as N rows of K contiguous elements (rows Bstride apart), i.e.
 * in transposed layout.  M, K and N are arbitrary.
 *
 * C is processed one row at a time in tiles of ten columns, and K is
 * consumed two elements at a time.  The N % 10 leftover columns are covered
 * by at most one tile each of width 8, 4, 2 and 1, selected by the bits of
 * the remainder.
 *
 * The LOAD*, mul_* and STORE* macros are defined elsewhere; what they do to
 * the accumulators (in particular whether LOAD reads C or clears them, as the
 * "beta0" in the function name hints) must be checked against their
 * definitions.
 */
static void
mul_mfmft_mf_beta0_l1_arb_all(const int M, const int K, const int N, const float *const A, const float *const B, float *const C, const int Astride, const int Bstride, const int Cstride)
{
   /* K splits into an even part plus a 0/1 leftover; N into tens plus a tail. */
   const int k_odd = K & 1;
   const int k_even = K - k_odd;
   const int n_tail = N % 10;
   const int n_full = N - n_tail;
   /* Ten rows of B feed one ten-column tile of C. */
   const int b_block_step = Bstride*10;
   /* Rows are taken singly, so the whole of M is covered by the row loops. */
   float *const c_rows_end = C + M*Cstride;
   const float *a_row, *b_blk;
   float *c_row, *c_tile;
   const float *a_cur;
   const float *b_row0, *b_row1, *b_row2, *b_row3, *b_row4;
   const float *b_row5, *b_row6, *b_row7, *b_row8, *b_row9;
   register float acc0, acc1, acc2, acc3, acc4, acc5, acc6, acc7, acc8, acc9;

   /* Pass 1: every full 1x10 tile of C. */
   a_row = A;
   for (c_row = C; c_row != c_rows_end; c_row += Cstride) {
      const float *const a_k_end = a_row + k_even;
      float *const c_tiles_end = c_row + n_full;

      b_blk = B;
      for (c_tile = c_row; c_tile != c_tiles_end; c_tile += 10) {
         a_cur = a_row;
         b_row0 = b_blk;
         b_row1 = b_row0 + Bstride;
         b_row2 = b_row1 + Bstride;
         b_row3 = b_row2 + Bstride;
         b_row4 = b_row3 + Bstride;
         b_row5 = b_row4 + Bstride;
         b_row6 = b_row5 + Bstride;
         b_row7 = b_row6 + Bstride;
         b_row8 = b_row7 + Bstride;
         b_row9 = b_row8 + Bstride;
         LOAD1x10(acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,acc8,acc9,c_tile,Cstride);
         for (; a_cur != a_k_end; a_cur += 2, b_row0 += 2, b_row1 += 2, b_row2 += 2, b_row3 += 2, b_row4 += 2, b_row5 += 2, b_row6 += 2, b_row7 += 2, b_row8 += 2, b_row9 += 2) {
            mul_mf1x2tmf2x10_mf1x10(acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,acc8,acc9,a_cur,b_row0,b_row1,b_row2,b_row3,b_row4,b_row5,b_row6,b_row7,b_row8,b_row9);
         }
         if (k_odd) {
            /* One trailing element of K remains. */
            mul_mf1x1tmf1x10_mf1x10(acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,acc8,acc9,a_cur,b_row0,b_row1,b_row2,b_row3,b_row4,b_row5,b_row6,b_row7,b_row8,b_row9);
         }
         STORE1x10(acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,acc8,acc9,c_tile,Cstride);
         b_blk += b_block_step;
      }
      a_row += Astride;
   }

   /*
    * Pass 2: the leftover columns of every row.  With a zero remainder no
    * width bit is set, so the whole pass can be skipped.
    */
   if (n_tail != 0) {
      a_row = A;
      for (c_row = C; c_row != c_rows_end; c_row += Cstride) {
         const float *const a_k_end = a_row + k_even;

         /* Start at the first column / B row not covered by pass 1. */
         b_blk = B + n_full*Bstride;
         c_tile = c_row + n_full;

         if (n_tail & 0x8) {
            a_cur = a_row;
            b_row0 = b_blk;
            b_row1 = b_row0 + Bstride;
            b_row2 = b_row1 + Bstride;
            b_row3 = b_row2 + Bstride;
            b_row4 = b_row3 + Bstride;
            b_row5 = b_row4 + Bstride;
            b_row6 = b_row5 + Bstride;
            b_row7 = b_row6 + Bstride;
            LOAD1x8(acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,c_tile,Cstride);
            for (; a_cur != a_k_end; a_cur += 2, b_row0 += 2, b_row1 += 2, b_row2 += 2, b_row3 += 2, b_row4 += 2, b_row5 += 2, b_row6 += 2, b_row7 += 2) {
               mul_mf1x2tmf2x8_mf1x8(acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,a_cur,b_row0,b_row1,b_row2,b_row3,b_row4,b_row5,b_row6,b_row7);
            }
            if (k_odd) {
               mul_mf1x1tmf1x8_mf1x8(acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,a_cur,b_row0,b_row1,b_row2,b_row3,b_row4,b_row5,b_row6,b_row7);
            }
            STORE1x8(acc0,acc1,acc2,acc3,acc4,acc5,acc6,acc7,c_tile,Cstride);
            /* Step past the eight columns just written. */
            b_blk += Bstride*8;
            c_tile += 8;
         }

         if (n_tail & 0x4) {
            a_cur = a_row;
            b_row0 = b_blk;
            b_row1 = b_row0 + Bstride;
            b_row2 = b_row1 + Bstride;
            b_row3 = b_row2 + Bstride;
            LOAD1x4(acc0,acc1,acc2,acc3,c_tile,Cstride);
            for (; a_cur != a_k_end; a_cur += 2, b_row0 += 2, b_row1 += 2, b_row2 += 2, b_row3 += 2) {
               mul_mf1x2tmf2x4_mf1x4(acc0,acc1,acc2,acc3,a_cur,b_row0,b_row1,b_row2,b_row3);
            }
            if (k_odd) {
               mul_mf1x1tmf1x4_mf1x4(acc0,acc1,acc2,acc3,a_cur,b_row0,b_row1,b_row2,b_row3);
            }
            STORE1x4(acc0,acc1,acc2,acc3,c_tile,Cstride);
            b_blk += Bstride*4;
            c_tile += 4;
         }

         if (n_tail & 0x2) {
            a_cur = a_row;
            b_row0 = b_blk;
            b_row1 = b_row0 + Bstride;
            LOAD1x2(acc0,acc1,c_tile,Cstride);
            for (; a_cur != a_k_end; a_cur += 2, b_row0 += 2, b_row1 += 2) {
               mul_mf1x2tmf2x2_mf1x2(acc0,acc1,a_cur,b_row0,b_row1);
            }
            if (k_odd) {
               mul_mf1x1tmf1x2_mf1x2(acc0,acc1,a_cur,b_row0,b_row1);
            }
            STORE1x2(acc0,acc1,c_tile,Cstride);
            b_blk += Bstride*2;
            c_tile += 2;
         }

         if (n_tail & 0x1) {
            /* Last single column; nothing follows, so the cursors stay put. */
            a_cur = a_row;
            b_row0 = b_blk;
            LOAD1x1(acc0,c_tile,Cstride);
            for (; a_cur != a_k_end; a_cur += 2, b_row0 += 2) {
               mul_mf1x2tmf2x1_mf1x1(acc0,a_cur,b_row0);
            }
            if (k_odd) {
               mul_mf1x1tmf1x1_mf1x1(acc0,a_cur,b_row0);
            }
            STORE1x1(acc0,c_tile,Cstride);
         }

         a_row += Astride;
      }
   }
}