/*
 * Walk over C in steps of four rows and four columns and hand each
 * (row, col) position to AddDot4x4.
 *
 *   m, n  - loop bounds for the row and column sweeps over C
 *   k     - forwarded unchanged to AddDot4x4
 *   a, b, c, lda, ldb, ldc
 *         - not named directly in the body; presumably consumed by the
 *           A(), B() and C() element macros, which are defined outside
 *           this block (TODO confirm their indexing convention)
 *
 * NOTE(review): both loops advance by 4 with no remainder handling, so
 * this presumably requires m and n to be multiples of 4 -- confirm
 * against AddDot4x4.
 */
void MY_MMult( int m, int n, int k, double *a, int lda,
                                    double *b, int ldb,
                                    double *c, int ldc )
{
  for ( int col = 0; col < n; col += 4 ) {      /* columns of C, four at a time */
    for ( int row = 0; row < m; row += 4 ) {    /* rows of C, four at a time */
      /* AddDot4x4 is defined elsewhere; judging by its name and the
         stride of both loops it presumably updates the 4x4 block of C
         whose top-left element is C( row, col ) -- confirm. */
      AddDot4x4( k, &A( row, 0 ), lda, &B( 0, col ), ldb, &C( row, col ), ldc );
    }
  }
}
/*
 * Sweep over C in steps of four rows and four columns, packing pieces
 * of A and B into local buffers and calling AddDot4x4 on the packed data.
 *
 *   m, n        - loop bounds for the row and column sweeps over C
 *   k           - forwarded to PackMatrixA, PackMatrixB and AddDot4x4, and
 *                 used as the per-index stride into both packed buffers
 *   a, b, c, lda, ldb, ldc
 *               - not named directly in the body; presumably consumed by
 *                 the A(), B() and C() element macros defined outside
 *                 this block (TODO confirm)
 *   first_time  - when non-zero, B is packed into packedB during this call;
 *                 when zero, packing is skipped and whatever packedB
 *                 already holds is used
 *
 * NOTE(review): both loops advance by 4 with no remainder handling, so
 * this presumably requires m and n to be multiples of 4 -- confirm.
 * NOTE(review): packedB is indexed at col*k for col < n, so the caller
 * presumably guarantees n*k <= kc*nb -- confirm.
 */
void InnerKernel( int m, int n, int k, double *a, int lda,
                                       double *b, int ldb,
                                       double *c, int ldc, int first_time )
{
  /* Variable-length array with automatic storage: m*k doubles per call. */
  double packedA[ m * k ];

  /* Static storage, so the contents persist from one call to the next;
     that is what allows packing to be skipped when first_time is zero.
     Note: using a static buffer is not thread safe... */
  static double packedB[ kc*nb ];

  for ( int col = 0; col < n; col += 4 ) {      /* columns of C, four at a time */
    if ( first_time ) {
      PackMatrixB( k, &B( 0, col ), ldb, &packedB[ col*k ] );
    }

    for ( int row = 0; row < m; row += 4 ) {    /* rows of C, four at a time */
      /* A is packed only during the first column pass; every later
         pass reuses the data already stored in packedA. */
      if ( col == 0 ) {
        PackMatrixA( k, &A( row, 0 ), lda, &packedA[ row*k ] );
      }

      /* The packed operands are passed with leading dimensions 4 (for
         packedA) and k (for packedB) instead of lda and ldb. */
      AddDot4x4( k, &packedA[ row*k ], 4, &packedB[ col*k ], k, &C( row, col ), ldc );
    }
  }
}