#include "bli_x86_asm_macros.h" void bli_sgemm_penryn_asm_8x4 ( dim_t k0, float* restrict alpha, float* restrict a, float* restrict b, float* restrict beta, float* restrict c, inc_t rs_c0, inc_t cs_c0, auxinfo_t* restrict data, cntx_t* restrict cntx ) { //void* a_next = bli_auxinfo_next_a( data ); void* b_next = bli_auxinfo_next_b( data ); // Typecast local copies of integers in case dim_t and inc_t are a // different size than is expected by load instructions. uint64_t k_iter = k0 / 4; uint64_t k_left = k0 % 4; uint64_t rs_c = rs_c0; uint64_t cs_c = cs_c0; begin_asm() mov(var(a), rax) // load address of a. mov(var(b), rbx) // load address of b. mov(var(b_next), r9) // load address of b_next.
//#define MONITORS //#define LOOPMON void bli_dgemm_asm_30x8 ( dim_t k, double* restrict alpha, double* restrict a, double* restrict b, double* restrict beta, double* restrict c, inc_t rs_c, inc_t cs_c, auxinfo_t* restrict data, cntx_t* restrict cntx ) { double * a_next = bli_auxinfo_next_a( data ); double * b_next = bli_auxinfo_next_b( data ); int * offsetPtr = &offsets[0]; uint64_t k64 = k; #ifdef MONITORS int toph, topl, both, botl, midl, midh, mid2l, mid2h; #endif #ifdef LOOPMON int tlooph, tloopl, blooph, bloopl; #endif __asm { #ifdef MONITORS