// POSTACCUM (double-precision variant): post-accumulation update of C.
//
// Steps performed by the instruction sequence below, in order:
//   1. Load the pointers VAR(alpha) / VAR(beta) into RAX / RBX and broadcast
//      the double each one points to into ZMM(0) (alpha) and ZMM(1) (beta).
//   2. Load rs_c into RAX and scale it by 8 (presumably sizeof(double), i.e.
//      a row stride in bytes). RDI is then set to RAX + 2*RAX = 3*RAX,
//      assuming MEM(base, index, scale) operand order -- TODO confirm against
//      the MEM macro. RDI is not read by any instruction visible here; it is
//      presumably consumed inside the UPDATE_C_* macros.
//   3. Compare cs_c with 1; if it differs, jump to SCATTEREDUPDATE (label is
//      outside this view).
//   4. Copy the low 64 bits of the broadcast beta into RDX and shift the sign
//      bit out. If nothing remains (beta is +0.0 or -0.0) jump to COLSTORBZ;
//      this relies on the shift setting ZF.
//      NOTE(review): this sequence uses SAL1 while the single-precision
//      sequence in this file uses SAL; both macros are defined elsewhere --
//      confirm each one performs a shift-left by exactly one.
//   5. Otherwise run the UPDATE_C_FOUR_ROWS / UPDATE_C_TWO_ROWS macros over
//      registers 2..31 (7 groups of four plus one pair = 30 registers) and
//      jump to END.
//
// The COLSTORBZ label closes this sequence; its body lies beyond this view.
LABEL(POSTACCUM) MOV(RAX, VAR(alpha)) MOV(RBX, VAR(beta)) VBROADCASTSD(ZMM(0), MEM(RAX)) VBROADCASTSD(ZMM(1), MEM(RBX)) // Check if C is row stride. If not, jump to the slow scattered update MOV(RAX, VAR(rs_c)) LEA(RAX, MEM(,RAX,8)) MOV(RBX, VAR(cs_c)) LEA(RDI, MEM(RAX,RAX,2)) CMP(RBX, IMM(1)) JNE(SCATTEREDUPDATE) VMOVQ(RDX, XMM(1)) SAL1(RDX) //shift out sign bit JZ(COLSTORBZ) UPDATE_C_FOUR_ROWS( 2, 3, 4, 5) UPDATE_C_FOUR_ROWS( 6, 7, 8, 9) UPDATE_C_FOUR_ROWS(10,11,12,13) UPDATE_C_FOUR_ROWS(14,15,16,17) UPDATE_C_FOUR_ROWS(18,19,20,21) UPDATE_C_FOUR_ROWS(22,23,24,25) UPDATE_C_FOUR_ROWS(26,27,28,29) UPDATE_C_TWO_ROWS (30,31) JMP(END) LABEL(COLSTORBZ)
// Single-precision post-accumulation update of C. The leading #endif closes
// a preprocessor conditional whose opening directive is outside this view.
//
// Steps performed by the instruction sequence below, in order:
//   1. Load the pointers VAR(alpha) / VAR(beta) into RAX / RBX and broadcast
//      the 32-bit float each one points to into ZMM(0) (alpha) and ZMM(1)
//      (beta).
//   2. Load rs_c into RAX and scale it by 4 (presumably sizeof(float), i.e.
//      a row stride in bytes). RDI is then set to RAX + 2*RAX = 3*RAX,
//      assuming MEM(base, index, scale) operand order -- TODO confirm against
//      the MEM macro. RDI is not read by any instruction visible here; it is
//      presumably consumed inside the UPDATE_C_* macros.
//   3. Compare cs_c with 1; if it differs, jump to SCATTEREDUPDATE (label is
//      outside this view).
//   4. Copy the low 32 bits of the broadcast beta into EDX and shift the sign
//      bit out. If nothing remains (beta is +0.0 or -0.0) jump to COLSTORBZ;
//      this relies on the shift setting ZF.
//      NOTE(review): SAL is invoked with a single operand here; the macro is
//      defined elsewhere -- confirm it expands to a shift-left by one.
//   5. Otherwise run UPDATE_C_FOUR_ROWS over registers 8..31 (6 groups of
//      four = 24 registers) and jump to END.
//
// COLSTORBZ is the beta == 0 path. Only its first macro call,
// UPDATE_C_BZ_FOUR_ROWS on registers 8..11, is visible; the rest continues
// beyond this view.
#endif MOV(RAX, VAR(alpha)) MOV(RBX, VAR(beta)) VBROADCASTSS(ZMM(0), MEM(RAX)) VBROADCASTSS(ZMM(1), MEM(RBX)) // Check if C is row stride. If not, jump to the slow scattered update MOV(RAX, VAR(rs_c)) LEA(RAX, MEM(,RAX,4)) MOV(RBX, VAR(cs_c)) LEA(RDI, MEM(RAX,RAX,2)) CMP(RBX, IMM(1)) JNE(SCATTEREDUPDATE) VMOVD(EDX, XMM(1)) SAL(EDX) //shift out sign bit JZ(COLSTORBZ) UPDATE_C_FOUR_ROWS( 8, 9,10,11) UPDATE_C_FOUR_ROWS(12,13,14,15) UPDATE_C_FOUR_ROWS(16,17,18,19) UPDATE_C_FOUR_ROWS(20,21,22,23) UPDATE_C_FOUR_ROWS(24,25,26,27) UPDATE_C_FOUR_ROWS(28,29,30,31) JMP(END) LABEL(COLSTORBZ) UPDATE_C_BZ_FOUR_ROWS( 8, 9,10,11)