/*
 * Fragment: tail of the accumulator set-up, loop-count check, and the
 * prefetch of C before the main loop. The enclosing asm block, the macro
 * definitions and the closing endif of the conditional are outside this
 * fragment.
 *
 * Reading the tokens in order (assuming each macro expands to the x86 /
 * AVX-512 instruction of the same name with destination-first operands --
 * TODO confirm against the macro header):
 *
 *   1. VMOVAPS with ZMM(30) and ZMM(31) as first operand and ZMM(2) as
 *      second: presumably seeds the last two accumulators from ZMM2.
 *   2. SUB(RSI, IMM(38)) then JLE(TAIL): branch to TAIL when the result of
 *      the subtraction is <= 0. RSI is loaded outside this fragment;
 *      presumably it is the k iteration count.
 *   3. Existing comment: "prefetch C into L2".
 *   4. When SCATTER_PREFETCH_C is true:
 *        - broadcast R12D into ZMM0 and ZMM1, then VPMULLD each with the
 *          dwords at [RDI] and [RDI+64] (32-bit integer multiplies);
 *        - ADD(RSI, IMM(30)), leaving RSI 8 below its value before step 2;
 *        - K1 = KXNORW(K0, K0) (all 16 bits set), K2 = K1 shifted right
 *          by 2 (low 14 bits set);
 *        - two VSCATTERPFDPS with first argument 1, base RCX, index
 *          ZMM0 / ZMM1, scale 8, masked by K1 / K2;
 *        - VMOVAPD(ZMM(0), MEM(RBX)): loads ZMM0 from [RBX], presumably
 *          restoring it after its use as an index vector.
 *   5. Otherwise: PREFETCHW1 on RCX, RCX+R12, RCX+2*R12, RCX+R13,
 *      RCX+4*R12, RCX+R14, each followed by a SUBITER(...) expansion for
 *      sub-iterations 0 through 5. SUBITER is defined elsewhere.
 *      Presumably R13 = 3*R12 and R14 = 5*R12 -- verify where they are set.
 *
 * NOTE(review): in this view the whole fragment sits on one physical line.
 * If that matches the file on disk, everything after the line-comment
 * marker is commented out and the conditional keywords are not at line
 * start, so they are not seen as directives. Confirm, and restore the
 * original line breaks if so.
 */
VMOVAPS(ZMM(30), ZMM(2)) VMOVAPS(ZMM(31), ZMM(2)) SUB(RSI, IMM(38)) JLE(TAIL) //prefetch C into L2 #if SCATTER_PREFETCH_C VPBROADCASTD(ZMM(0), R12D) VPBROADCASTD(ZMM(1), R12D) VPMULLD(ZMM(0), ZMM(0), MEM(RDI)) VPMULLD(ZMM(1), ZMM(1), MEM(RDI,64)) ADD(RSI, IMM(30)) KXNORW(K(1), K(0), K(0)) KSHIFTRW(K(2), K(1), IMM(2)) VSCATTERPFDPS(1, MEM(RCX,ZMM(0),8) MASK_K(1)) VSCATTERPFDPS(1, MEM(RCX,ZMM(1),8) MASK_K(2)) VMOVAPD(ZMM(0), MEM(RBX)) #else PREFETCHW1(MEM(RCX )) SUBITER( 0,1,0,RAX ) PREFETCHW1(MEM(RCX,R12,1)) SUBITER( 1,0,1,RAX ) PREFETCHW1(MEM(RCX,R12,2)) SUBITER( 2,1,0,RAX ) PREFETCHW1(MEM(RCX,R13,1)) SUBITER( 3,0,1,RAX ) PREFETCHW1(MEM(RCX,R12,4)) SUBITER( 4,1,0,RAX,R8, 1) PREFETCHW1(MEM(RCX,R14,1)) SUBITER( 5,0,1,RAX,R8, 1)
/*
 * Fragment: accumulator initialisation interleaved with operand set-up,
 * followed by the entry to the main k loop (.DLOOPKITER). The start of the
 * asm block (including where ZMM8 and RAX are set) and the loop body beyond
 * its first two instructions are outside this fragment.
 *
 * The tokens alternate between two independent streams, presumably so that
 * the register copies overlap with the scalar and memory work:
 *
 *   Stream A: VMOVAPD(ZMM(n), ZMM(8)) for n = 9 .. 31, i.e. every
 *             accumulator is copied from ZMM8 (assuming destination-first
 *             operand order -- TODO confirm against the macro header).
 *
 *   Stream B, in order:
 *     - RBX = b. The two "no ADD" / "maybe? PREFETCH" notes are
 *       commented-out candidates left by the author, not live code.
 *     - ZMM0, ZMM1 = the 64-byte vectors at [RBX] and [RBX+64].
 *     - RCX = c, RDI = RCX.
 *     - ZMM4 = VBROADCASTSS of cs_c, ZMM5 = VMOVAPS from offsetPtr,
 *       ZMM4 = VPMULLD(ZMM5, ZMM4): per-lane 32-bit products.
 *       NOTE(review): a 32-bit broadcast and multiply use only the low
 *       32 bits of cs_c -- confirm the stride always fits.
 *     - RDX = 0xFFF, copied via EDX into K1, K2, K3 (low 12 bits set).
 *     - Three VSCATTERPFDPS with first argument 0, base RCX, index ZMM4,
 *       scale 8 and displacements 0, 8*8 and 15*8 bytes, each consuming
 *       its own mask register.
 *     - RSI = k, then SAR by 2; the existing comment records rsi = k/4.
 *     - JZ(.DCONSIDKLEFT): taken when the shifted count is zero. This
 *       relies on the intervening VMOVAPD copies leaving the flags set by
 *       SAR untouched (true for the x86 instruction of that name).
 *
 *   Then ALIGN16, LABEL(.DLOOPKITER) and the first two VBROADCASTSD of
 *   "Iteration 0", reading the 8-byte elements at [RAX] and [RAX+8] into
 *   ZMM2 and ZMM3.
 *
 * NOTE(review): in this view the whole fragment sits on one physical line.
 * If that matches the file on disk, everything after the first
 * line-comment marker is commented out. The token order is consistent with
 * a two-column layout, one stream-A and one stream-B item per line; confirm
 * and restore the original line breaks if so.
 */
VMOVAPD(ZMM( 9), ZMM(8)) MOV(RBX, VAR(b)) VMOVAPD(ZMM(10), ZMM(8)) //no ADD(RBX, IMM(4*64)) VMOVAPD(ZMM(11), ZMM(8)) //maybe? PREFETCH(0, MEM(RAX, 0)) VMOVAPD(ZMM(12), ZMM(8)) //maybe? PREFETCH(0, MEM(RAX,64)) VMOVAPD(ZMM(13), ZMM(8)) VMOVAPD(ZMM(0), MEM(RBX,0*64)) VMOVAPD(ZMM(14), ZMM(8)) VMOVAPD(ZMM(1), MEM(RBX,1*64)) VMOVAPD(ZMM(15), ZMM(8)) MOV(RCX, VAR(c)) VMOVAPD(ZMM(16), ZMM(8)) MOV(RDI, RCX) VMOVAPD(ZMM(17), ZMM(8)) VBROADCASTSS(ZMM(4), VAR(cs_c)) VMOVAPD(ZMM(18), ZMM(8)) VMOVAPS(ZMM(5), VAR(offsetPtr)) VMOVAPD(ZMM(19), ZMM(8)) VPMULLD(ZMM(4), ZMM(5), ZMM(4)) VMOVAPD(ZMM(20), ZMM(8)) MOV(RDX, IMM(0xFFF)) VMOVAPD(ZMM(21), ZMM(8)) KMOV(K(1), EDX) VMOVAPD(ZMM(22), ZMM(8)) KMOV(K(2), EDX) VMOVAPD(ZMM(23), ZMM(8)) KMOV(K(3), EDX) VMOVAPD(ZMM(24), ZMM(8)) VSCATTERPFDPS(0, MEM(RCX,ZMM(4),8, 0) MASK_K(1)) VMOVAPD(ZMM(25), ZMM(8)) VSCATTERPFDPS(0, MEM(RCX,ZMM(4),8, 8*8) MASK_K(2)) VMOVAPD(ZMM(26), ZMM(8)) VSCATTERPFDPS(0, MEM(RCX,ZMM(4),8,15*8) MASK_K(3)) VMOVAPD(ZMM(27), ZMM(8)) MOV(RSI, VAR(k)) VMOVAPD(ZMM(28), ZMM(8)) SAR(RSI, IMM(2)) // rsi = k/4 VMOVAPD(ZMM(29), ZMM(8)) VMOVAPD(ZMM(30), ZMM(8)) VMOVAPD(ZMM(31), ZMM(8)) JZ(.DCONSIDKLEFT) ALIGN16 LABEL(.DLOOPKITER) VBROADCASTSD(ZMM(2), MEM(RAX, 0*8)) // Iteration 0 VBROADCASTSD(ZMM(3), MEM(RAX, 1*8))
/*
 * Fragment: PACK30_G path -- gather-based packing, one iteration per trip
 * through PACK30_G_LOOP. The loop counter update and back-branch, and the
 * set-up of RAX, RCX, RDI, R15 and ZMM31, are outside this fragment.
 * Operand roles below assume destination-first macros that expand to the
 * same-named AVX-512 instructions -- TODO confirm against the macro header.
 *
 * Set-up (once, at PACK30_G):
 *   - ZMM4 = VPBROADCASTD of inca (32-bit broadcast).
 *   - RBX  = offsetPtr.
 *   - YMM0..YMM3 = VPMULLD(YMM4, dwords at RBX+0 / +32 / +64 / +96):
 *     four vectors of eight 32-bit gather indices, each a table entry
 *     multiplied by inca. Presumably the table holds 0, 1, 2, ... --
 *     verify at its definition.
 *
 * Loop body (PACK30_G_LOOP):
 *   - K1, K2, K3 = KXNORW(K0, K0), i.e. all 16 bits set. K4 = K3 shifted
 *     right by 10, leaving the low 6 bits set. The masks are rebuilt every
 *     iteration because a gather clears its mask register as it completes.
 *   - Four VGATHERDPD from base RAX, index YMM0..YMM3, scale 8, into
 *     ZMM4..ZMM7. With 8 lanes per gather and only 6 enabled in the last
 *     one, 8 + 8 + 8 + 6 = 30 elements are read, matching the "30" in the
 *     label. ZMM4 is reused as a gather destination here; its broadcast
 *     value was only needed during set-up.
 *   - Each of ZMM4..ZMM7 is multiplied by ZMM31 (VMULPD). Presumably ZMM31
 *     holds a broadcast scaling factor -- confirm where it is loaded.
 *   - Four full-width unaligned stores to R15+0 / +64 / +128 / +192.
 *     NOTE(review): the fourth store writes all 8 lanes although only 6
 *     were gathered, so the destination row must be at least 32 elements
 *     wide. The two extra lanes hold whatever ZMM7 contained before the
 *     masked gather, times ZMM31. Confirm this padding is intended.
 *   - RAX += RCX and R15 += RDI advance the source and destination
 *     pointers. Presumably byte strides -- verify at their set-up.
 *
 * NOTE(review): in this view the whole fragment sits on one physical line;
 * confirm whether the file on disk has one instruction per line.
 */
LABEL(PACK30_G) VPBROADCASTD(ZMM(4), VAR(inca)) MOV(RBX, VAR(offsetPtr)) VPMULLD(YMM(0), YMM(4), MEM(RBX, 0)) VPMULLD(YMM(1), YMM(4), MEM(RBX,32)) VPMULLD(YMM(2), YMM(4), MEM(RBX,64)) VPMULLD(YMM(3), YMM(4), MEM(RBX,96)) LABEL(PACK30_G_LOOP) KXNORW(K(1), K(0), K(0)) KXNORW(K(2), K(0), K(0)) KXNORW(K(3), K(0), K(0)) KSHIFTRW(K(4), K(3), IMM(10)) VGATHERDPD(ZMM(4) MASK_K(1), MEM(RAX,YMM(0),8)) VGATHERDPD(ZMM(5) MASK_K(2), MEM(RAX,YMM(1),8)) VGATHERDPD(ZMM(6) MASK_K(3), MEM(RAX,YMM(2),8)) VGATHERDPD(ZMM(7) MASK_K(4), MEM(RAX,YMM(3),8)) VMULPD(ZMM(4), ZMM(4), ZMM(31)) VMULPD(ZMM(5), ZMM(5), ZMM(31)) VMULPD(ZMM(6), ZMM(6), ZMM(31)) VMULPD(ZMM(7), ZMM(7), ZMM(31)) VMOVUPD(MEM(R15, 0), ZMM(4)) VMOVUPD(MEM(R15, 64), ZMM(5)) VMOVUPD(MEM(R15,128), ZMM(6)) VMOVUPD(MEM(R15,192), ZMM(7)) LEA(RAX, MEM(RAX,RCX,1)) LEA(R15, MEM(R15,RDI,1))
/*
 * Fragment: optional timestamp capture, loop-count check, and the prefetch
 * of C before the main loop. The enclosing asm block, the macro
 * definitions, the set-up of RSI / RCX / R12..R14 / ZMM2 / YMM3 and the
 * closing endif of the prefetch conditional are outside this fragment.
 * Operand roles assume destination-first macros that expand to the
 * same-named x86 / AVX-512 instructions -- TODO confirm.
 *
 *   1. When MONITORS is defined: RDTSC, then EAX is stored to midl and EDX
 *      to midh, i.e. the low and high halves of the time-stamp counter at
 *      this point in the kernel.
 *   2. SUB(RSI, IMM(32)) then JLE(TAIL): branch to TAIL when the result of
 *      the subtraction is <= 0. Presumably RSI is the k iteration count.
 *   3. Existing comment: "prefetch C into L2".
 *   4. When SCATTER_PREFETCH_C is true:
 *        - ADD(RSI, IMM(24)), leaving RSI 8 below its value before step 2;
 *        - K1 and K2 = KXNORW(K0, K0), all bits set;
 *        - VSCATTERPFDPS with first argument 1, base RCX, index ZMM2,
 *          scale 8, mask K1;
 *        - VSCATTERPFDPD with first argument 1, base RCX, index YMM3,
 *          scale 8, mask K2.
 *          NOTE(review): the two prefetches use different element-size
 *          variants (PS with a ZMM index, PD with a YMM index); presumably
 *          deliberate, to cover 16 + 8 = 24 index lanes -- confirm.
 *   5. Otherwise: PREFETCHW1 on RCX, RCX+R12, RCX+2*R12, RCX+R13,
 *      RCX+4*R12, RCX+R14, each followed by a SUBITER(...) expansion for
 *      sub-iterations 0 through 5, then PREFETCHW1 on RCX+2*R13. SUBITER is
 *      defined elsewhere. Presumably R13 = 3*R12 and R14 = 5*R12 -- verify
 *      where they are set. The sequence continues past this fragment.
 *
 * NOTE(review): in this view the whole fragment sits on one physical line.
 * If that matches the file on disk, the conditional keywords share a line
 * with instructions and everything after the line-comment marker is
 * commented out. Confirm, and restore the original line breaks if so.
 */
#ifdef MONITORS RDTSC MOV(VAR(midl), EAX) MOV(VAR(midh), EDX) #endif SUB(RSI, IMM(32)) JLE(TAIL) //prefetch C into L2 #if SCATTER_PREFETCH_C ADD(RSI, IMM(24)) KXNORW(K(1), K(0), K(0)) KXNORW(K(2), K(0), K(0)) VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1)) VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2)) #else PREFETCHW1(MEM(RCX )) SUBITER( 0,1,0,RAX ) PREFETCHW1(MEM(RCX,R12,1)) SUBITER( 1,0,1,RAX ) PREFETCHW1(MEM(RCX,R12,2)) SUBITER( 2,1,0,RAX ) PREFETCHW1(MEM(RCX,R13,1)) SUBITER( 3,0,1,RAX ) PREFETCHW1(MEM(RCX,R12,4)) SUBITER( 4,1,0,RAX,R8, 1) PREFETCHW1(MEM(RCX,R14,1)) SUBITER( 5,0,1,RAX,R8, 1) PREFETCHW1(MEM(RCX,R13,2))