VMOVAPS(ZMM(30), ZMM(2))
    VMOVAPS(ZMM(31), ZMM(2))

    SUB(RSI, IMM(38))
    JLE(TAIL)

    //prefetch C into L2
#if SCATTER_PREFETCH_C
    VPBROADCASTD(ZMM(0), R12D)
    VPBROADCASTD(ZMM(1), R12D)
    VPMULLD(ZMM(0), ZMM(0), MEM(RDI))
    VPMULLD(ZMM(1), ZMM(1), MEM(RDI,64))
    ADD(RSI, IMM(30))
    KXNORW(K(1), K(0), K(0))
    KSHIFTRW(K(2), K(1), IMM(2))
    VSCATTERPFDPS(1, MEM(RCX,ZMM(0),8) MASK_K(1))
    VSCATTERPFDPS(1, MEM(RCX,ZMM(1),8) MASK_K(2))
    VMOVAPD(ZMM(0), MEM(RBX))
#else
    PREFETCHW1(MEM(RCX      ))
    SUBITER( 0,1,0,RAX      )
    PREFETCHW1(MEM(RCX,R12,1))
    SUBITER( 1,0,1,RAX      )
    PREFETCHW1(MEM(RCX,R12,2))
    SUBITER( 2,1,0,RAX      )
    PREFETCHW1(MEM(RCX,R13,1))
    SUBITER( 3,0,1,RAX      )
    PREFETCHW1(MEM(RCX,R12,4))
    SUBITER( 4,1,0,RAX,R8, 1)
    PREFETCHW1(MEM(RCX,R14,1))
    SUBITER( 5,0,1,RAX,R8, 1)
    // Register/pointer setup ahead of the .DLOOPKITER loop. Two independent
    // instruction streams are written side by side on each source line:
    //   left column : copies ZMM8 into ZMM9..ZMM31 (ZMM8 itself is written
    //                 before this fragment -- presumably zeroed; TODO confirm).
    //   right column: loads RBX from VAR(b) and RCX from VAR(c), preloads
    //                 ZMM0/ZMM1 from RBX, issues scatter prefetches relative
    //                 to RCX, and computes RSI from VAR(k).
    // NOTE(review): RAX is used as a base address below but is not written
    // anywhere in this fragment.
    VMOVAPD(ZMM( 9), ZMM(8))        MOV(RBX, VAR(b))
    VMOVAPD(ZMM(10), ZMM(8))        //no ADD(RBX, IMM(4*64))
    VMOVAPD(ZMM(11), ZMM(8))        //maybe? PREFETCH(0, MEM(RAX, 0))
    VMOVAPD(ZMM(12), ZMM(8))        //maybe? PREFETCH(0, MEM(RAX,64))
    // ZMM0/ZMM1 = the two 64-byte vectors at RBX+0 and RBX+64.
    VMOVAPD(ZMM(13), ZMM(8))        VMOVAPD(ZMM(0), MEM(RBX,0*64))
    VMOVAPD(ZMM(14), ZMM(8))        VMOVAPD(ZMM(1), MEM(RBX,1*64))
    // RCX = VAR(c); RDI keeps a copy of the same value.
    VMOVAPD(ZMM(15), ZMM(8))        MOV(RCX, VAR(c))
    VMOVAPD(ZMM(16), ZMM(8))        MOV(RDI, RCX)
    // ZMM4 = broadcast(32-bit cs_c) * ZMM5, multiplied per lane as 32-bit
    // integers; ZMM5 is a full vector loaded through VAR(offsetPtr).
    // The product is used as the index vector of the scatter prefetches.
    VMOVAPD(ZMM(17), ZMM(8))        VBROADCASTSS(ZMM(4), VAR(cs_c))
    VMOVAPD(ZMM(18), ZMM(8))        VMOVAPS(ZMM(5), VAR(offsetPtr))
    VMOVAPD(ZMM(19), ZMM(8))        VPMULLD(ZMM(4), ZMM(5), ZMM(4))
    // K1..K3 = 0xFFF (low 12 mask bits set). One separate mask register is
    // prepared per prefetch -- presumably because each prefetch consumes its
    // mask; verify against the instruction reference.
    VMOVAPD(ZMM(20), ZMM(8))        MOV(RDX, IMM(0xFFF))
    VMOVAPD(ZMM(21), ZMM(8))        KMOV(K(1), EDX)
    VMOVAPD(ZMM(22), ZMM(8))        KMOV(K(2), EDX)
    VMOVAPD(ZMM(23), ZMM(8))        KMOV(K(3), EDX)
    // Masked scatter prefetches at RCX + ZMM4[lane]*8 with byte displacements
    // 0, 8*8 = 64 and 15*8 = 120.
    VMOVAPD(ZMM(24), ZMM(8))        VSCATTERPFDPS(0, MEM(RCX,ZMM(4),8,   0) MASK_K(1))
    VMOVAPD(ZMM(25), ZMM(8))        VSCATTERPFDPS(0, MEM(RCX,ZMM(4),8, 8*8) MASK_K(2))
    VMOVAPD(ZMM(26), ZMM(8))        VSCATTERPFDPS(0, MEM(RCX,ZMM(4),8,15*8) MASK_K(3))
    // RSI = VAR(k) arithmetically shifted right by 2.
    VMOVAPD(ZMM(27), ZMM(8))        MOV(RSI, VAR(k))
    VMOVAPD(ZMM(28), ZMM(8))        SAR(RSI, IMM(2)) // rsi = k/4
    VMOVAPD(ZMM(29), ZMM(8))
    VMOVAPD(ZMM(30), ZMM(8))
    VMOVAPD(ZMM(31), ZMM(8))

    // Branch to .DCONSIDKLEFT when the SAR above produced zero. This relies on
    // the flags surviving the three VMOVAPD lines in between (vector moves do
    // not write RFLAGS, assuming the macros expand to the plain instructions).
    JZ(.DCONSIDKLEFT)

    ALIGN16
    LABEL(.DLOOPKITER)

    // Broadcast the two consecutive 8-byte elements at RAX+0 and RAX+8 into
    // every lane of ZMM2 and ZMM3 respectively.
    VBROADCASTSD(ZMM(2), MEM(RAX, 0*8)) // Iteration 0
    VBROADCASTSD(ZMM(3), MEM(RAX, 1*8))
Example #3
0
        // PACK30_G: gather 30 doubles per iteration from RAX through 32-bit
        // indices, multiply them by ZMM31 and store them contiguously at R15.
        // RAX, R15, RCX, RDI and ZMM31 are set up outside this fragment, and the
        // loop's exit test lies past the end of it.
        LABEL(PACK30_G)

            // Build the index vectors once, before the loop:
            //   YMM0..YMM3 = inca * (eight 32-bit values at RBX+0/32/64/96),
            // where RBX is loaded from VAR(offsetPtr). ZMM4 only holds the
            // broadcast inca here; it is reused as a gather destination below.
            VPBROADCASTD(ZMM(4), VAR(inca))
            MOV(RBX, VAR(offsetPtr))
            VPMULLD(YMM(0), YMM(4), MEM(RBX, 0))
            VPMULLD(YMM(1), YMM(4), MEM(RBX,32))
            VPMULLD(YMM(2), YMM(4), MEM(RBX,64))
            VPMULLD(YMM(3), YMM(4), MEM(RBX,96))

            LABEL(PACK30_G_LOOP)

                // Masks are rebuilt on every iteration (a gather clears its
                // mask register as elements complete). XNOR of a register with
                // itself yields all ones, so K1..K3 enable every lane; K4 is
                // all ones shifted right by 10, i.e. only the low 6 bits.
                KXNORW(K(1), K(0), K(0))
                KXNORW(K(2), K(0), K(0))
                KXNORW(K(3), K(0), K(0))
                KSHIFTRW(K(4), K(3), IMM(10))
                // Gather 8 + 8 + 8 + 6 = 30 doubles from RAX + index*8.
                VGATHERDPD(ZMM(4) MASK_K(1), MEM(RAX,YMM(0),8))
                VGATHERDPD(ZMM(5) MASK_K(2), MEM(RAX,YMM(1),8))
                VGATHERDPD(ZMM(6) MASK_K(3), MEM(RAX,YMM(2),8))
                VGATHERDPD(ZMM(7) MASK_K(4), MEM(RAX,YMM(3),8))
                // Multiply every gathered vector by ZMM31 (presumably a
                // broadcast scaling factor -- set outside this fragment).
                VMULPD(ZMM(4), ZMM(4), ZMM(31))
                VMULPD(ZMM(5), ZMM(5), ZMM(31))
                VMULPD(ZMM(6), ZMM(6), ZMM(31))
                VMULPD(ZMM(7), ZMM(7), ZMM(31))
                // Store four full 64-byte vectors (32 doubles) at R15.
                // NOTE(review): lanes 6-7 of ZMM7 are never gathered, so the
                // last two stored doubles are not freshly loaded data --
                // presumably padding; confirm against the consumer.
                VMOVUPD(MEM(R15,  0), ZMM(4))
                VMOVUPD(MEM(R15, 64), ZMM(5))
                VMOVUPD(MEM(R15,128), ZMM(6))
                VMOVUPD(MEM(R15,192), ZMM(7))

                // Advance the source pointer by RCX and the destination by RDI.
                LEA(RAX, MEM(RAX,RCX,1))
                LEA(R15, MEM(R15,RDI,1))
Example #4
0
#ifdef MONITORS
    RDTSC
    MOV(VAR(midl), EAX)
    MOV(VAR(midh), EDX)
#endif

    SUB(RSI, IMM(32))
    JLE(TAIL)

    //prefetch C into L2
#if SCATTER_PREFETCH_C
    ADD(RSI, IMM(24))
    KXNORW(K(1), K(0), K(0))
    KXNORW(K(2), K(0), K(0))
    VSCATTERPFDPS(1, MEM(RCX,ZMM(2),8) MASK_K(1))
    VSCATTERPFDPD(1, MEM(RCX,YMM(3),8) MASK_K(2))
#else
    PREFETCHW1(MEM(RCX      ))
    SUBITER( 0,1,0,RAX      )
    PREFETCHW1(MEM(RCX,R12,1))
    SUBITER( 1,0,1,RAX      )
    PREFETCHW1(MEM(RCX,R12,2))
    SUBITER( 2,1,0,RAX      )
    PREFETCHW1(MEM(RCX,R13,1))
    SUBITER( 3,0,1,RAX      )
    PREFETCHW1(MEM(RCX,R12,4))
    SUBITER( 4,1,0,RAX,R8, 1)
    PREFETCHW1(MEM(RCX,R14,1))
    SUBITER( 5,0,1,RAX,R8, 1)
    PREFETCHW1(MEM(RCX,R13,2))