Beispiel #1
0
void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const u08b_t *blkPtr,size_t blkCnt,size_t byteCntAdd)
{   /* do it in C */
    enum
    {
        WCNT = SKEIN_512_STATE_WORDS
    };
#undef  RCNT
#define RCNT  (SKEIN_512_ROUNDS_TOTAL/8)

#ifdef  SKEIN_LOOP                              /* configure how much to unroll the loop */
#define SKEIN_UNROLL_512 (((SKEIN_LOOP)/10)%10)
#else
#define SKEIN_UNROLL_512 (0)
#endif

#if SKEIN_UNROLL_512
#if (RCNT % SKEIN_UNROLL_512)
#error "Invalid SKEIN_UNROLL_512"               /* sanity check on unroll count */
#endif
    size_t  r;
    u64b_t  kw[WCNT+4+RCNT*2];                  /* key schedule words : chaining vars + tweak + "rotation"*/
#else
    u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
#endif
    u64b_t  X0,X1,X2,X3,X4,X5,X6,X7;            /* local copy of vars, for speed */
    u64b_t  w [WCNT];                           /* local copy of input block */
#ifdef SKEIN_DEBUG
    const u64b_t *Xptr[8];                      /* use for debugging (help compiler put Xn in registers) */
    Xptr[0] = &X0;
    Xptr[1] = &X1;
    Xptr[2] = &X2;
    Xptr[3] = &X3;
    Xptr[4] = &X4;
    Xptr[5] = &X5;
    Xptr[6] = &X6;
    Xptr[7] = &X7;
#endif

    Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
    ts[0] = ctx->h.T[0];
    ts[1] = ctx->h.T[1];
    do  {
        /* this implementation only supports 2**64 input bytes (no carry out here) */
        ts[0] += byteCntAdd;                    /* update processed length */

        /* precompute the key schedule for this block */
        ks[0] = ctx->X[0];
        ks[1] = ctx->X[1];
        ks[2] = ctx->X[2];
        ks[3] = ctx->X[3];
        ks[4] = ctx->X[4];
        ks[5] = ctx->X[5];
        ks[6] = ctx->X[6];
        ks[7] = ctx->X[7];
        ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
                ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;

        ts[2] = ts[0] ^ ts[1];

        Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */
        DebugSaveTweak(ctx);
        Skein_Show_Block(BLK_BITS,&ctx->h,ctx->X,blkPtr,w,ks,ts);

        X0   = w[0] + ks[0];                    /* do the first full key injection */
        X1   = w[1] + ks[1];
        X2   = w[2] + ks[2];
        X3   = w[3] + ks[3];
        X4   = w[4] + ks[4];
        X5   = w[5] + ks[5] + ts[0];
        X6   = w[6] + ks[6] + ts[1];
        X7   = w[7] + ks[7];

        blkPtr += SKEIN_512_BLOCK_BYTES;

        Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INITIAL,Xptr);
        /* run the rounds */
#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                  \
    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
    X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
    X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \

#if SKEIN_UNROLL_512 == 0
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      /* unrolled */  \
    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,rNum,Xptr);

#define I512(R)                                                     \
    X0   += ks[((R)+1) % 9];   /* inject the key schedule value */  \
    X1   += ks[((R)+2) % 9];                                        \
    X2   += ks[((R)+3) % 9];                                        \
    X3   += ks[((R)+4) % 9];                                        \
    X4   += ks[((R)+5) % 9];                                        \
    X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3];                      \
    X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3];                      \
    X7   += ks[((R)+8) % 9] +     (R)+1;                            \
    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);
#else                                       /* looping version */
#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,4*(r-1)+rNum,Xptr);

#define I512(R)                                                     \
    X0   += ks[r+(R)+0];        /* inject the key schedule value */ \
    X1   += ks[r+(R)+1];                                            \
    X2   += ks[r+(R)+2];                                            \
    X3   += ks[r+(R)+3];                                            \
    X4   += ks[r+(R)+4];                                            \
    X5   += ks[r+(R)+5] + ts[r+(R)+0];                              \
    X6   += ks[r+(R)+6] + ts[r+(R)+1];                              \
    X7   += ks[r+(R)+7] +    r+(R)   ;                              \
    ks[r +       (R)+8] = ks[r+(R)-1];  /* rotate key schedule */   \
    ts[r +       (R)+2] = ts[r+(R)-1];                              \
    Skein_Show_R_Ptr(BLK_BITS,&ctx->h,SKEIN_RND_KEY_INJECT,Xptr);

        for (r=1; r < 2*RCNT; r+=2*SKEIN_UNROLL_512) /* loop thru it */
#endif                         /* end of looped code definitions */
        {
#define R512_8_rounds(R)  /* do 8 full rounds */  \
        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
        I512(2*(R));                              \
        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
        I512(2*(R)+1);        /* and key injection */

            R512_8_rounds( 0);

#define R512_Unroll_R(NN) ((SKEIN_UNROLL_512 == 0 && SKEIN_512_ROUNDS_TOTAL/8 > (NN)) || (SKEIN_UNROLL_512 > (NN)))

#if   R512_Unroll_R( 1)
            R512_8_rounds( 1);
#endif
#if   R512_Unroll_R( 2)
            R512_8_rounds( 2);
#endif
#if   R512_Unroll_R( 3)
            R512_8_rounds( 3);
#endif
#if   R512_Unroll_R( 4)
            R512_8_rounds( 4);
#endif
#if   R512_Unroll_R( 5)
            R512_8_rounds( 5);
#endif
#if   R512_Unroll_R( 6)
            R512_8_rounds( 6);
#endif
#if   R512_Unroll_R( 7)
            R512_8_rounds( 7);
#endif
#if   R512_Unroll_R( 8)
            R512_8_rounds( 8);
#endif
#if   R512_Unroll_R( 9)
            R512_8_rounds( 9);
#endif
#if   R512_Unroll_R(10)
            R512_8_rounds(10);
#endif
#if   R512_Unroll_R(11)
            R512_8_rounds(11);
#endif
#if   R512_Unroll_R(12)
            R512_8_rounds(12);
#endif
#if   R512_Unroll_R(13)
            R512_8_rounds(13);
#endif
#if   R512_Unroll_R(14)
            R512_8_rounds(14);
#endif
#if  (SKEIN_UNROLL_512 > 14)
#error  "need more unrolling in Skein_512_Process_Block"
#endif
        }

        /* do the final "feedforward" xor, update context chaining vars */
        ctx->X[0] = X0 ^ w[0];
        ctx->X[1] = X1 ^ w[1];
        ctx->X[2] = X2 ^ w[2];
        ctx->X[3] = X3 ^ w[3];
        ctx->X[4] = X4 ^ w[4];
        ctx->X[5] = X5 ^ w[5];
        ctx->X[6] = X6 ^ w[6];
        ctx->X[7] = X7 ^ w[7];
        Skein_Show_Round(BLK_BITS,&ctx->h,SKEIN_RND_FEED_FWD,ctx->X);

        ts[1] &= ~SKEIN_T1_FLAG_FIRST;
    }
    while (--blkCnt);
    ctx->h.T[0] = ts[0];
    ctx->h.T[1] = ts[1];
}
Beispiel #2
0
static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const uint8 *blkPtr,size_t blkCnt,size_t byteCntAdd)
{
    __m64  kw[12];                         /* key schedule words : chaining vars + tweak */
    __m64  X0,X1,X2,X3,X4,X5,X6,X7;        /* local copy of vars, for speed */
    __m64  w[8];                          /* local copy of input block */
    __m64  z1;
    __m64  z3;
    __m64  z5;
    __m64  z7;

    ts[0] = ctx->T[0];
    ts[1] = ctx->T[1];
    do {
        ts[0] = _mm_add_si64(ts[0],_mm_set_pi32(0,byteCntAdd));

	z1 = SKEIN_KS_PARITY;
        ks[0] = ctx->X[0];
	z1 = _mm_xor_si64(z1,ks[0]);
        ks[1] = ctx->X[1];
	z1 = _mm_xor_si64(z1,ks[1]);
        ks[2] = ctx->X[2];
	z1 = _mm_xor_si64(z1,ks[2]);
        ks[3] = ctx->X[3];
	z1 = _mm_xor_si64(z1,ks[3]);
        ks[4] = ctx->X[4];
	z1 = _mm_xor_si64(z1,ks[4]);
        ks[5] = ctx->X[5];
	z1 = _mm_xor_si64(z1,ks[5]);
        ks[6] = ctx->X[6];
	z1 = _mm_xor_si64(z1,ks[6]);
        ks[7] = ctx->X[7];
	z1 = _mm_xor_si64(z1,ks[7]);
	ks[8] = z1;

        ts[2] = _mm_xor_si64(ts[0],ts[1]);

        X0 = 0[(__m64 *) blkPtr];
        X1 = 1[(__m64 *) blkPtr];
        X2 = 2[(__m64 *) blkPtr];
        X3 = 3[(__m64 *) blkPtr];
        X4 = 4[(__m64 *) blkPtr];
        X5 = 5[(__m64 *) blkPtr];
        X6 = 6[(__m64 *) blkPtr];
        X7 = 7[(__m64 *) blkPtr];

        w[0] = X0;
        w[1] = X1;
        w[2] = X2;
        w[3] = X3;
        w[4] = X4;
        w[5] = X5;
        w[6] = X6;
        w[7] = X7;

        X0 = _mm_add_si64(X0,ks[0]);
        X1 = _mm_add_si64(X1,ks[1]);
        X2 = _mm_add_si64(X2,ks[2]);
        X3 = _mm_add_si64(X3,ks[3]);
        X4 = _mm_add_si64(X4,ks[4]);
        X5 = _mm_add_si64(X5,_mm_add_si64(ks[5],ts[0]));
        X6 = _mm_add_si64(X6,_mm_add_si64(ks[6],ts[1]));
        X7 = _mm_add_si64(X7,ks[7]);

        blkPtr += 64;

#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
    X##p0 = _mm_add_si64(X##p0,X##p1); \
      X##p2 = _mm_add_si64(X##p2,X##p3); \
        X##p4 = _mm_add_si64(X##p4,X##p5); \
          X##p6 = _mm_add_si64(X##p6,X##p7); \
    z1 = X##p1; \
    X##p1 = _m_psrlqi(X##p1,64-ROT##_0); \
    z1 = _m_psllqi(z1,ROT##_0); \
    X##p1 = _mm_or_si64(X##p1,z1); \
      z3 = X##p3; \
      X##p3 = _m_psrlqi(X##p3,64-ROT##_1); \
      z3 = _m_psllqi(z3,ROT##_1); \
      X##p3 = _mm_or_si64(X##p3,z3); \
        z5 = X##p5; \
        z5 = _m_psllqi(z5,ROT##_2); \
        X##p5 = _m_psrlqi(X##p5,64-ROT##_2); \
        X##p5 = _mm_or_si64(X##p5,z5); \
          z7 = X##p7; \
          X##p7 = _m_psrlqi(X##p7,64-ROT##_3); \
          z7 = _m_psllqi(z7,ROT##_3); \
          X##p7 = _mm_or_si64(X##p7,z7); \
    X##p1 = _mm_xor_si64(X##p1,X##p0); \
      X##p3 = _mm_xor_si64(X##p3,X##p2); \
        X##p5 = _mm_xor_si64(X##p5,X##p4); \
          X##p7 = _mm_xor_si64(X##p7,X##p6); \

#define I512(R)                                                     \
    X0 = _mm_add_si64(X0,ks[((R)+1) % 9]);   /* inject the key schedule value */  \
    X1 = _mm_add_si64(X1,ks[((R)+2) % 9]);                                        \
    X2 = _mm_add_si64(X2,ks[((R)+3) % 9]);                                        \
    X3 = _mm_add_si64(X3,ks[((R)+4) % 9]);                                        \
    X4 = _mm_add_si64(X4,ks[((R)+5) % 9]);                                        \
    X5 = _mm_add_si64(X5,_mm_add_si64(ks[((R)+6) % 9],ts[((R)+1) % 3]));          \
    X6 = _mm_add_si64(X6,_mm_add_si64(ks[((R)+7) % 9],ts[((R)+2) % 3]));          \
    X7 = _mm_add_si64(X7,_mm_add_si64(ks[((R)+8) % 9],_mm_set_pi32(0,(R)+1)));     \

#define R512_8_rounds(R) \
        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
        I512(2*(R));                              \
        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
        I512(2*(R)+1);

        R512_8_rounds( 0);
        R512_8_rounds( 1);
        R512_8_rounds( 2);
        R512_8_rounds( 3);
        R512_8_rounds( 4);
        R512_8_rounds( 5);
        R512_8_rounds( 6);
        R512_8_rounds( 7);
        R512_8_rounds( 8);

        ctx->X[0] = _mm_xor_si64(X0,w[0]);
        ctx->X[1] = _mm_xor_si64(X1,w[1]);
        ctx->X[2] = _mm_xor_si64(X2,w[2]);
        ctx->X[3] = _mm_xor_si64(X3,w[3]);
        ctx->X[4] = _mm_xor_si64(X4,w[4]);
        ctx->X[5] = _mm_xor_si64(X5,w[5]);
        ctx->X[6] = _mm_xor_si64(X6,w[6]);
        ctx->X[7] = _mm_xor_si64(X7,w[7]);

        ts[1] = _mm_and_si64(ts[1],_mm_set_pi32(~(((uint32)  64 ) << 24),~0));
    } while (--blkCnt);
    ctx->T[0] = ts[0];
    ctx->T[1] = ts[1];
}
Beispiel #3
0
static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx, const u08b_t *blkPtr,
                             size_t blkCnt, size_t byteCntAdd) {
  enum {
      WCNT = SKEIN_512_STATE_WORDS
  };
#define RCNT  (SKEIN_512_ROUNDS_TOTAL/8)

  u64b_t  kw[WCNT+4];                         /* key schedule words : chaining vars + tweak */
  u64b_t  X0,X1,X2,X3,X4,X5,X6,X7;            /* local copy of vars, for speed */
  u64b_t  w [WCNT];                           /* local copy of input block */

  Skein_assert(blkCnt != 0);                  /* never call with blkCnt == 0! */
  ts[0] = ctx->h.T[0];
  ts[1] = ctx->h.T[1];
  do  {
        /* this implementation only supports 2**64 input bytes (no carry out here) */
    ts[0] += byteCntAdd;                    /* update processed length */

    /* precompute the key schedule for this block */
    ks[0] = ctx->X[0];
    ks[1] = ctx->X[1];
    ks[2] = ctx->X[2];
    ks[3] = ctx->X[3];
    ks[4] = ctx->X[4];
    ks[5] = ctx->X[5];
    ks[6] = ctx->X[6];
    ks[7] = ctx->X[7];
    ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^
      ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;

    ts[2] = ts[0] ^ ts[1];

    Skein_Get64_LSB_First(w,blkPtr,WCNT); /* get input block in little-endian format */

    X0   = w[0] + ks[0];                    /* do the first full key injection */
    X1   = w[1] + ks[1];
    X2   = w[2] + ks[2];
    X3   = w[3] + ks[3];
    X4   = w[4] + ks[4];
    X5   = w[5] + ks[5] + ts[0];
    X6   = w[6] + ks[6] + ts[1];
    X7   = w[7] + ks[7];

    blkPtr += SKEIN_512_BLOCK_BYTES;

    /* run the rounds */
#define Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                  \
    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
    X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
    X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \

#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)      /* unrolled */  \
    Round512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)

#define I512(R)                                                     \
    X0   += ks[((R)+1) % 9];   /* inject the key schedule value */  \
    X1   += ks[((R)+2) % 9];                                        \
    X2   += ks[((R)+3) % 9];                                        \
    X3   += ks[((R)+4) % 9];                                        \
    X4   += ks[((R)+5) % 9];                                        \
    X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3];                      \
    X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3];                      \
    X7   += ks[((R)+8) % 9] +     (R)+1;

    {

#define R512_8_rounds(R)  /* do 8 full rounds */  \
        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
        I512(2*(R));                              \
        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
        I512(2*(R)+1);        /* and key injection */

      R512_8_rounds( 0);

#define R512_Unroll_R(NN) (SKEIN_512_ROUNDS_TOTAL/8 > (NN))

  #if   R512_Unroll_R( 1)
      R512_8_rounds( 1);
  #endif
  #if   R512_Unroll_R( 2)
      R512_8_rounds( 2);
  #endif
  #if   R512_Unroll_R( 3)
      R512_8_rounds( 3);
  #endif
  #if   R512_Unroll_R( 4)
      R512_8_rounds( 4);
  #endif
  #if   R512_Unroll_R( 5)
      R512_8_rounds( 5);
  #endif
  #if   R512_Unroll_R( 6)
      R512_8_rounds( 6);
  #endif
  #if   R512_Unroll_R( 7)
      R512_8_rounds( 7);
  #endif
  #if   R512_Unroll_R( 8)
      R512_8_rounds( 8);
  #endif
  #if   R512_Unroll_R( 9)
      R512_8_rounds( 9);
  #endif
  #if   R512_Unroll_R(10)
      R512_8_rounds(10);
  #endif
  #if   R512_Unroll_R(11)
      R512_8_rounds(11);
  #endif
  #if   R512_Unroll_R(12)
      R512_8_rounds(12);
  #endif
  #if   R512_Unroll_R(13)
      R512_8_rounds(13);
  #endif
  #if   R512_Unroll_R(14)
      R512_8_rounds(14);
  #endif
    }

    /* do the final "feedforward" xor, update context chaining vars */
    ctx->X[0] = X0 ^ w[0];
    ctx->X[1] = X1 ^ w[1];
    ctx->X[2] = X2 ^ w[2];
    ctx->X[3] = X3 ^ w[3];
    ctx->X[4] = X4 ^ w[4];
    ctx->X[5] = X5 ^ w[5];
    ctx->X[6] = X6 ^ w[6];
    ctx->X[7] = X7 ^ w[7];

    ts[1] &= ~SKEIN_T1_FLAG_FIRST;
  }
  while (--blkCnt);
  ctx->h.T[0] = ts[0];
  ctx->h.T[1] = ts[1];
}
Beispiel #4
0
static void Skein_512_Process_Block(Skein_512_Ctxt_t *ctx,const uint8 *blkPtr,size_t blkCnt,size_t byteCntAdd)
{
    uint64  kw[12];                         /* key schedule words : chaining vars + tweak */
    uint64  X0,X1,X2,X3,X4,X5,X6,X7;        /* local copy of vars, for speed */
    uint64  w[8];                          /* local copy of input block */

    ts[0] = ctx->T[0];
    ts[1] = ctx->T[1];
    do {
        ts[0] += byteCntAdd;

        ks[0] = ctx->X[0];
        ks[1] = ctx->X[1];
        ks[2] = ctx->X[2];
        ks[3] = ctx->X[3];
        ks[4] = ctx->X[4];
        ks[5] = ctx->X[5];
        ks[6] = ctx->X[6];
        ks[7] = ctx->X[7];
        ks[8] = ks[0] ^ ks[1] ^ ks[2] ^ ks[3] ^ 
                ks[4] ^ ks[5] ^ ks[6] ^ ks[7] ^ SKEIN_KS_PARITY;

        ts[2] = ts[0] ^ ts[1];

        Skein_Get64_LSB_First(w,blkPtr,8);

        X0   = w[0] + ks[0];
        X1   = w[1] + ks[1];
        X2   = w[2] + ks[2];
        X3   = w[3] + ks[3];
        X4   = w[4] + ks[4];
        X5   = w[5] + ks[5] + ts[0];
        X6   = w[6] + ks[6] + ts[1];
        X7   = w[7] + ks[7];

        blkPtr += 64;

#define R512(p0,p1,p2,p3,p4,p5,p6,p7,ROT,rNum)                      \
    X##p0 += X##p1; X##p1 = RotL_64(X##p1,ROT##_0); X##p1 ^= X##p0; \
    X##p2 += X##p3; X##p3 = RotL_64(X##p3,ROT##_1); X##p3 ^= X##p2; \
    X##p4 += X##p5; X##p5 = RotL_64(X##p5,ROT##_2); X##p5 ^= X##p4; \
    X##p6 += X##p7; X##p7 = RotL_64(X##p7,ROT##_3); X##p7 ^= X##p6; \

#define I512(R)                                                     \
    X0   += ks[((R)+1) % 9];   /* inject the key schedule value */  \
    X1   += ks[((R)+2) % 9];                                        \
    X2   += ks[((R)+3) % 9];                                        \
    X3   += ks[((R)+4) % 9];                                        \
    X4   += ks[((R)+5) % 9];                                        \
    X5   += ks[((R)+6) % 9] + ts[((R)+1) % 3];                      \
    X6   += ks[((R)+7) % 9] + ts[((R)+2) % 3];                      \
    X7   += ks[((R)+8) % 9] +     (R)+1;                            \

#define R512_8_rounds(R) \
        R512(0,1,2,3,4,5,6,7,R_512_0,8*(R)+ 1);   \
        R512(2,1,4,7,6,5,0,3,R_512_1,8*(R)+ 2);   \
        R512(4,1,6,3,0,5,2,7,R_512_2,8*(R)+ 3);   \
        R512(6,1,0,7,2,5,4,3,R_512_3,8*(R)+ 4);   \
        I512(2*(R));                              \
        R512(0,1,2,3,4,5,6,7,R_512_4,8*(R)+ 5);   \
        R512(2,1,4,7,6,5,0,3,R_512_5,8*(R)+ 6);   \
        R512(4,1,6,3,0,5,2,7,R_512_6,8*(R)+ 7);   \
        R512(6,1,0,7,2,5,4,3,R_512_7,8*(R)+ 8);   \
        I512(2*(R)+1);

        R512_8_rounds( 0);
        R512_8_rounds( 1);
        R512_8_rounds( 2);
        R512_8_rounds( 3);
        R512_8_rounds( 4);
        R512_8_rounds( 5);
        R512_8_rounds( 6);
        R512_8_rounds( 7);
        R512_8_rounds( 8);

        ctx->X[0] = X0 ^ w[0];
        ctx->X[1] = X1 ^ w[1];
        ctx->X[2] = X2 ^ w[2];
        ctx->X[3] = X3 ^ w[3];
        ctx->X[4] = X4 ^ w[4];
        ctx->X[5] = X5 ^ w[5];
        ctx->X[6] = X6 ^ w[6];
        ctx->X[7] = X7 ^ w[7];

        ts[1] &= ~(((uint64)  64 ) << 56);
    } while (--blkCnt);
    ctx->T[0] = ts[0];
    ctx->T[1] = ts[1];
}