/*
 * Overwrite the first laneCount entries of the state array with input
 * words taken from four input streams.
 *
 * states     - state storage, accessed here as an array of V256 with one
 *              entry per lane index.
 * data       - start of the first input stream; stream k (k = 0..3) begins
 *              laneOffset*k*SnP_laneLengthInBytes bytes after data.
 * laneCount  - number of lane indices (starting at 0) to overwrite.
 * laneOffset - distance between the starts of consecutive input streams,
 *              expressed in lanes.
 *
 * For every index below laneCount, the state entry at that index is replaced
 * by a vector built from the 64-bit words found at that same index in each
 * of the four streams. Entries at or above laneCount are not touched.
 */
void KeccakP1600times4_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V256 *laneState = states;
    const UINT64 *src0 = (const UINT64 *)data;
    const UINT64 *src1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    const UINT64 *src2 = (const UINT64 *)(data+laneOffset*2*SnP_laneLengthInBytes);
    const UINT64 *src3 = (const UINT64 *)(data+laneOffset*3*SnP_laneLengthInBytes);
    /* These names are kept as-is: INTLEAVE() is defined outside this function,
     * takes no arguments, and presumably works on exactly these variables
     * -- TODO confirm against its definition. */
    V256 lanes0, lanes1, lanes2, lanes3, lanesL01, lanesL23, lanesH01, lanesH23;
    unsigned int lane = 0;  /* first index not yet written */

    /* Write one state entry from one 64-bit word per stream. */
    #define OVERWRITE_LANE( idx ) \
        STORE256( laneState[(idx)], LOAD4_64( src3[(idx)], src2[(idx)], src1[(idx)], src0[(idx)] ) )

    /* Write four consecutive state entries: load a 256-bit chunk from each
     * stream starting at idx, run INTLEAVE() on the four chunks, then store
     * the results to entries idx .. idx+3. */
    #define OVERWRITE_4_LANES( idx ) \
        do { \
            lanes0 = LOAD256u( src0[(idx)] ); \
            lanes1 = LOAD256u( src1[(idx)] ); \
            lanes2 = LOAD256u( src2[(idx)] ); \
            lanes3 = LOAD256u( src3[(idx)] ); \
            INTLEAVE(); \
            STORE256( laneState[(idx) + 0], lanes0 ); \
            STORE256( laneState[(idx) + 1], lanes1 ); \
            STORE256( laneState[(idx) + 2], lanes2 ); \
            STORE256( laneState[(idx) + 3], lanes3 ); \
        } while (0)

    /* Fast paths: handle the first 16 (and then 20) lanes in groups of four
     * whenever laneCount is large enough for every 4-wide load to stay
     * within the requested lane range. */
    if ( laneCount >= 16 ) {
        OVERWRITE_4_LANES( 0 );
        OVERWRITE_4_LANES( 4 );
        OVERWRITE_4_LANES( 8 );
        OVERWRITE_4_LANES( 12 );
        lane = 16;

        if ( laneCount >= 20 ) {
            OVERWRITE_4_LANES( 16 );
            lane = 20;
        }
    }

    /* Whatever remains is written one lane at a time. */
    for ( ; lane < laneCount; lane++ ) {
        OVERWRITE_LANE( lane );
    }

    #undef  OVERWRITE_LANE
    #undef  OVERWRITE_4_LANES
}
/*
 * Overwrite the first laneCount entries of the state array with input
 * words taken from two input streams.
 *
 * states     - state storage, accessed here as an array of V128 with one
 *              entry per lane index.
 * data       - start of the first input stream; the second stream begins
 *              laneOffset*SnP_laneLengthInBytes bytes after data.
 * laneCount  - number of lane indices (starting at 0) to overwrite.
 * laneOffset - distance between the starts of the two input streams,
 *              expressed in lanes.
 *
 * For every index below laneCount, the state entry at that index is replaced
 * by a vector built from the 64-bit words found at that same index in the
 * two streams. Entries at or above laneCount are not touched.
 */
void KeccakF1600_Pl2_OverwriteLanesAll(void *states, const unsigned char *data, unsigned int laneCount, unsigned int laneOffset)
{
    V128 *laneState = states;
    const UINT64 *src0 = (const UINT64 *)data;
    const UINT64 *src1 = (const UINT64 *)(data+laneOffset*SnP_laneLengthInBytes);
    unsigned int lane = 0;  /* first index not yet written */

    /* Write one state entry from one 64-bit word per stream. */
    #define OVERWRITE_LANE( idx ) \
        STORE128( laneState[(idx)], LOAD6464( src1[(idx)], src0[(idx)] ) )

    /* Fast paths: the first 17 (and then 21) lanes are written with constant
     * indices when laneCount is large enough. */
    if ( laneCount >= 17 ) {
        OVERWRITE_LANE( 0 );
        OVERWRITE_LANE( 1 );
        OVERWRITE_LANE( 2 );
        OVERWRITE_LANE( 3 );
        OVERWRITE_LANE( 4 );
        OVERWRITE_LANE( 5 );
        OVERWRITE_LANE( 6 );
        OVERWRITE_LANE( 7 );
        OVERWRITE_LANE( 8 );
        OVERWRITE_LANE( 9 );
        OVERWRITE_LANE( 10 );
        OVERWRITE_LANE( 11 );
        OVERWRITE_LANE( 12 );
        OVERWRITE_LANE( 13 );
        OVERWRITE_LANE( 14 );
        OVERWRITE_LANE( 15 );
        OVERWRITE_LANE( 16 );
        lane = 17;

        if ( laneCount >= 21 ) {
            OVERWRITE_LANE( 17 );
            OVERWRITE_LANE( 18 );
            OVERWRITE_LANE( 19 );
            OVERWRITE_LANE( 20 );
            lane = 21;
        }
    }

    /* Whatever remains is written one lane at a time. */
    for ( ; lane < laneCount; lane++ ) {
        OVERWRITE_LANE( lane );
    }

    #undef  OVERWRITE_LANE
}