Exemple #1
0
uint64_t avx2_count_byte(const uint8_t* data, size_t size, uint8_t byte) {

    const __m256i v = _mm256_set1_epi8(byte);

    const uint8_t* end = data + size;
    const uint8_t* ptr = data;

    __m256i global_sum = _mm256_setzero_si256();
    __m256i local_sum;

    // 1. blocks of 256 registers
    while (ptr + 255*32 < end) {
        local_sum = _mm256_setzero_si256();

        // update 32 x 8-bit counter
        for (int i=0; i < 255; i++, ptr += 32) {
            const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
            const __m256i eq = _mm256_cmpeq_epi8(in, v); // 0 or -1

            local_sum = _mm256_sub_epi8(local_sum, eq);
        }

        // update the global accumulator 2 x 64-bit
        const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
        global_sum = _mm256_add_epi64(global_sum, tmp);
    }


    // 2. tail of < 256 registers
    local_sum = _mm256_setzero_si256();
    while (ptr + 32 < end) {
        const __m256i in = _mm256_loadu_si256((const __m256i*)ptr);
        const __m256i eq = _mm256_cmpeq_epi8(in, v);

        local_sum = _mm256_sub_epi8(local_sum, eq);

        ptr += 32;
    }

    const __m256i tmp = _mm256_sad_epu8(local_sum, _mm256_setzero_si256());
    global_sum = _mm256_add_epi64(global_sum, tmp);


    // 3. process tail < 32 bytes
    uint64_t result = _mm256_extract_epi64(global_sum, 0)
                    + _mm256_extract_epi64(global_sum, 1)
                    + _mm256_extract_epi64(global_sum, 2)
                    + _mm256_extract_epi64(global_sum, 3);

    return result + scalar_count_bytes(ptr, end - ptr, byte);
}
		template <bool align> void SquaredDifferenceSum(
			const uint8_t *a, size_t aStride, const uint8_t *b, size_t bStride, 
			size_t width, size_t height, uint64_t * sum)
		{
			assert(width < 0x10000);
			if(align)
			{
				assert(Aligned(a) && Aligned(aStride) && Aligned(b) && Aligned(bStride));
			}

			size_t bodyWidth = AlignLo(width, A);
			__m256i tailMask = SetMask<uint8_t>(0, A - width + bodyWidth, 0xFF);
			__m256i fullSum = _mm256_setzero_si256();
			for(size_t row = 0; row < height; ++row)
			{
				__m256i rowSum = _mm256_setzero_si256();
				for(size_t col = 0; col < bodyWidth; col += A)
				{
					const __m256i a_ = Load<align>((__m256i*)(a + col));
					const __m256i b_ = Load<align>((__m256i*)(b + col)); 
					rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				if(width - bodyWidth)
				{
					const __m256i a_ = _mm256_and_si256(tailMask, Load<false>((__m256i*)(a + width - A)));
					const __m256i b_ = _mm256_and_si256(tailMask, Load<false>((__m256i*)(b + width - A))); 
					rowSum = _mm256_add_epi32(rowSum, SquaredDifference(a_, b_));
				}
				fullSum = _mm256_add_epi64(fullSum, HorizontalSum32(rowSum));
				a += aStride;
				b += bStride;
			}
			*sum = ExtractSum<uint64_t>(fullSum);
		}
Exemple #3
0
//int64
vec3l& vec3l::Add(int64 v) {
	__m256i vxmm = _mm256_set_epi64x(0, v, v, v);
	__m256i xmm = _mm256_set_epi64x(0, z, y, x);
	xmm = _mm256_add_epi64(xmm, vxmm);

	x = M256L(xmm, 0);
	y = M256L(xmm, 1); 
	z = M256L(xmm, 2);

	return *this;
}
Exemple #4
0
vec3l& vec3l::Add(const vec3l& v) {
	__m256i vxmm = _mm256_set_epi64x(0, v.z, v.y, v.x);
	__m256i xmm = _mm256_set_epi64x(0, z, y, x);
	xmm = _mm256_add_epi64(xmm, vxmm);

	x = M256L(xmm, 0);
	y = M256L(xmm, 1);
	z = M256L(xmm, 2);

	return *this;
}
static inline __m256i mulhi_epu64(__m256i x, __m256i y) {
  __m256i x_hi = _mm256_srli_epi64(x, 32);
  __m256i y_hi = _mm256_srli_epi64(y, 32);
//  __m256i mask = _mm256_set1_epi64x(0xFFFFFFFFL);
 // __m256i x_lo = _mm256_and_si256(x, mask);
//  __m256i y_lo = _mm256_and_si256(y, mask);
/// masking is unnecessary because _mm256_mul_epu32 does it for us (for free):
__m256i x_lo = x;
__m256i y_lo = y;
/////////////
  __m256i result = _mm256_mul_epu32(x_lo,y_lo);
  result = _mm256_srli_epi64(result, 32);
  __m256i result1 = _mm256_mul_epu32(x_hi,y_lo);
  __m256i result2 = _mm256_mul_epu32(x_lo,y_hi);
  result = _mm256_add_epi64(result, result1);
  result = _mm256_add_epi64(result, result2);
  result = _mm256_srli_epi64(result, 32);
  __m256i result3 = _mm256_mul_epu32(x_hi,y_hi);
  result = _mm256_add_epi64(result, result3);
  return result;
}
int64_t vp9_block_error_avx2(const int16_t *coeff,
                             const int16_t *dqcoeff,
                             intptr_t block_size,
                             int64_t *ssz) {
  __m256i sse_reg, ssz_reg, coeff_reg, dqcoeff_reg;
  __m256i exp_dqcoeff_lo, exp_dqcoeff_hi, exp_coeff_lo, exp_coeff_hi;
  __m256i sse_reg_64hi, ssz_reg_64hi;
  __m128i sse_reg128, ssz_reg128;
  int64_t sse;
  int i;
  const __m256i zero_reg = _mm256_set1_epi16(0);

  // init sse and ssz registerd to zero
  sse_reg = _mm256_set1_epi16(0);
  ssz_reg = _mm256_set1_epi16(0);

  for (i = 0 ; i < block_size ; i+= 16) {
    // load 32 bytes from coeff and dqcoeff
    coeff_reg = _mm256_loadu_si256((const __m256i *)(coeff + i));
    dqcoeff_reg = _mm256_loadu_si256((const __m256i *)(dqcoeff + i));
    // dqcoeff - coeff
    dqcoeff_reg = _mm256_sub_epi16(dqcoeff_reg, coeff_reg);
    // madd (dqcoeff - coeff)
    dqcoeff_reg = _mm256_madd_epi16(dqcoeff_reg, dqcoeff_reg);
    // madd coeff
    coeff_reg = _mm256_madd_epi16(coeff_reg, coeff_reg);
    // expand each double word of madd (dqcoeff - coeff) to quad word
    exp_dqcoeff_lo = _mm256_unpacklo_epi32(dqcoeff_reg, zero_reg);
    exp_dqcoeff_hi = _mm256_unpackhi_epi32(dqcoeff_reg, zero_reg);
    // expand each double word of madd (coeff) to quad word
    exp_coeff_lo = _mm256_unpacklo_epi32(coeff_reg, zero_reg);
    exp_coeff_hi = _mm256_unpackhi_epi32(coeff_reg, zero_reg);
    // add each quad word of madd (dqcoeff - coeff) and madd (coeff)
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_lo);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_lo);
    sse_reg = _mm256_add_epi64(sse_reg, exp_dqcoeff_hi);
    ssz_reg = _mm256_add_epi64(ssz_reg, exp_coeff_hi);
  }
  // save the higher 64 bit of each 128 bit lane
  sse_reg_64hi = _mm256_srli_si256(sse_reg, 8);
  ssz_reg_64hi = _mm256_srli_si256(ssz_reg, 8);
  // add the higher 64 bit to the low 64 bit
  sse_reg = _mm256_add_epi64(sse_reg, sse_reg_64hi);
  ssz_reg = _mm256_add_epi64(ssz_reg, ssz_reg_64hi);

  // add each 64 bit from each of the 128 bit lane of the 256 bit
  sse_reg128 = _mm_add_epi64(_mm256_castsi256_si128(sse_reg),
                             _mm256_extractf128_si256(sse_reg, 1));

  ssz_reg128 = _mm_add_epi64(_mm256_castsi256_si128(ssz_reg),
                             _mm256_extractf128_si256(ssz_reg, 1));

  // store the results
  _mm_storel_epi64((__m128i*)(&sse), sse_reg128);

  _mm_storel_epi64((__m128i*)(ssz), ssz_reg128);
  return sse;
}
static uint64_t popcnt_harley_seal(const __m512i* data, const uint64_t size)
{
  __m256i total     = _mm256_setzero_si256();
  __m512i ones      = _mm512_setzero_si512();
  __m512i twos      = _mm512_setzero_si512();
  __m512i fours     = _mm512_setzero_si512();
  __m512i eights    = _mm512_setzero_si512();
  __m512i sixteens  = _mm512_setzero_si512();
  __m512i twosA, twosB, foursA, foursB, eightsA, eightsB;

  const uint64_t limit = size - size % 16;
  uint64_t i = 0;

  for(; i < limit; i += 16)
  {
    CSA(&twosA, &ones, ones, data[i+0], data[i+1]);
    CSA(&twosB, &ones, ones, data[i+2], data[i+3]);
    CSA(&foursA, &twos, twos, twosA, twosB);
    CSA(&twosA, &ones, ones, data[i+4], data[i+5]);
    CSA(&twosB, &ones, ones, data[i+6], data[i+7]);
    CSA(&foursB, &twos, twos, twosA, twosB);
    CSA(&eightsA,&fours, fours, foursA, foursB);
    CSA(&twosA, &ones, ones, data[i+8], data[i+9]);
    CSA(&twosB, &ones, ones, data[i+10], data[i+11]);
    CSA(&foursA, &twos, twos, twosA, twosB);
    CSA(&twosA, &ones, ones, data[i+12], data[i+13]);
    CSA(&twosB, &ones, ones, data[i+14], data[i+15]);
    CSA(&foursB, &twos, twos, twosA, twosB);
    CSA(&eightsB, &fours, fours, foursA, foursB);
    CSA(&sixteens, &eights, eights, eightsA, eightsB);

    total = _mm256_add_epi64(total, popcount(sixteens));
  }

  total = _mm256_slli_epi64(total, 4);     // * 16
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(eights), 3)); // += 8 * ...
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(fours),  2)); // += 4 * ...
  total = _mm256_add_epi64(total, _mm256_slli_epi64(popcount(twos),   1)); // += 2 * ...
  total = _mm256_add_epi64(total, popcount(ones));

  for(; i < size; i++) {
    total = _mm256_add_epi64(total, popcount(data[i]));
  }


  return avx2_sum_epu64(total);
}
Exemple #8
0
__m256i test_mm256_add_epi64(__m256i a, __m256i b) {
  // CHECK: add <4 x i64>
  return _mm256_add_epi64(a, b);
}
Exemple #9
0
 /*!
  * \brief Add the two given values and return the result.
  */
 ETL_STATIC_INLINE(avx_simd_long) add(avx_simd_long lhs, avx_simd_long rhs) {
     return _mm256_add_epi64(lhs.value, rhs.value);
 }
static FORCE_INLINE void FlowInter_8px_AVX2(
        int w, PixelType *pdst,
        const PixelType *prefB, const PixelType *prefF,
        const int16_t *VXFullB, const int16_t *VXFullF,
        const int16_t *VYFullB, const int16_t *VYFullF,
        const uint8_t *MaskB, const uint8_t *MaskF,
        int nPelLog,
        const __m256i &dwords_time256, const __m256i &dwords_256_time256,
        const __m256i &dwords_ref_pitch, const __m256i &dwords_hoffsets) {

    __m256i dwords_w = _mm256_add_epi32(_mm256_set1_epi32(w << nPelLog), dwords_hoffsets);

    __m256i dstF = lookup_AVX2(VXFullF, VYFullF, prefF, w, dwords_time256, dwords_ref_pitch, dwords_w);
    __m256i dstB = lookup_AVX2(VXFullB, VYFullB, prefB, w, dwords_256_time256, dwords_ref_pitch, dwords_w);

    __m256i dstF0 = _mm256_i32gather_epi32((const int *)prefF, dwords_w, sizeof(PixelType));
    __m256i dstB0 = _mm256_i32gather_epi32((const int *)prefB, dwords_w, sizeof(PixelType));
    dstF0 = _mm256_and_si256(dstF0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));
    dstB0 = _mm256_and_si256(dstB0, _mm256_set1_epi32((1 << (sizeof(PixelType) * 8)) - 1));

    __m256i maskf = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskF[w]));
    __m256i maskb = _mm256_cvtepu8_epi32(_mm_loadl_epi64((const __m128i *)&MaskB[w]));

    const __m256i dwords_255 = _mm256_set1_epi32(255);

    __m256i maskf_inv = _mm256_sub_epi32(dwords_255, maskf);
    __m256i maskb_inv = _mm256_sub_epi32(dwords_255, maskb);

    __m256i dstF_maskf_inv, dstB_maskb_inv, dstF0_maskb, dstB0_maskf;

    if (sizeof(PixelType) == 1) {
        dstF_maskf_inv = _mm256_mullo_epi16(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi16(dstB, maskb_inv);

        dstF0_maskb = _mm256_mullo_epi16(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi16(dstB0, maskf);
    } else {
        dstF_maskf_inv = _mm256_mullo_epi32(dstF, maskf_inv);
        dstB_maskb_inv = _mm256_mullo_epi32(dstB, maskb_inv);

        dstF0_maskb = _mm256_mullo_epi32(dstF0, maskb);
        dstB0_maskf = _mm256_mullo_epi32(dstB0, maskf);
    }

    __m256i f = _mm256_add_epi32(dstF0_maskb, dstB_maskb_inv);
    __m256i b = _mm256_add_epi32(dstB0_maskf, dstF_maskf_inv);

    if (sizeof(PixelType) == 1) {
        f = _mm256_mullo_epi32(f, maskf);
        b = _mm256_mullo_epi32(b, maskb);

        f = _mm256_add_epi32(f, dwords_255);
        b = _mm256_add_epi32(b, dwords_255);

        f = _mm256_srai_epi32(f, 8);
        b = _mm256_srai_epi32(b, 8);
    } else {
        const __m256i qwords_255 = _mm256_set1_epi64x(255);

        __m256i tempf = _mm256_mul_epu32(f, maskf);
        __m256i tempb = _mm256_mul_epu32(b, maskb);
        tempf = _mm256_add_epi64(tempf, qwords_255);
        tempb = _mm256_add_epi64(tempb, qwords_255);
        tempf = _mm256_srli_epi64(tempf, 8);
        tempb = _mm256_srli_epi64(tempb, 8);

        f = _mm256_srli_epi64(f, 32);
        b = _mm256_srli_epi64(b, 32);
        f = _mm256_mul_epu32(f, _mm256_srli_epi64(maskf, 32));
        b = _mm256_mul_epu32(b, _mm256_srli_epi64(maskb, 32));
        f = _mm256_add_epi64(f, qwords_255);
        b = _mm256_add_epi64(b, qwords_255);
        f = _mm256_srli_epi64(f, 8);
        b = _mm256_srli_epi64(b, 8);
        f = _mm256_or_si256(tempf, _mm256_slli_epi64(f, 32));
        b = _mm256_or_si256(tempb, _mm256_slli_epi64(b, 32));
    }

    f = _mm256_add_epi32(f, dstF_maskf_inv);
    b = _mm256_add_epi32(b, dstB_maskb_inv);

    f = _mm256_add_epi32(f, dwords_255);
    b = _mm256_add_epi32(b, dwords_255);

    f = _mm256_srai_epi32(f, 8);
    b = _mm256_srai_epi32(b, 8);

    if (sizeof(PixelType) == 1) {
        f = _mm256_madd_epi16(f, dwords_256_time256);
        b = _mm256_madd_epi16(b, dwords_time256);
    } else {
        f = _mm256_mullo_epi32(f, dwords_256_time256);
        b = _mm256_mullo_epi32(b, dwords_time256);
    }

    __m256i dst = _mm256_add_epi32(f, b);
    dst = _mm256_srai_epi32(dst, 8);

    dst = _mm256_packus_epi32(dst, dst);
    dst = _mm256_permute4x64_epi64(dst, 0xe8); // 0b11101000 - copy third qword to second qword
    __m128i dst128 = _mm256_castsi256_si128(dst);

    if (sizeof(PixelType) == 1) {
        dst128 = _mm_packus_epi16(dst128, dst128);
        _mm_storel_epi64((__m128i *)&pdst[w], dst128);
    } else {
        _mm_storeu_si128((__m128i *)&pdst[w], dst128);
    }
}
void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
		uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps)
{
	const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
	uint32_t partitions = 1u << max_partition_order;

	FLAC__ASSERT(default_partition_samples > predictor_order);

	/* first do max_partition_order */
	{
		const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples);
		uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order);

		if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) {
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m256i sum256 = _mm256_setzero_si256();
				__m128i sum128;
				end += default_partition_samples;

				for( ; (int)residual_sample < (int)end-7; residual_sample+=8) {
					__m256i res256 = _mm256_abs_epi32(_mm256_loadu_si256((const __m256i*)(residual+residual_sample)));
					sum256 = _mm256_add_epi32(sum256, res256);
				}

				sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));

				for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
					__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
					sum128 = _mm_add_epi32(sum128, res128);
				}

				for( ; residual_sample < end; residual_sample++) {
					__m128i res128 = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					sum128 = _mm_add_epi32(sum128, res128);
				}

				sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_SHUFFLE(1,0,3,2)));
				sum128 = _mm_add_epi32(sum128, _mm_shufflelo_epi16(sum128, _MM_SHUFFLE(1,0,3,2)));
				abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(sum128);
/* workaround for MSVC bugs (at least versions 2015 and 2017 are affected) */
#if (defined _MSC_VER) && (defined FLAC__CPU_X86_64)
				abs_residual_partition_sums[partition] &= 0xFFFFFFFF; /**/
#endif
			}
		}
		else { /* have to pessimistically use 64 bits for accumulator */
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m256i sum256 = _mm256_setzero_si256();
				__m128i sum128;
				end += default_partition_samples;

				for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
					__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
					__m256i res256 = _mm256_cvtepu32_epi64(res128);
					sum256 = _mm256_add_epi64(sum256, res256);
				}

				sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));

				for( ; (int)residual_sample < (int)end-1; residual_sample+=2) {
					__m128i res128 = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample)));
					res128 = _mm_cvtepu32_epi64(res128);
					sum128 = _mm_add_epi64(sum128, res128);
				}

				for( ; residual_sample < end; residual_sample++) {
					__m128i res128 = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					sum128 = _mm_add_epi64(sum128, res128);
				}

				sum128 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8));
				_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), sum128);
			}
		}
	}

	/* now merge partitions for lower orders */
	{
		uint32_t from_partition = 0, to_partition = partitions;
		int partition_order;
		for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
			uint32_t i;
			partitions >>= 1;
			for(i = 0; i < partitions; i++) {
				abs_residual_partition_sums[to_partition++] =
					abs_residual_partition_sums[from_partition  ] +
					abs_residual_partition_sums[from_partition+1];
				from_partition += 2;
			}
		}
	}
	_mm256_zeroupper();
}
Exemple #12
0
/**
 * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
 * the wordwise addition of two columns, ignoring carries between words). The
 * output of this operation, "rand", is then used to make
 * "M[rowOut][col] = M[rowOut][col] XOR rand" and
 * "M[rowInOut][col] =  M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left.
 *
 * @param state          The current state of the sponge
 * @param rowIn          Row used only as input
 * @param rowInOut       Row used as input and to receive output after rotation
 * @param rowOut         Row receiving the output
 *
 */
void reducedDuplexRow(uint64_t *state, uint64_t *rowIn, uint64_t *rowInOut, uint64_t *rowOut, const uint32_t nCols)
{
   uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
   uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
   uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
   unsigned int i;

   for (i = 0; i < nCols; i++)
   {

   //Absorbing "M[prev] [+] M[row*]"
   #if defined __AVX2__

       __m256i state_v[3], in_v[3], inout_v[3];
       #define out_v in_v    // reuse register in next code block

       state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
       in_v   [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
       inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) );
       state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
       in_v   [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
       inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) );
       state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
       in_v   [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );
       inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) );

       _mm256_store_si256( (__m256i*)(&state[0]),
                             _mm256_xor_si256( state_v[0], 
                                               _mm256_add_epi64( in_v[0],
                                                               inout_v[0] ) ) );
       _mm256_store_si256( (__m256i*)(&state[4]), 
                            _mm256_xor_si256( state_v[1],
                                               _mm256_add_epi64( in_v[1],
                                                               inout_v[1] ) ) );
       _mm256_store_si256( (__m256i*)(&state[8]), 
                            _mm256_xor_si256( state_v[2], 
                                              _mm256_add_epi64( in_v[2],
                                                               inout_v[2] ) ) );
   #elif defined __AVX__

       __m128i state_v[6], in_v[6], inout_v[6];
       #define out_v in_v    // reuse register in next code block

       state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
       state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
       state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
       state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
       state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
       state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

       inout_v[0]    = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[0]) );
       inout_v[1]    = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[2]) );
       inout_v[2]    = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[4]) );
       inout_v[3]    = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[6]) );
       inout_v[4]    = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[8]) );
       inout_v[5]    = _mm_loadu_si128( (__m128i*)(&ptrWordInOut[10]) );

       in_v[0]    = _mm_loadu_si128( (__m128i*)(&ptrWordIn[0]) );
       in_v[1]    = _mm_loadu_si128( (__m128i*)(&ptrWordIn[2]) );
       in_v[2]    = _mm_loadu_si128( (__m128i*)(&ptrWordIn[4]) );
       in_v[3]    = _mm_loadu_si128( (__m128i*)(&ptrWordIn[6]) );
       in_v[4]    = _mm_loadu_si128( (__m128i*)(&ptrWordIn[8]) );
       in_v[5]    = _mm_loadu_si128( (__m128i*)(&ptrWordIn[10]) );

       _mm_store_si128( (__m128i*)(&state[0]),
                         _mm_xor_si128( state_v[0],
                                        _mm_add_epi64( in_v[0],
                                                       inout_v[0] ) ) );
       _mm_store_si128( (__m128i*)(&state[2]),
                         _mm_xor_si128( state_v[1],
                                        _mm_add_epi64( in_v[1],
                                                       inout_v[1] ) ) );
       _mm_store_si128( (__m128i*)(&state[4]),
                         _mm_xor_si128( state_v[2],
                                        _mm_add_epi64( in_v[2],
                                                       inout_v[2] ) ) );
       _mm_store_si128( (__m128i*)(&state[6]),
                         _mm_xor_si128( state_v[3],
                                        _mm_add_epi64( in_v[3],
                                                       inout_v[3] ) ) );
       _mm_store_si128( (__m128i*)(&state[8]),
                         _mm_xor_si128( state_v[4],
                                        _mm_add_epi64( in_v[4],
                                                       inout_v[4] ) ) );
       _mm_store_si128( (__m128i*)(&state[10]),
                         _mm_xor_si128( state_v[5],
                                        _mm_add_epi64( in_v[5],
                                                       inout_v[5] ) ) );

   #else

       state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
       state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
       state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
       state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
       state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
       state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
       state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
       state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
       state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
       state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
       state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
       state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
    #endif

    //Applies the reduced-round transformation f to the sponge's state
    reducedBlake2bLyra(state);

    //M[rowOut][col] = M[rowOut][col] XOR rand
    #if defined __AVX2__

       state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
       out_v  [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) );
       state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
       out_v  [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) );
       state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
       out_v  [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) );

       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
                            _mm256_xor_si256( state_v[0], out_v[0] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
                            _mm256_xor_si256( state_v[1], out_v[1] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
                            _mm256_xor_si256( state_v[2], out_v[2] ) );

    #elif defined __AVX__

       state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
       state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
       state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
       state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
       state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
       state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

       out_v[0]    = _mm_loadu_si128( (__m128i*)(&ptrWordOut[0]) );
       out_v[1]    = _mm_loadu_si128( (__m128i*)(&ptrWordOut[2]) );
       out_v[2]    = _mm_loadu_si128( (__m128i*)(&ptrWordOut[4]) );
       out_v[3]    = _mm_loadu_si128( (__m128i*)(&ptrWordOut[6]) );
       out_v[4]    = _mm_loadu_si128( (__m128i*)(&ptrWordOut[8]) );
       out_v[5]    = _mm_loadu_si128( (__m128i*)(&ptrWordOut[10]) );

       _mm_storeu_si128( (__m128i*)(&ptrWordOut[0]),
                         _mm_xor_si128( state_v[0], out_v[0] ) );
       _mm_storeu_si128( (__m128i*)(&ptrWordOut[2]),
                         _mm_xor_si128( state_v[1], out_v[1] ) );
       _mm_storeu_si128( (__m128i*)(&ptrWordOut[4]),
                         _mm_xor_si128( state_v[2], out_v[2] ) );
       _mm_storeu_si128( (__m128i*)(&ptrWordOut[6]),
                         _mm_xor_si128( state_v[3], out_v[3] ) );
       _mm_storeu_si128( (__m128i*)(&ptrWordOut[8]),
                         _mm_xor_si128( state_v[4], out_v[4] ) );
       _mm_storeu_si128( (__m128i*)(&ptrWordOut[10]),
                         _mm_xor_si128( state_v[5], out_v[5] ) );

    #else

       ptrWordOut[0] ^= state[0];
       ptrWordOut[1] ^= state[1];
       ptrWordOut[2] ^= state[2];
       ptrWordOut[3] ^= state[3];
       ptrWordOut[4] ^= state[4];
       ptrWordOut[5] ^= state[5];
       ptrWordOut[6] ^= state[6];
       ptrWordOut[7] ^= state[7];
       ptrWordOut[8] ^= state[8];
       ptrWordOut[9] ^= state[9];
       ptrWordOut[10] ^= state[10];
       ptrWordOut[11] ^= state[11];

    #endif

    //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
    ptrWordInOut[0] ^= state[11];
    ptrWordInOut[1] ^= state[0];
    ptrWordInOut[2] ^= state[1];
    ptrWordInOut[3] ^= state[2];
    ptrWordInOut[4] ^= state[3];
    ptrWordInOut[5] ^= state[4];
    ptrWordInOut[6] ^= state[5];
    ptrWordInOut[7] ^= state[6];
    ptrWordInOut[8] ^= state[7];
    ptrWordInOut[9] ^= state[8];
    ptrWordInOut[10] ^= state[9];
    ptrWordInOut[11] ^= state[10];

    //Goes to next block
    ptrWordOut += BLOCK_LEN_INT64;
    ptrWordInOut += BLOCK_LEN_INT64;
    ptrWordIn += BLOCK_LEN_INT64;
  }
}
Exemple #13
0
void extern
avx2_test (void)
{
  x = _mm256_add_epi64 (x, x);
}
void FLAC__precompute_partition_info_sums_intrin_avx2(const FLAC__int32 residual[], FLAC__uint64 abs_residual_partition_sums[],
		uint32_t residual_samples, uint32_t predictor_order, uint32_t min_partition_order, uint32_t max_partition_order, uint32_t bps)
{
	const uint32_t default_partition_samples = (residual_samples + predictor_order) >> max_partition_order;
	uint32_t partitions = 1u << max_partition_order;

	FLAC__ASSERT(default_partition_samples > predictor_order);

	/* first do max_partition_order */
	{
		const uint32_t threshold = 32 - FLAC__bitmath_ilog2(default_partition_samples);
		uint32_t partition, residual_sample, end = (uint32_t)(-(int32_t)predictor_order);

		if(bps + FLAC__MAX_EXTRA_RESIDUAL_BPS < threshold) {
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m256i sum256 = _mm256_setzero_si256();
				__m128i sum128;
				end += default_partition_samples;

				for( ; (int)residual_sample < (int)end-7; residual_sample+=8) {
					__m256i res256 = _mm256_abs_epi32(_mm256_loadu_si256((const __m256i*)(residual+residual_sample)));
					sum256 = _mm256_add_epi32(sum256, res256);
				}

				sum128 = _mm_add_epi32(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));

				for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
					__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
					sum128 = _mm_add_epi32(sum128, res128);
				}

				for( ; residual_sample < end; residual_sample++) {
					__m128i res128 = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					sum128 = _mm_add_epi32(sum128, res128);
				}

				sum128 = _mm_hadd_epi32(sum128, sum128);
				sum128 = _mm_hadd_epi32(sum128, sum128);
				abs_residual_partition_sums[partition] = (FLAC__uint32)_mm_cvtsi128_si32(sum128);
/* workaround for a bug in MSVC2015U2 - see https://connect.microsoft.com/VisualStudio/feedback/details/2659191/incorrect-code-generation-for-x86-64 */
#if (defined _MSC_VER) && (_MSC_FULL_VER == 190023918) && (defined FLAC__CPU_X86_64)
				abs_residual_partition_sums[partition] &= 0xFFFFFFFF; /**/
#endif
			}
		}
		else { /* have to pessimistically use 64 bits for accumulator */
			for(partition = residual_sample = 0; partition < partitions; partition++) {
				__m256i sum256 = _mm256_setzero_si256();
				__m128i sum128;
				end += default_partition_samples;

				for( ; (int)residual_sample < (int)end-3; residual_sample+=4) {
					__m128i res128 = _mm_abs_epi32(_mm_loadu_si128((const __m128i*)(residual+residual_sample)));
					__m256i res256 = _mm256_cvtepu32_epi64(res128);
					sum256 = _mm256_add_epi64(sum256, res256);
				}

				sum128 = _mm_add_epi64(_mm256_extracti128_si256(sum256, 1), _mm256_castsi256_si128(sum256));

				for( ; (int)residual_sample < (int)end-1; residual_sample+=2) {
					__m128i res128 = _mm_abs_epi32(_mm_loadl_epi64((const __m128i*)(residual+residual_sample)));
					res128 = _mm_cvtepu32_epi64(res128);
					sum128 = _mm_add_epi64(sum128, res128);
				}

				for( ; residual_sample < end; residual_sample++) {
					__m128i res128 = _mm_abs_epi32(_mm_cvtsi32_si128(residual[residual_sample]));
					sum128 = _mm_add_epi64(sum128, res128);
				}

				sum128 = _mm_add_epi64(sum128, _mm_srli_si128(sum128, 8));
				_mm_storel_epi64((__m128i*)(abs_residual_partition_sums+partition), sum128);
			}
		}
	}

	/* now merge partitions for lower orders */
	{
		uint32_t from_partition = 0, to_partition = partitions;
		int partition_order;
		for(partition_order = (int)max_partition_order - 1; partition_order >= (int)min_partition_order; partition_order--) {
			uint32_t i;
			partitions >>= 1;
			for(i = 0; i < partitions; i++) {
				abs_residual_partition_sums[to_partition++] =
					abs_residual_partition_sums[from_partition  ] +
					abs_residual_partition_sums[from_partition+1];
				from_partition += 2;
			}
		}
	}
	_mm256_zeroupper();
}
Exemple #15
0
/**
 * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
 * the wordwise addition of two columns, ignoring carries between words). The
 * output of this operation, "rand", is then used to make
 * "M[rowOut][(N_COLS-1)-col] = M[rowIn][col] XOR rand" and
 * "M[rowInOut][col] =  M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left and N_COLS is a system parameter.
 *
 * @param state          The current state of the sponge
 * @param rowIn          Row used only as input
 * @param rowInOut       Row used as input and to receive output after rotation
 * @param rowOut         Row receiving the output
 *
 */
inline void reducedDuplexRowSetup( uint64_t *state, uint64_t *rowIn,
                                   uint64_t *rowInOut, uint64_t *rowOut,
                                   uint64_t nCols )
{
    uint64_t* ptrWordIn = rowIn;	//In Lyra2: pointer to prev
    uint64_t* ptrWordInOut = rowInOut;	//In Lyra2: pointer to row*
    uint64_t* ptrWordOut = rowOut + (nCols-1)*BLOCK_LEN_INT64; //In Lyra2: pointer to row
    int i;

#if defined __AVX2__
    __m256i state_v[4], in_v[3], inout_v[3];
    #define t_state in_v

    state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
    state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
    state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
    state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );

    for ( i = 0; i < nCols; i++ )
    {
       in_v   [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
       inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) );
       in_v   [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
       inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) );
       in_v   [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );
       inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) );

       state_v[0] = _mm256_xor_si256( state_v[0],  _mm256_add_epi64( in_v[0],
                                                                inout_v[0] ) );
       state_v[1] = _mm256_xor_si256( state_v[1],  _mm256_add_epi64( in_v[1],
                                                                inout_v[1] ) );
       state_v[2] = _mm256_xor_si256( state_v[2],  _mm256_add_epi64( in_v[2],
                                                                inout_v[2] ) );

       LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
                              _mm256_xor_si256( state_v[0], in_v[0] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
                              _mm256_xor_si256( state_v[1], in_v[1] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
                              _mm256_xor_si256( state_v[2], in_v[2] ) );

       //M[row*][col] = M[row*][col] XOR rotW(rand)
      t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 );
      t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 );
      t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 );

      inout_v[0] = _mm256_xor_si256( inout_v[0],
                         _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) );
      inout_v[1] = _mm256_xor_si256( inout_v[1],
                         _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) );
      inout_v[2] = _mm256_xor_si256( inout_v[2],
                         _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) );

      _mm256_storeu_si256( (__m256i*)&ptrWordInOut[0], inout_v[0] );
      _mm256_storeu_si256( (__m256i*)&ptrWordInOut[4], inout_v[1] );
      _mm256_storeu_si256( (__m256i*)&ptrWordInOut[8], inout_v[2] );

       //Inputs: next column (i.e., next block in sequence)
       ptrWordInOut += BLOCK_LEN_INT64;
       ptrWordIn += BLOCK_LEN_INT64;
       //Output: goes to previous column
       ptrWordOut -= BLOCK_LEN_INT64;
    }

    _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
    _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
    _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
    _mm256_store_si256( (__m256i*)&state[12], state_v[3] );

    #undef t_state 

#elif defined __AVX__

        __m128i state_v[6], in_v[6], inout_v[6];

    for ( i = 0; i < nCols; i++ )
    {

        state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
        state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
        state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
        state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
        state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
        state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

        inout_v[0]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[0]) );
        inout_v[1]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[2]) );
        inout_v[2]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[4]) );
        inout_v[3]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[6]) );
        inout_v[4]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[8]) );
        inout_v[5]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[10]) );

        in_v[0]    = _mm_load_si128( (__m128i*)(&ptrWordIn[0]) );
        in_v[1]    = _mm_load_si128( (__m128i*)(&ptrWordIn[2]) );
        in_v[2]    = _mm_load_si128( (__m128i*)(&ptrWordIn[4]) );
        in_v[3]    = _mm_load_si128( (__m128i*)(&ptrWordIn[6]) );
        in_v[4]    = _mm_load_si128( (__m128i*)(&ptrWordIn[8]) );
        in_v[5]    = _mm_load_si128( (__m128i*)(&ptrWordIn[10]) );

        _mm_store_si128( (__m128i*)(&state[0]),
                          _mm_xor_si128( state_v[0],
                                         _mm_add_epi64( in_v[0],
                                                        inout_v[0] ) ) );
        _mm_store_si128( (__m128i*)(&state[2]),
                          _mm_xor_si128( state_v[1],
                                         _mm_add_epi64( in_v[1],
                                                        inout_v[1] ) ) );
        _mm_store_si128( (__m128i*)(&state[4]),
                          _mm_xor_si128( state_v[2],
                                         _mm_add_epi64( in_v[2],
                                                        inout_v[2] ) ) );
        _mm_store_si128( (__m128i*)(&state[6]),
                          _mm_xor_si128( state_v[3],
                                         _mm_add_epi64( in_v[3],
                                                        inout_v[3] ) ) );
        _mm_store_si128( (__m128i*)(&state[8]),
                          _mm_xor_si128( state_v[4],
                                         _mm_add_epi64( in_v[4],
                                                        inout_v[4] ) ) );
        _mm_store_si128( (__m128i*)(&state[10]),
                          _mm_xor_si128( state_v[5],
                                         _mm_add_epi64( in_v[5],
                                                        inout_v[5] ) ) );

    //Applies the reduced-round transformation f to the sponge's state
    reducedBlake2bLyra(state);
#else
    for ( i = 0; i < nCols; i++ )
    {

    //Absorbing "M[prev] [+] M[row*]"
    state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
    state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
    state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
    state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
    state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
    state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
    state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
    state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
    state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
    state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
    state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
    state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);
    //Applies the reduced-round transformation f to the sponge's state
    reducedBlake2bLyra(state);

    //M[row][col] = M[prev][col] XOR rand
#endif


      #if defined __AVX2__

      #elif defined __AVX__

         state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
         state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
         state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
         state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
         state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
         state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

         _mm_store_si128( (__m128i*)(&ptrWordOut[0]),
                           _mm_xor_si128( state_v[0], in_v[0] ) );
         _mm_store_si128( (__m128i*)(&ptrWordOut[2]),
                           _mm_xor_si128( state_v[1], in_v[1] ) );
         _mm_store_si128( (__m128i*)(&ptrWordOut[4]),
                           _mm_xor_si128( state_v[2], in_v[2] ) );
         _mm_store_si128( (__m128i*)(&ptrWordOut[6]),
                           _mm_xor_si128( state_v[3], in_v[3] ) );
         _mm_store_si128( (__m128i*)(&ptrWordOut[8]),
                           _mm_xor_si128( state_v[4], in_v[4] ) );
         _mm_store_si128( (__m128i*)(&ptrWordOut[10]),
                           _mm_xor_si128( state_v[5], in_v[5] ) );

      #else

    ptrWordOut[0] = ptrWordIn[0]  ^ state[0];
    ptrWordOut[1] = ptrWordIn[1]  ^ state[1];
    ptrWordOut[2] = ptrWordIn[2]  ^ state[2];
    ptrWordOut[3] = ptrWordIn[3]  ^ state[3];
    ptrWordOut[4] = ptrWordIn[4]  ^ state[4];
    ptrWordOut[5] = ptrWordIn[5]  ^ state[5];
    ptrWordOut[6] = ptrWordIn[6]  ^ state[6];
    ptrWordOut[7] = ptrWordIn[7]  ^ state[7];
    ptrWordOut[8] = ptrWordIn[8]  ^ state[8];
    ptrWordOut[9] = ptrWordIn[9]  ^ state[9];
    ptrWordOut[10] = ptrWordIn[10] ^ state[10];
    ptrWordOut[11] = ptrWordIn[11] ^ state[11];
#endif

    //M[row*][col] = M[row*][col] XOR rotW(rand)
// Need to fix this before taking state load/store out of loop
#ifdef __AVX2__


#else

    ptrWordInOut[0]  ^= state[11];
    ptrWordInOut[1]  ^= state[0];
    ptrWordInOut[2]  ^= state[1];
    ptrWordInOut[3]  ^= state[2];
    ptrWordInOut[4]  ^= state[3];
    ptrWordInOut[5]  ^= state[4];
    ptrWordInOut[6]  ^= state[5];
    ptrWordInOut[7]  ^= state[6];
    ptrWordInOut[8]  ^= state[7];
    ptrWordInOut[9]  ^= state[8];
    ptrWordInOut[10] ^= state[9];
    ptrWordInOut[11] ^= state[10];

       //Inputs: next column (i.e., next block in sequence)
       ptrWordInOut += BLOCK_LEN_INT64;
       ptrWordIn += BLOCK_LEN_INT64;
       //Output: goes to previous column
       ptrWordOut -= BLOCK_LEN_INT64;
    }

#endif

}

/**
 * Performs a duplexing operation over "M[rowInOut][col] [+] M[rowIn][col]" (i.e.,
 * the wordwise addition of two columns, ignoring carries between words). The
 * output of this operation, "rand", is then used to make
 * "M[rowOut][col] = M[rowOut][col] XOR rand" and
 * "M[rowInOut][col] =  M[rowInOut][col] XOR rotW(rand)", where rotW is a 64-bit
 * rotation to the left.
 *
 * @param state          The current state of the sponge
 * @param rowIn          Row used only as input
 * @param rowInOut       Row used as input and to receive output after rotation
 * @param rowOut         Row receiving the output
 *
 */
inline void reducedDuplexRow( uint64_t *state, uint64_t *rowIn,
                              uint64_t *rowInOut, uint64_t *rowOut,
                              uint64_t nCols )
{
    uint64_t* ptrWordInOut = rowInOut; //In Lyra2: pointer to row*
    uint64_t* ptrWordIn = rowIn; //In Lyra2: pointer to prev
    uint64_t* ptrWordOut = rowOut; //In Lyra2: pointer to row
    int i;


#if defined __AVX2__

    for ( i = 0; i < nCols; i++)
    {

       //Absorbing "M[prev] [+] M[row*]"

       __m256i state_v[4], in_v[3], inout_v[3];
       #define out_v in_v    // reuse register in next code block
       #define t_state in_v
       state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
       in_v   [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[0]) );
       inout_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[0]) );
       state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
       in_v   [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[4]) );
       inout_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[4]) );
       state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
       in_v   [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordIn[8]) );
       inout_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordInOut[8]) );
       state_v[3] = _mm256_load_si256( (__m256i*)(&state[12]) );

       state_v[0] = _mm256_xor_si256( state_v[0], _mm256_add_epi64( in_v[0],
                                                               inout_v[0] ) );
       state_v[1] = _mm256_xor_si256( state_v[1], _mm256_add_epi64( in_v[1],
                                                               inout_v[1] ) );
       state_v[2] = _mm256_xor_si256( state_v[2], _mm256_add_epi64( in_v[2],
                                                               inout_v[2] ) );

       out_v[0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) );
       out_v[1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) );
       out_v[2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) );

       LYRA_ROUND_AVX2( state_v[0], state_v[1], state_v[2], state_v[3] );

       _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
       _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
       _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
       _mm256_store_si256( (__m256i*)&state[12], state_v[3] );

       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
                            _mm256_xor_si256( state_v[0], out_v[0] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
                            _mm256_xor_si256( state_v[1], out_v[1] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
                            _mm256_xor_si256( state_v[2], out_v[2] ) );

/*
       t_state[0] = _mm256_permute4x64_epi64( state_v[0], 0x93 );
       t_state[1] = _mm256_permute4x64_epi64( state_v[1], 0x93 );
       t_state[2] = _mm256_permute4x64_epi64( state_v[2], 0x93 );

       inout_v[0] = _mm256_xor_si256( inout_v[0],
                        _mm256_blend_epi32( t_state[0], t_state[2], 0x03 ) );
       inout_v[1] = _mm256_xor_si256( inout_v[1], 
                        _mm256_blend_epi32( t_state[1], t_state[0], 0x03 ) );
       inout_v[2] = _mm256_xor_si256( inout_v[2], 
                        _mm256_blend_epi32( t_state[2], t_state[1], 0x03 ) );

       _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[0]), inout_v[0] );
       _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[4]), inout_v[1] );
       _mm256_storeu_si256( (__m256i*)(&ptrWordInOut[8]), inout_v[2] );

       _mm256_store_si256( (__m256i*)&state[0], state_v[0] );
       _mm256_store_si256( (__m256i*)&state[4], state_v[1] );
       _mm256_store_si256( (__m256i*)&state[8], state_v[2] );
       _mm256_store_si256( (__m256i*)&state[12], state_v[3] );
*/
       #undef out_v
       #undef t_state 

    //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
    ptrWordInOut[0] ^= state[11];
    ptrWordInOut[1] ^= state[0];
    ptrWordInOut[2] ^= state[1];
    ptrWordInOut[3] ^= state[2];
    ptrWordInOut[4] ^= state[3];
    ptrWordInOut[5] ^= state[4];
    ptrWordInOut[6] ^= state[5];
    ptrWordInOut[7] ^= state[6];
    ptrWordInOut[8] ^= state[7];
    ptrWordInOut[9] ^= state[8];
    ptrWordInOut[10] ^= state[9];
    ptrWordInOut[11] ^= state[10];

       //Goes to next block
       ptrWordOut += BLOCK_LEN_INT64;
       ptrWordInOut += BLOCK_LEN_INT64;
       ptrWordIn += BLOCK_LEN_INT64;
    }

   #elif defined __AVX__

    for ( i = 0; i < nCols; i++)
    {

       __m128i state_v[6], in_v[6], inout_v[6];
       #define out_v in_v    // reuse register in next code block

       state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
       state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
       state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
       state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
       state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
       state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

       inout_v[0]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[0]) );
       inout_v[1]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[2]) );
       inout_v[2]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[4]) );
       inout_v[3]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[6]) );
       inout_v[4]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[8]) );
       inout_v[5]    = _mm_load_si128( (__m128i*)(&ptrWordInOut[10]) );

       in_v[0]    = _mm_load_si128( (__m128i*)(&ptrWordIn[0]) );
       in_v[1]    = _mm_load_si128( (__m128i*)(&ptrWordIn[2]) );
       in_v[2]    = _mm_load_si128( (__m128i*)(&ptrWordIn[4]) );
       in_v[3]    = _mm_load_si128( (__m128i*)(&ptrWordIn[6]) );
       in_v[4]    = _mm_load_si128( (__m128i*)(&ptrWordIn[8]) );
       in_v[5]    = _mm_load_si128( (__m128i*)(&ptrWordIn[10]) );

       _mm_store_si128( (__m128i*)(&state[0]),
                         _mm_xor_si128( state_v[0],
                                        _mm_add_epi64( in_v[0],
                                                       inout_v[0] ) ) );
       _mm_store_si128( (__m128i*)(&state[2]),
                         _mm_xor_si128( state_v[1],
                                        _mm_add_epi64( in_v[1],
                                                       inout_v[1] ) ) );
       _mm_store_si128( (__m128i*)(&state[4]),
                         _mm_xor_si128( state_v[2],
                                        _mm_add_epi64( in_v[2],
                                                       inout_v[2] ) ) );
       _mm_store_si128( (__m128i*)(&state[6]),
                         _mm_xor_si128( state_v[3],
                                        _mm_add_epi64( in_v[3],
                                                       inout_v[3] ) ) );
       _mm_store_si128( (__m128i*)(&state[8]),
                         _mm_xor_si128( state_v[4],
                                        _mm_add_epi64( in_v[4],
                                                       inout_v[4] ) ) );
       _mm_store_si128( (__m128i*)(&state[10]),
                         _mm_xor_si128( state_v[5],
                                        _mm_add_epi64( in_v[5],
                                                       inout_v[5] ) ) );

    //Applies the reduced-round transformation f to the sponge's state
    reducedBlake2bLyra(state);

   #else

    for ( i = 0; i < nCols; i++)
    {

    state[0]  ^= (ptrWordIn[0]  + ptrWordInOut[0]);
    state[1]  ^= (ptrWordIn[1]  + ptrWordInOut[1]);
    state[2]  ^= (ptrWordIn[2]  + ptrWordInOut[2]);
    state[3]  ^= (ptrWordIn[3]  + ptrWordInOut[3]);
    state[4]  ^= (ptrWordIn[4]  + ptrWordInOut[4]);
    state[5]  ^= (ptrWordIn[5]  + ptrWordInOut[5]);
    state[6]  ^= (ptrWordIn[6]  + ptrWordInOut[6]);
    state[7]  ^= (ptrWordIn[7]  + ptrWordInOut[7]);
    state[8]  ^= (ptrWordIn[8]  + ptrWordInOut[8]);
    state[9]  ^= (ptrWordIn[9]  + ptrWordInOut[9]);
    state[10] ^= (ptrWordIn[10] + ptrWordInOut[10]);
    state[11] ^= (ptrWordIn[11] + ptrWordInOut[11]);

    //Applies the reduced-round transformation f to the sponge's state
    reducedBlake2bLyra(state);
#endif

    //M[rowOut][col] = M[rowOut][col] XOR rand

    #if defined __AVX2__
/*
       state_v[0] = _mm256_load_si256( (__m256i*)(&state[0]) );
       out_v  [0] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[0]) );
       state_v[1] = _mm256_load_si256( (__m256i*)(&state[4]) );
       out_v  [1] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[4]) );
       state_v[2] = _mm256_load_si256( (__m256i*)(&state[8]) );
       out_v  [2] = _mm256_loadu_si256( (__m256i*)(&ptrWordOut[8]) );

       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[0]),
                            _mm256_xor_si256( state_v[0], out_v[0] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[4]),
                            _mm256_xor_si256( state_v[1], out_v[1] ) );
       _mm256_storeu_si256( (__m256i*)(&ptrWordOut[8]),
                            _mm256_xor_si256( state_v[2], out_v[2] ) );
*/
    #elif defined __AVX__

       state_v[0] = _mm_load_si128( (__m128i*)(&state[0]) );
       state_v[1] = _mm_load_si128( (__m128i*)(&state[2]) );
       state_v[2] = _mm_load_si128( (__m128i*)(&state[4]) );
       state_v[3] = _mm_load_si128( (__m128i*)(&state[6]) );
       state_v[4] = _mm_load_si128( (__m128i*)(&state[8]) );
       state_v[5] = _mm_load_si128( (__m128i*)(&state[10]) );

       out_v[0]    = _mm_load_si128( (__m128i*)(&ptrWordOut[0]) );
       out_v[1]    = _mm_load_si128( (__m128i*)(&ptrWordOut[2]) );
       out_v[2]    = _mm_load_si128( (__m128i*)(&ptrWordOut[4]) );
       out_v[3]    = _mm_load_si128( (__m128i*)(&ptrWordOut[6]) );
       out_v[4]    = _mm_load_si128( (__m128i*)(&ptrWordOut[8]) );
       out_v[5]    = _mm_load_si128( (__m128i*)(&ptrWordOut[10]) );

       _mm_store_si128( (__m128i*)(&ptrWordOut[0]),
                         _mm_xor_si128( state_v[0], out_v[0] ) );
       _mm_store_si128( (__m128i*)(&ptrWordOut[2]),
                         _mm_xor_si128( state_v[1], out_v[1] ) );
       _mm_store_si128( (__m128i*)(&ptrWordOut[4]),
                         _mm_xor_si128( state_v[2], out_v[2] ) );
       _mm_store_si128( (__m128i*)(&ptrWordOut[6]),
                         _mm_xor_si128( state_v[3], out_v[3] ) );
       _mm_store_si128( (__m128i*)(&ptrWordOut[8]),
                         _mm_xor_si128( state_v[4], out_v[4] ) );
       _mm_store_si128( (__m128i*)(&ptrWordOut[10]),
                         _mm_xor_si128( state_v[5], out_v[5] ) );

    //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
    ptrWordInOut[0] ^= state[11];
    ptrWordInOut[1] ^= state[0];
    ptrWordInOut[2] ^= state[1];
    ptrWordInOut[3] ^= state[2];
    ptrWordInOut[4] ^= state[3];
    ptrWordInOut[5] ^= state[4];
    ptrWordInOut[6] ^= state[5];
    ptrWordInOut[7] ^= state[6];
    ptrWordInOut[8] ^= state[7];
    ptrWordInOut[9] ^= state[8];
    ptrWordInOut[10] ^= state[9];
    ptrWordInOut[11] ^= state[10];

       //Goes to next block
       ptrWordOut += BLOCK_LEN_INT64;
       ptrWordInOut += BLOCK_LEN_INT64;
       ptrWordIn += BLOCK_LEN_INT64;
    }

#else
    ptrWordOut[0] ^= state[0];
    ptrWordOut[1] ^= state[1];
    ptrWordOut[2] ^= state[2];
    ptrWordOut[3] ^= state[3];
    ptrWordOut[4] ^= state[4];
    ptrWordOut[5] ^= state[5];
    ptrWordOut[6] ^= state[6];
    ptrWordOut[7] ^= state[7];
    ptrWordOut[8] ^= state[8];
    ptrWordOut[9] ^= state[9];
    ptrWordOut[10] ^= state[10];
    ptrWordOut[11] ^= state[11];

    //M[rowInOut][col] = M[rowInOut][col] XOR rotW(rand)
    ptrWordInOut[0] ^= state[11];
    ptrWordInOut[1] ^= state[0];
    ptrWordInOut[2] ^= state[1];
    ptrWordInOut[3] ^= state[2];
    ptrWordInOut[4] ^= state[3];
    ptrWordInOut[5] ^= state[4];
    ptrWordInOut[6] ^= state[5];
    ptrWordInOut[7] ^= state[6];
    ptrWordInOut[8] ^= state[7];
    ptrWordInOut[9] ^= state[8];
    ptrWordInOut[10] ^= state[9];
    ptrWordInOut[11] ^= state[10];

       //Goes to next block
       ptrWordOut += BLOCK_LEN_INT64;
       ptrWordInOut += BLOCK_LEN_INT64;
       ptrWordIn += BLOCK_LEN_INT64;
    }
#endif
}