예제 #1
0
int main() {
	const ssize_t A = 3;
	const size_t Awidth = 2;
	const size_t Dwidth = 4;
	const ssize_t Dmin = (-1) * (1ll << (Dwidth - 1));
	const ssize_t Dmax = (1ll << (Dwidth - 1)) - 1;
	const ssize_t Cwidth = Awidth + Dwidth;
	const ssize_t AInv = ext_euklidean(A, Cwidth) & ((1ll << Cwidth) - 1);
	const size_t numCodewords = (1ull << Cwidth);
	std::cout << "numCodewords: " << numCodewords << std::endl;
	const size_t numMasks = numCodewords / (sizeof(int) * 4); // How many masks will we generate?
	int * pNonCodewordMasks = new int[numMasks];
	const int16_t c = ~((1ll << (Cwidth - 1)) - 1);
	std::cout << "c = 0x" << std::hex << c << std::dec << std::endl;
	for (ssize_t i = 0, cw = c, posMask = 0; i < numCodewords; ++posMask) {
		int tmpMask = 0;
		for (ssize_t k = 0; k < 16; ++k, ++cw, ++i) {
			if ((cw % A) != 0) { // we want the non-codewords
				// std::cout << "cw % A != 0: " << cw << std::endl;
				tmpMask |= (1ll << (k * 2)) | (1ll << (k * 2 + 1)); // expand to 32 bits, because AVX2 cannot movemask across lanes to 16 bits
			}
		}
		pNonCodewordMasks[posMask] = tmpMask;
	}
	std::cout << "numMasks: " << numMasks << std::endl;
	std::cout << "non-codeword-masks: 0x" << std::hex << std::setfill('0');
	for (size_t posMask = 0; posMask < numMasks; ++posMask) {
		std::cout << std::setw(8) << pNonCodewordMasks[posMask] << ':';
	}
	std::cout << std::dec << std::endl << std::setfill(' ');
	auto mmCodewords = _mm256_set_epi16(c+15, c+14, c+13, c+12, c+11, c+10, c+9, c+8, c+7, c+6, c+5, c+4, c+3, c+2, c+1, c);
	auto mmAddUp = _mm256_set1_epi16(16);
	auto mmAinv = _mm256_set1_epi16(AInv);
	auto mmDmin = _mm256_set1_epi16(Dmin);
	auto mmDmax = _mm256_set1_epi16(Dmax);
	const size_t posEnd = (1ull << Cwidth);
	__m256i mmFillUp[] = {_mm256_set1_epi16(0), _mm256_set1_epi16(~((1ll << Cwidth) - 1))}; // fill up all non-codeword bits with 1's if necessary
	std::cout << "posEnd = 0x" << std::hex << posEnd << std::dec << std::endl;
	std::cout << std::setfill('0') << std::hex;
	for(size_t pos = 15, posMask = 0; pos < posEnd; pos += 16, ++posMask) {
		auto isNeg = 0x1 & _mm256_movemask_epi8(_mm256_cmpgt_epi16(mmFillUp[0], mmCodewords));
		auto mm1 = _mm256_or_si256(_mm256_mullo_epi16(mmCodewords, mmAinv), mmFillUp[isNeg]);
		auto mm2 = _mm256_cmpgt_epi16(mm1, mmDmin);
		auto mm3 = _mm256_cmpgt_epi16(mmDmax, mm1);
		auto mm4 = _mm256_cmpeq_epi16(mmDmax, mm1);
		auto mm5 = _mm256_or_si256(mm3, mm4);
		auto mm6 = _mm256_and_si256(mm2, mm5);
		auto mask = _mm256_movemask_epi8(mm6);
		if (mask & pNonCodewordMasks[posMask]) {
			std::cout << "BAD @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask << " & 0x" << pNonCodewordMasks[posMask] << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
		} else {
			std::cout << "OK @0x" << std::setw((Cwidth + 7) / 8) << pos << ": 0x" << mask << " & 0x" << pNonCodewordMasks[posMask] << " = 0x" << (mask & pNonCodewordMasks[posMask]) << std::endl;
		}
		mmCodewords = _mm256_add_epi16(mmCodewords, mmAddUp);
	}
	std::cout << std::setfill(' ') << std::dec;
}
예제 #2
0
size_t __FASTCALL strlen_fast_v2_avx(const char * str)
{
    size_t len;
    register __m256i zero32, src32_low, src32_high;
    register size_t zero_mask_low, zero_mask_high;
    register uint64_t zero_mask;
    unsigned long zero_index;
    register const char * cur = str;
    // Set the zero masks (32 bytes).
    INIT_ZERO_32(zero32);
    zero32 = _mm256_xor_si256(zero32, zero32);
    // Get the misalignment bytes last 6 bits.
    size_t misalignment = (size_t)cur & 0x3F;
    // If the misalignment bytes is < 32 bytes?
    if (misalignment < 0x20) {
        if (misalignment == 0) {
            // If misalignment is 0, skip this step.
            goto main_loop;
        }
        // Align address to 64 bytes for main loop.
        cur = (const char * )((size_t)cur & ((size_t)~(size_t)0x3F));
        // Load 32 bytes from target string to YMM register.
        src32_low  = _mm256_load_si256((__m256i *)(cur));
        src32_high = _mm256_load_si256((__m256i *)(cur + 32));
        // Compare with zero32 masks per byte.
        src32_low  = _mm256_cmpeq_epi8(src32_low,  zero32);
        src32_high = _mm256_cmpeq_epi8(src32_high, zero32);
        // Package the compare result (32 bytes) to 32 bits.
        zero_mask_low  = (size_t)_mm256_movemask_epi8(src32_low);
        zero_mask_high = (size_t)_mm256_movemask_epi8(src32_high);
        // Remove last missalign bits.
        zero_mask_low >>= misalignment;
        zero_mask_low <<= misalignment;
        if (zero_mask_low != 0) {
            // Get the index of the first bit on set to 1.
            __BitScanForward(zero_index, zero_mask_low);
            goto strlen_exit;
        }
        else if (zero_mask_high != 0) {
            // Get the index of the first bit on set to 1.
            __BitScanForward(zero_index, zero_mask_high);
            zero_index += 32;
            goto strlen_exit;
        }
        // Align address to the next 64 bytes for main loop.
        cur += 64;
    }
    else {
예제 #3
0
파일: norx.c 프로젝트: 0x64616E69656C/norx
int norx_aead_decrypt(
    unsigned char *m, size_t *mlen,
    const unsigned char *a, size_t alen,
    const unsigned char *c, size_t clen,
    const unsigned char *z, size_t zlen,
    const unsigned char *nonce,
    const unsigned char *key
    )
{
    const __m256i K = LOADU(key);
    __m256i A, B, C, D;

    if (clen < BYTES(NORX_T)) { return -1; }

    *mlen = clen - BYTES(NORX_T);

    INITIALISE(A, B, C, D, nonce, K);
    ABSORB_DATA(A, B, C, D, a, alen, HEADER_TAG);
    DECRYPT_DATA(A, B, C, D, m, c, clen - BYTES(NORX_T));
    ABSORB_DATA(A, B, C, D, z, zlen, TRAILER_TAG);
    FINALISE(A, B, C, D, K);

    /* Verify tag */
    D = _mm256_cmpeq_epi8(D, LOADU(c + clen - BYTES(NORX_T)));
    return (((_mm256_movemask_epi8(D) & 0xFFFFFFFFULL) + 1) >> 32) - 1;
}
예제 #4
0
/* Shuffle bits within the bytes of eight element blocks. */
int64_t bshuf_shuffle_bit_eightelem_AVX(void* in, void* out, const size_t size,
         const size_t elem_size) {

    CHECK_MULT_EIGHT(size);

    // With a bit of care, this could be written such that such that it is
    // in_buf = out_buf safe.
    char* in_b = (char*) in;
    char* out_b = (char*) out;

    size_t ii, jj, kk;
    size_t nbyte = elem_size * size;

    __m256i ymm;
    int32_t bt;

    if (elem_size % 4) {
        return bshuf_shuffle_bit_eightelem_SSE(in, out, size, elem_size);
    } else {
        for (jj = 0; jj + 31 < 8 * elem_size; jj += 32) {
            for (ii = 0; ii + 8 * elem_size - 1 < nbyte;
                    ii += 8 * elem_size) {
                ymm = _mm256_loadu_si256((__m256i *) &in_b[ii + jj]);
                for (kk = 0; kk < 8; kk++) {
                    bt = _mm256_movemask_epi8(ymm);
                    ymm = _mm256_slli_epi16(ymm, 1);
                    size_t ind = (ii + jj / 8 + (7 - kk) * elem_size);
                    * (int32_t *) &out_b[ind] = bt;
                }
            }
        }
    }
    return size * elem_size;
}
예제 #5
0
/* Transpose bits within bytes. */
int64_t bshuf_trans_bit_byte_AVX(void* in, void* out, const size_t size,
         const size_t elem_size) {

    size_t ii, kk;
    char* in_b = (char*) in;
    char* out_b = (char*) out;
    int32_t* out_i32;

    size_t nbyte = elem_size * size;

    int64_t count;

    __m256i ymm;
    int32_t bt;

    for (ii = 0; ii + 31 < nbyte; ii += 32) {
        ymm = _mm256_loadu_si256((__m256i *) &in_b[ii]);
        for (kk = 0; kk < 8; kk++) {
            bt = _mm256_movemask_epi8(ymm);
            ymm = _mm256_slli_epi16(ymm, 1);
            out_i32 = (int32_t*) &out_b[((7 - kk) * nbyte + ii) / 8];
            *out_i32 = bt;
        }
    }
    count = bshuf_trans_bit_byte_remainder(in, out, size, elem_size,
            nbyte - nbyte % 32);
    return count;
}
int main(void)
{
    for (int a = 0; a < 1000; a++)
    {
        for (int b = 0; b < 1000; b++)
        {
            uint32_t lhs_ab = 1000 * 1000 * a + 1000 * b;
            m256u_t lhs_ab_v = {.u = {lhs_ab, lhs_ab, lhs_ab, lhs_ab, lhs_ab, lhs_ab, lhs_ab, lhs_ab}};
            uint32_t rhs_ab = a * a * a + b * b * b;
            m256u_t rhs_ab_v = {.u = {rhs_ab, rhs_ab, rhs_ab, rhs_ab, rhs_ab, rhs_ab, rhs_ab, rhs_ab}};
            m256u_t c_v = {.u = {0, 1, 2, 3, 4, 5, 6, 7}};
            m256u_t c_inc_v = {.u = {8, 8, 8, 8, 8, 8, 8, 8}};
            m256u_t lhs_v, rhs_v, cmp_v;
            for (int c = 0; c < 1000; c += 8)
            {
                lhs_v.m = _mm256_add_epi32(lhs_ab_v.m, c_v.m);
                rhs_v.m = _mm256_mullo_epi32(c_v.m, c_v.m);
                rhs_v.m = _mm256_mullo_epi32(rhs_v.m, c_v.m);
                rhs_v.m = _mm256_add_epi32(rhs_v.m, rhs_ab_v.m);
                cmp_v.m = _mm256_cmpeq_epi32(lhs_v.m, rhs_v.m);
                if (_mm256_movemask_epi8(cmp_v.m))
                {
                    for (int i = 0; i < 8; i++)
                        if (cmp_v.u[i] != 0)
                            printf("%09u\n", lhs_v.u[i]);
                }
                c_v.m = _mm256_add_epi32(c_v.m, c_inc_v.m);
            }
        }
    }
    return 0;
}
예제 #7
0
void* xmemchr(const void* src, int c, size_t n)
{

	if (n < 32) {
		return xmemchr_tiny(src, c, n);
	}


	__m256i ymm0 = _mm256_set1_epi8((char)c), ymm1;
	int mask;

	size_t rem = n % 32;
	n /= 32;

	for (size_t i = 0; i < n; i++) {
		ymm1 = _mm256_loadu_si256((const __m256i*)src + i);
		ymm1 = _mm256_cmpeq_epi8(ymm1, ymm0);
		mask = _mm256_movemask_epi8(ymm1);
		if (mask) {
			__asm__("bsfl %0, %0\n\t"
			        :"=r"(mask)
			        :"0"(mask)
			       );
			return (void*)((unsigned long)((const __m256i*)src + i) + mask);
		}
	}

	return xmemchr_tiny((const void*)((unsigned long)src + n), c, rem);
}
예제 #8
0
inline void matrix32x8::transpose(square128& output, int x, int y)
{
    for (int j = 0; j < 8; j++)
    {
        int row = _mm256_movemask_epi8(whole);
        whole = _mm256_slli_epi64(whole, 1);

        // _mm_movemask_epi8 uses most significant bit, hence +7-j
        output.words[8*x+7-j][y] = row;
    }
}
예제 #9
0
파일: avx2.cpp 프로젝트: WojciechMula/toys
void bitmask_avx2(uint32_t* ptr, size_t n, uint32_t key, uint8_t* out) {

    uint32_t* output = (uint32_t*)out;

    const size_t N = 8*4; // unrolled 4 times
    const size_t chunks = n / N;
    const size_t tail   = n % N;

    const __m256i vkey = _mm256_set1_epi32(key);
    
    for (size_t i=0; i < chunks; i++) {
        
        const __m256i in0 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 0*8));
        const __m256i in1 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 1*8));
        const __m256i in2 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 2*8));
        const __m256i in3 = _mm256_loadu_si256((const __m256i*)(ptr + i*N + 3*8));

        const __m256i eq0 = _mm256_cmpeq_epi32(in0, vkey);
        const __m256i eq1 = _mm256_cmpeq_epi32(in1, vkey);
        const __m256i eq2 = _mm256_cmpeq_epi32(in2, vkey);
        const __m256i eq3 = _mm256_cmpeq_epi32(in3, vkey);

        // eq0 = [a0 a1 a2 a3 a4 a5 a6 a7] (packed dword)
        // eq1 = [b0 b1 b2 b3 b4 b5 b6 b7] (packed dword)
        // eq2 = [c0 c1 c2 c3 c4 c5 c6 c7] (packed dword)
        // eq3 = [d0 d1 d2 d3 d4 d5 d6 d7] (packed dword)

        //  t0 = [a0 a1 a2 a3 c0 c1 c2 c3 a4 a5 a6 a7 c4 c5 c6 c7] (packed word)
        const __m256i t0  = _mm256_packs_epi32(eq0, eq2);
        // m02 = [a0 a1 a2 a3 a4 a5 a6 a7 c0 c1 c2 c3 c4 c5 c6 c7] (packed word)
        const __m256i m02 = _mm256_permutevar8x32_epi32(t0,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        //  t0 = [b0 b1 b2 b3 d0 d1 d2 d3 b4 b5 b6 b7 d4 d5 d6 d7] (packed word)
        const __m256i t1 = _mm256_packs_epi32(eq1, eq3);
        // m13 = [b0 b1 b2 b3 b4 b5 b6 b7 d0 d1 d2 d3 d4 d5 d6 d7] (packed word)
        const __m256i m13 = _mm256_permutevar8x32_epi32(t1,
                                _mm256_setr_epi32(0, 1, 4, 5, 2, 3, 6, 7));

        // m   = [a0..7 b0..7 c0..7 d0..7] (packed byte)
        const __m256i m   = _mm256_packs_epi16(m02, m13);

        *output++ = _mm256_movemask_epi8(m);
    }

    if (tail > 0) {
        bitmask_better_2(ptr + chunks*N, tail, key, out + chunks*N);
    }
}
예제 #10
0
size_t xstrlen(const char* src)
{
    __m256i m0 = _mm256_setzero_si256();
    __m256i m1 ;
    int mask;
    for (size_t count = 0;; count += 32){
        m1 = _mm256_loadu_si256((const __m256i*)((unsigned long)src + count));
        m1 = _mm256_cmpeq_epi8(m1, m0);
        mask = _mm256_movemask_epi8(m1);
        if (mask != 0) {
            __asm__("bsfl %0, %0\n\t"
                    :"=r"(mask)
                    :"r"(mask)
                   );
            return count + (size_t)mask;
        }
    }
/*
 * Do or undo the 'E8' preprocessing used in LZX.  Before compression, the
 * uncompressed data is preprocessed by changing the targets of x86 CALL
 * instructions from relative offsets to absolute offsets.  After decompression,
 * the translation is undone by changing the targets of x86 CALL instructions
 * from absolute offsets to relative offsets.
 *
 * Note that despite its intent, E8 preprocessing can be done on any data even
 * if it is not actually x86 machine code.  In fact, E8 preprocessing appears to
 * always be used in LZX-compressed resources in WIM files; there is no bit to
 * indicate whether it is used or not, unlike in the LZX compressed format as
 * used in cabinet files, where a bit is reserved for that purpose.
 *
 * E8 preprocessing is disabled in the last 6 bytes of the uncompressed data,
 * which really means the 5-byte call instruction cannot start in the last 10
 * bytes of the uncompressed data.  This is one of the errors in the LZX
 * documentation.
 *
 * E8 preprocessing does not appear to be disabled after the 32768th chunk of a
 * WIM resource, which apparently is another difference from the LZX compression
 * used in cabinet files.
 *
 * E8 processing is supposed to take the file size as a parameter, as it is used
 * in calculating the translated jump targets.  But in WIM files, this file size
 * is always the same (LZX_WIM_MAGIC_FILESIZE == 12000000).
 */
static void
lzx_e8_filter(u8 *data, u32 size, void (*process_target)(void *, s32))
{

#if !defined(__SSE2__) && !defined(__AVX2__)
	/*
	 * A worthwhile optimization is to push the end-of-buffer check into the
	 * relatively rare E8 case.  This is possible if we replace the last six
	 * bytes of data with E8 bytes; then we are guaranteed to hit an E8 byte
	 * before reaching end-of-buffer.  In addition, this scheme guarantees
	 * that no translation can begin following an E8 byte in the last 10
	 * bytes because a 4-byte offset containing E8 as its high byte is a
	 * large negative number that is not valid for translation.  That is
	 * exactly what we need.
	 */
	u8 *tail;
	u8 saved_bytes[6];
	u8 *p;

	if (size <= 10)
		return;

	tail = &data[size - 6];
	memcpy(saved_bytes, tail, 6);
	memset(tail, 0xE8, 6);
	p = data;
	for (;;) {
		while (*p != 0xE8)
			p++;
		if (p >= tail)
			break;
		(*process_target)(p + 1, p - data);
		p += 5;
	}
	memcpy(tail, saved_bytes, 6);
#else
	/* SSE2 or AVX-2 optimized version for x86_64  */

	u8 *p = data;
	u64 valid_mask = ~0;

	if (size <= 10)
		return;
#ifdef __AVX2__
#  define ALIGNMENT_REQUIRED 32
#else
#  define ALIGNMENT_REQUIRED 16
#endif

	/* Process one byte at a time until the pointer is properly aligned.  */
	while ((uintptr_t)p % ALIGNMENT_REQUIRED != 0) {
		if (p >= data + size - 10)
			return;
		if (*p == 0xE8 && (valid_mask & 1)) {
			(*process_target)(p + 1, p - data);
			valid_mask &= ~0x1F;
		}
		p++;
		valid_mask >>= 1;
		valid_mask |= (u64)1 << 63;
	}

	if (data + size - p >= 64) {

		/* Vectorized processing  */

		/* Note: we use a "trap" E8 byte to eliminate the need to check
		 * for end-of-buffer in the inner loop.  This byte is carefully
		 * positioned so that it will never be changed by a previous
		 * translation before it is detected.  */

		u8 *trap = p + ((data + size - p) & ~31) - 32 + 4;
		u8 saved_byte = *trap;
		*trap = 0xE8;

		for (;;) {
			u32 e8_mask;
			u8 *orig_p = p;
		#ifdef __AVX2__
			const __m256i e8_bytes = _mm256_set1_epi8(0xE8);
			for (;;) {
				__m256i bytes = *(const __m256i *)p;
				__m256i cmpresult = _mm256_cmpeq_epi8(bytes, e8_bytes);
				e8_mask = _mm256_movemask_epi8(cmpresult);
				if (e8_mask)
					break;
				p += 32;
			}
		#else
			const __m128i e8_bytes = _mm_set1_epi8(0xE8);
			for (;;) {
				/* Read the next 32 bytes of data and test them
				 * for E8 bytes.  */
				__m128i bytes1 = *(const __m128i *)p;
				__m128i bytes2 = *(const __m128i *)(p + 16);
				__m128i cmpresult1 = _mm_cmpeq_epi8(bytes1, e8_bytes);
				__m128i cmpresult2 = _mm_cmpeq_epi8(bytes2, e8_bytes);
				u32 mask1 = _mm_movemask_epi8(cmpresult1);
				u32 mask2 = _mm_movemask_epi8(cmpresult2);
				/* The masks have a bit set for each E8 byte.
				 * We stay in this fast inner loop as long as
				 * there are no E8 bytes.  */
				if (mask1 | mask2) {
					e8_mask = mask1 | (mask2 << 16);
					break;
				}
				p += 32;
			}
		#endif

			/* Did we pass over data with no E8 bytes?  */
			if (p != orig_p)
				valid_mask = ~0;

			/* Are we nearing end-of-buffer?  */
			if (p == trap - 4)
				break;

			/* Process the E8 bytes.  However, the AND with
			 * 'valid_mask' ensures we never process an E8 byte that
			 * was itself part of a translation target.  */
			while ((e8_mask &= valid_mask)) {
				unsigned bit = bsf32(e8_mask);
				(*process_target)(p + bit + 1, p + bit - data);
				valid_mask &= ~((u64)0x1F << bit);
			}

			valid_mask >>= 32;
			valid_mask |= 0xFFFFFFFF00000000;
			p += 32;
		}

		*trap = saved_byte;
	}
예제 #12
0
void vec_i8_replace(int8_t *p, size_t n, int8_t val, int8_t substitute)
{
#ifdef COREARRAY_SIMD_SSE2

	// header 1, 16-byte aligned
	size_t h = (16 - ((size_t)p & 0x0F)) & 0x0F;
	for (; (n > 0) && (h > 0); n--, h--, p++)
		if (*p == val) *p = substitute;

	// body, SSE2
	const __m128i mask = _mm_set1_epi8(val);
	const __m128i sub  = _mm_set1_epi8(substitute);

#   ifdef COREARRAY_SIMD_AVX2

	// header 2, 32-byte aligned
	if ((n >= 16) && ((size_t)p & 0x10))
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
		{
			_mm_store_si128((__m128i *)p,
				_mm_or_si128(_mm_and_si128(c, sub), _mm_andnot_si128(c, v)));
		}
		n -= 16; p += 16;
	}

	const __m256i mask2 = _mm256_set1_epi8(val);
	const __m256i sub32 = _mm256_set1_epi8(substitute);
	const __m256i zero = _mm256_setzero_si256();
	const __m256i ones = _mm256_cmpeq_epi64(zero, zero);

	for (; n >= 32; n-=32, p+=32)
	{
		__m256i v = _mm256_load_si256((__m256i const*)p);
		__m256i c = _mm256_cmpeq_epi8(v, mask2);
		if (_mm256_movemask_epi8(c))
		{
			// TODO
			_mm256_store_si256((__m256i *)p,
				_mm256_or_si256(_mm256_and_si256(c, sub32),
				_mm256_andnot_si256(c, v)));
		}
	}

#   endif

	for (; n >= 16; n-=16, p+=16)
	{
		__m128i v = _mm_load_si128((__m128i const*)p);
		__m128i c = _mm_cmpeq_epi8(v, mask);
		if (_mm_movemask_epi8(c))
			_mm_maskmoveu_si128(sub, c, (char*)p);
	}

#endif

	// tail
	for (; n > 0; n--, p++)
		if (*p == val) *p = substitute;
}
예제 #13
0
/* Function:  p7_MSVFilter()
 * Synopsis:  Calculates MSV score, vewy vewy fast, in limited precision.
 *
 * Purpose:   Calculates an approximation of the MSV score for sequence
 *            <dsq> of length <L> residues, using optimized profile <om>,
 *            and the one-row DP matrix <ox>. Return the 
 *            estimated MSV score (in nats) in <ret_sc>.
 *            
 *            Score may overflow (and will, on high-scoring
 *            sequences), but will not underflow.
 *            
 *            <ox> will be resized if needed. It's fine if it was
 *            just <_Reuse()'d> from a previous, smaller profile.
 *            
 *            The model may be in any mode, because only its match
 *            emission scores will be used. The MSV filter inherently
 *            assumes a multihit local mode, and uses its own special
 *            state transition scores, not the scores in the profile.
 *
 * Args:      dsq     - digital target sequence, 1..L
 *            L       - length of dsq in residues          
 *            om      - optimized profile
 *            ox      - filter DP matrix (one row)
 *            ret_sc  - RETURN: MSV score (in nats)          
 *                      
 * Returns:   <eslOK> on success.
 *            <eslERANGE> if the score overflows the limited range; in
 *            this case, this is a high-scoring hit.
 *            <ox> may have been resized.
 *
 * Throws:    <eslEMEML> if <ox> reallocation fails.
 */
int
p7_MSVFilter_avx(const ESL_DSQ *dsq, int L, const P7_OPROFILE *om, P7_FILTERMX *ox, float *ret_sc)
{
#ifdef HAVE_AVX2 
  uint8_t  xJ;                     /* special states' scores                  */
  register __m256i mpv_AVX;            /* previous row values                                       */
  register __m256i xEv_AVX;      /* E state: keeps max for Mk->E as we go                     */
  register __m256i xBv_AVX;      /* B state: splatted vector of B[i-1] for B->Mk calculations */
  register __m256i sv_AVX;       /* temp storage of 1 curr row value in progress              */
  register __m256i biasv_AVX;    /* emission bias in a vector                                 */
  __m256i *dp_AVX;               /* the dp row memory                                         */
  __m256i *rsc_AVX;        /* will point at om->rbv[x] for residue x[i]                 */
  __m256i xJv_AVX;                     /* vector for states score                                   */
  __m256i tjbmv_AVX;                   /* vector for cost of moving {JN}->B->M                      */
  __m256i tecv_AVX;                    /* vector for E->C  cost                                     */
  __m256i basev_AVX;                   /* offset for scores                                         */
  __m256i ceilingv_AVX;                /* saturated simd value used to test for overflow            */
  __m256i tempv_AVX;                   /* work vector                                               */
  int Q_AVX        = P7_NVB_AVX(om->M);    /* segment length: # of vectors                              */
  int q_AVX;         /* counter over vectors 0..nq-1                              */

  int i;         /* counter over sequence positions 1..L                      */
  int     cmp;
  int     status;
//printf("Starting MSVFilter\n");
  /* Contract checks */
  ESL_DASSERT1(( om->mode == p7_LOCAL )); /* Production code assumes multilocal mode w/ length model <L> */
  ESL_DASSERT1(( om->L    == L ));	  /*  ... and it's easy to forget to set <om> that way           */
  ESL_DASSERT1(( om->nj   == 1.0f ));	  /*  ... hence the check                                        */
                                          /*  ... which you can disable, if you're playing w/ config     */
  /* note however that it makes no sense to run MSV w/ a model in glocal mode                            */

  /* Try highly optimized Knudsen SSV filter first. 
   * Note that SSV doesn't use any main memory (from <ox>) at all! 
  */
 //extern uint64_t SSV_time;
 uint64_t filter_start_time = __rdtsc();
   status = p7_SSVFilter_avx(dsq, L, om, ret_sc);
    uint64_t filter_end_time = __rdtsc();
  //SSV_time += (filter_end_time - filter_start_time);   
   if (status != eslENORESULT) return status;
   extern uint64_t full_MSV_calls;
  
  full_MSV_calls++;
 
  /* Resize the filter mx as needed */
  if (( status = p7_filtermx_GrowTo(ox, om->M))    != eslOK) ESL_EXCEPTION(status, "Reallocation of MSV filter matrix failed");

  dp_AVX = ox->dp_AVX;  /* ditto this */

  /* Matrix type and size must be set early, not late: debugging dump functions need this information. */
  ox->M    = om->M;
  ox->type = p7F_MSVFILTER;

  /* Initialization. In offset unsigned arithmetic, -infinity is 0, and 0 is om->base.  */

  biasv_AVX = _mm256_set1_epi8((int8_t) om->bias_b); /* yes, you can set1() an unsigned char vector this way */

  for (q_AVX = 0; q_AVX < Q_AVX; q_AVX++) dp_AVX[q_AVX] = _mm256_setzero_si256(); 
  /* saturate simd register for overflow test */

  ceilingv_AVX = _mm256_cmpeq_epi8(biasv_AVX, biasv_AVX);
  basev_AVX    = _mm256_set1_epi8((int8_t) om->base_b);
  tjbmv_AVX    = _mm256_set1_epi8((int8_t) om->tjb_b + (int8_t) om->tbm_b);
  tecv_AVX     = _mm256_set1_epi8((int8_t) om->tec_b);
  xJv_AVX      = _mm256_subs_epu8(biasv_AVX, biasv_AVX);
  xBv_AVX      = _mm256_subs_epu8(basev_AVX, tjbmv_AVX);

#ifdef p7_DEBUGGING
  if (ox->do_dumping)
    {
      uint8_t xB;
      xB = _mm_extract_epi16(xBv, 0);
      xJ = _mm_extract_epi16(xJv, 0);
      p7_filtermx_DumpMFRow(ox, 0, 0, 0, xJ, xB, xJ);
    }
#endif


  for (i = 1; i <= L; i++)  /* Outer loop over residues*/
    {

      rsc_AVX = om->rbv_AVX[dsq[i]];
      xEv_AVX = _mm256_setzero_si256();      

      /* Right shifts by 1 byte. 4,8,12,x becomes x,4,8,12. 
       * Because ia32 is littlendian, this means a left bit shift.
       * Zeros shift on automatically, which is our -infinity.
       */
       __m256i dp_temp_AVX = dp_AVX[Q_AVX -1];
      mpv_AVX = esl_avx_leftshift_one(dp_temp_AVX);
      
      for (q_AVX = 0; q_AVX < Q_AVX; q_AVX++)
      {
        /* Calculate new MMXo(i,q); don't store it yet, hold it in sv. */
        sv_AVX   = _mm256_max_epu8(mpv_AVX, xBv_AVX);
        sv_AVX   = _mm256_adds_epu8(sv_AVX, biasv_AVX);
        sv_AVX   = _mm256_subs_epu8(sv_AVX, *rsc_AVX);   rsc_AVX++;
        xEv_AVX  = _mm256_max_epu8(xEv_AVX, sv_AVX);

        mpv_AVX   = dp_AVX[q_AVX];      /* Load {MDI}(i-1,q) into mpv */
        dp_AVX[q_AVX] = sv_AVX;           /* Do delayed store of M(i,q) now that memory is usable */
      }

      /* test for the overflow condition */
      tempv_AVX = _mm256_adds_epu8(xEv_AVX, biasv_AVX);
      tempv_AVX = _mm256_cmpeq_epi8(tempv_AVX, ceilingv_AVX);
      cmp   = _mm256_movemask_epi8(tempv_AVX);

      /* Now the "special" states, which start from Mk->E (->C, ->J->B)
       * Use shuffles instead of shifts so when the last max has completed,
       * the last four elements of the simd register will contain the
       * max value.  Then the last shuffle will broadcast the max value
       * to all simd elements.
       */

      xEv_AVX = _mm256_set1_epi8(esl_avx_hmax_epu8(xEv_AVX));  // broadcast the max byte from original xEv_AVX
      // to all bytes of xEv_AVX

      /* immediately detect overflow */
      if (cmp != 0x0000) {
   //            MSV_end_time = __rdtsc();
   //     MSV_time += (MSV_end_time - MSV_start_time);
        *ret_sc = eslINFINITY; return eslERANGE; }

      xEv_AVX = _mm256_subs_epu8(xEv_AVX, tecv_AVX);
      xJv_AVX = _mm256_max_epu8(xJv_AVX,xEv_AVX);
      xBv_AVX = _mm256_max_epu8(basev_AVX, xJv_AVX);
      xBv_AVX = _mm256_subs_epu8(xBv_AVX, tjbmv_AVX);

#ifdef p7_DEBUGGING
      if (ox->do_dumping)
	{
	  uint8_t xB, xE;
	  xB = _mm_extract_epi16(xBv, 0);
	  xE = _mm_extract_epi16(xEv, 0);
	  xJ = _mm_extract_epi16(xJv, 0);
	  p7_filtermx_DumpMFRow(ox, i, xE, 0, xJ, xB, xJ);
	}
#endif
    } /* end loop over sequence residues 1..L */

  /* finally C->T, and add our missing precision on the NN,CC,JJ back */
  xJ =  _mm256_extract_epi8(xJv_AVX, 0);

  *ret_sc = ((float) (xJ - om->tjb_b) - (float) om->base_b);
  *ret_sc /= om->scale_b;
  *ret_sc -= 3.0; /* that's ~ L \log \frac{L}{L+3}, for our NN,CC,JJ */
    /*     MSV_end_time = __rdtsc();
        MSV_time += (MSV_end_time - MSV_start_time); */

  return eslOK;
  #endif
  #ifndef HAVE_AVX2
  return eslENORESULT;  // Stub so we have something to link if we build without AVX2 support
  #endif
  }
예제 #14
0
size_t __FASTCALL strlen_fast_v1b_avx(const char * str)
{
    size_t len;
    register __m256i zero32, src32_low, src32_high;
    register size_t zero_mask_low, zero_mask_high;
    register uint64_t zero_mask;
    unsigned long zero_index;
    register const char * cur = str;
    // Get the misalignment bytes last 6 bits.
    size_t misalignment = (size_t)str & 0x3F;
    if (misalignment != 0) {
        misalignment = (size_t)str & 0x1F;
        // Scan the null terminator in first missalign bytes.
        register const char * end = cur + ((size_t)16UL - misalignment);
        while (cur < end) {
            // Find out the null terminator.
            if (*cur == '\0') {
                return (size_t)(cur - str);
            }
            cur++;
        }

        // Align address to 64 bytes for main loop.
        end = (const char *)((size_t)str & ((size_t)~(size_t)0x3F)) + 64;

        register __m128i zero16, src16;
        register uint32_t zero_mask16;

        // Set the zero masks (16 bytes).
        INIT_ZERO_16(zero16);
        zero16 = _mm_xor_si128(zero16, zero16);

        // Minor 16 bytes loop
        while (cur < end) {
            // Load the src 16 bytes to XMM register
            src16  = _mm_load_si128((__m128i *)(cur));
            // Compare with zero16 masks per byte.
            src16  = _mm_cmpeq_epi8(src16,  zero16);
            // Package the compare result (16 bytes) to 16 bits.
            zero_mask16 = (uint32_t)_mm_movemask_epi8(src16);

            // If it have any one bit is 1, mean it have a null terminator
            // inside this scaned strings (per 16 bytes).
            if (zero_mask16 != 0) {
                // Get the index of the first bit on set to 1.
                __BitScanForward(zero_index, zero_mask16);
                goto strlen_exit;
            }
            // One minor loop scan 16 bytes.
            cur += 16;
        }
    }
    // Set the zero masks (32 bytes).
    INIT_ZERO_32(zero32);
    zero32 = _mm256_xor_si256(zero32, zero32);

    // Main loop
    do {
        // Load the src 32 bytes to XMM register
        src32_low  = _mm256_load_si256((__m256i *)(cur));
        src32_high = _mm256_load_si256((__m256i *)(cur + 32));
        // Compare with zero32 masks per byte.
        src32_low  = _mm256_cmpeq_epi8(src32_low,  zero32);
        src32_high = _mm256_cmpeq_epi8(src32_high, zero32);
        // Package the compare result (32 bytes) to 16 bits.
        zero_mask_low  = (size_t)_mm256_movemask_epi8(src32_low);
        zero_mask_high = (size_t)_mm256_movemask_epi8(src32_high);
#if defined(_WIN64) || defined(WIN64) || defined(_M_X64) || defined(_M_AMD64) \
 || defined(_M_IA64) || defined(__amd64__) || defined(__x86_64__)
        // Combin the mask of the low 32 bits and high 32 bits.
        zero_mask = (zero_mask_high << 32) | zero_mask_low;

        // If it have any one bit is 1, mean it have a null terminator
        // inside this scaned strings (per 64 bytes).
        if (zero_mask != 0) {
            // Get the index of the first bit on set to 1.
            __BitScanForward64(zero_index, zero_mask);
            break;
        }
#else
        (void)zero_mask;
        // If it have any one bit is 1, mean it have a null terminator
        // inside this scaned strings (per 64 bytes).
        if (zero_mask_low != 0) {
            // Get the index of the first bit on set to 1.
            __BitScanForward(zero_index, zero_mask_low);
            break;
        }
        else if (zero_mask_high != 0) {
            // Get the index of the first bit on set to 1.
            __BitScanForward(zero_index, zero_mask_high);
            zero_index += 32;
            break;
        }
#endif // _M_X64 || __x86_64__
        // One loop scan 64 bytes.
        cur += 64;
    } while (1);

strlen_exit:
    len = cur - str;
    len += zero_index;
    return len;
}
예제 #15
0
int test_mm256_movemask_epi8(__m256i a) {
  // CHECK-LABEL: test_mm256_movemask_epi8
  // CHECK: call i32 @llvm.x86.avx2.pmovmskb(<32 x i8> %{{.*}})
  return _mm256_movemask_epi8(a);
}
예제 #16
0
int test_mm256_movemask_epi8(__m256i a) {
  // CHECK: @llvm.x86.avx2.pmovmskb
  return _mm256_movemask_epi8(a);
}
예제 #17
0
            zero_index += 32;
            goto strlen_exit;
        }
        // Align address to the next 64 bytes for main loop.
        cur += 64;
    }
    else {
        // Align address to 64 bytes, and offset 32 bytes for misalignment.
        cur = (const char * )((size_t)cur & ((size_t)~(size_t)0x3F));

        // Load the src 32 bytes to XMM register
        src32_high = _mm256_load_si256((__m256i *)(cur + 32));
        // Compare with zero32 masks per byte.
        src32_high = _mm256_cmpeq_epi8(src32_high, zero32);
        // Package the compare result (32 bytes) to 32 bits.
        zero_mask_high = (size_t)_mm256_movemask_epi8(src32_high);
        // Skip 32 bytes.
        misalignment -= 32;
        // Remove last misalignment bits.
        zero_mask_high >>= misalignment;
        zero_mask_high <<= misalignment;

        // If it have any one bit is 1, mean it have a null terminator
        // inside this scaned strings (per 64 bytes).
        if (zero_mask_high != 0) {
            // Get the index of the first bit on set to 1.
            __BitScanForward(zero_index, zero_mask_high);
            zero_index += 32;
            goto strlen_exit;
        }
        // Align address to the next 64 bytes for main loop.