//! \brief //! Returns an "alpha blend" of x scaled by y/255; //! x := x * (y / 255) //! Reorder: x := (x * y) / 255 //! See: http://www.alfredklomp.com/programming/sse-intrinsics/ //! inline __m128i _mm_scale_epu8(__m128i x, __m128i y) { // Unpack x and y into 16-bit uints: __m128i xlo = _mm_unpacklo_epi8(x, _mm_setzero_si128()); __m128i ylo = _mm_unpacklo_epi8(y, _mm_setzero_si128()); __m128i xhi = _mm_unpackhi_epi8(x, _mm_setzero_si128()); __m128i yhi = _mm_unpackhi_epi8(y, _mm_setzero_si128()); // Multiply x with y, keeping the low 16 bits: xlo = _mm_mullo_epi16(xlo, ylo); xhi = _mm_mullo_epi16(xhi, yhi); // Divide by 255: xlo = _mm_div255_epu16(xlo); xhi = _mm_div255_epu16(xhi); // Repack the 16-bit uints to clamped 8-bit values: return _mm_packus_epi16(xlo, xhi); }
static bool test_mm_div255_epu16 (void) { bool pass = true; // Only works when i < 256*255 = 65280: // (result is 8-bit): puts("_mm_div255_epu16"); for (int i = 0; i < 0xFF00; i++) { uint16_t c = _mm_extract_epi16(_mm_div255_epu16(_mm_set1_epi16(i)), 1); if (c != (i / 255)) { printf("FAIL: div255(%d), got %d, expected %d\n", i, c, (i / 255)); pass = false; } } return pass; }