//! \brief
//! Returns an "alpha blend" of x scaled by y/255;
//! x := x * (y / 255)
//! Reorder: x := (x * y) / 255
//! See: http://www.alfredklomp.com/programming/sse-intrinsics/
//!
inline __m128i
_mm_scale_epu8(__m128i x, __m128i y)
{
    // Unpack x and y into 16-bit uints:
    __m128i xlo = _mm_unpacklo_epi8(x, _mm_setzero_si128());
    __m128i ylo = _mm_unpacklo_epi8(y, _mm_setzero_si128());
    __m128i xhi = _mm_unpackhi_epi8(x, _mm_setzero_si128());
    __m128i yhi = _mm_unpackhi_epi8(y, _mm_setzero_si128());

    // Multiply x with y, keeping the low 16 bits:
    xlo = _mm_mullo_epi16(xlo, ylo);
    xhi = _mm_mullo_epi16(xhi, yhi);

    // Divide by 255:
    xlo = _mm_div255_epu16(xlo);
    xhi = _mm_div255_epu16(xhi);

    // Repack the 16-bit uints to clamped 8-bit values:
    return _mm_packus_epi16(xlo, xhi);
}
static bool
test_mm_div255_epu16 (void)
{
	bool pass = true;

	// Only works when i < 256*255 = 65280:
	// (result is 8-bit):
	puts("_mm_div255_epu16");
	for (int i = 0; i < 0xFF00; i++) {
		uint16_t c = _mm_extract_epi16(_mm_div255_epu16(_mm_set1_epi16(i)), 1);
		if (c != (i / 255)) {
			printf("FAIL: div255(%d), got %d, expected %d\n", i, c, (i / 255));
			pass = false;
		}
	}
	return pass;
}