/* and : 5 or : 1 add : 2 shift : 0 mul : 4 */ void blend_sse_2() { const uint16_t alpha0 = alpha; const uint16_t alpha1 = 255 - alpha; const int n = width * height * 4; const __m128i v_alpha0 = _mm_set1_epi16(alpha0 << 8); const __m128i v_alpha1 = _mm_set1_epi16(alpha1 << 8); const __m128i mask00ff = _mm_set1_epi16((int16_t)0x00ff); const __m128i maskff00 = _mm_set1_epi16((int16_t)0xff00); for (size_t i=0; i < n; i += 16) { __m128i A = _mm_load_si128((__m128i*)(imgA + i)); __m128i B = _mm_load_si128((__m128i*)(imgB + i)); __m128i A0 = _mm_and_si128(A, mask00ff); __m128i B0 = _mm_and_si128(B, mask00ff); __m128i A1 = _mm_and_si128(A, maskff00); __m128i B1 = _mm_and_si128(B, maskff00); A0 = _mm_mulhi_epu16(A0, v_alpha0); B0 = _mm_mulhi_epu16(B0, v_alpha1); A1 = _mm_mulhi_epu16(A1, v_alpha0); B1 = _mm_mulhi_epu16(B1, v_alpha1); __m128i R0 = _mm_add_epi16(A0, B0); __m128i R1 = _mm_and_si128(_mm_add_epi16(A1, B1), maskff00); _mm_store_si128((__m128i*)(data + i), _mm_or_si128(R0, R1)); } }
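/*
 * Side note (sketch added for reference, not part of the snippet above, which
 * relies on file-scope imgA/imgB/data/alpha/width/height): per byte the loop
 * computes roughly (a*alpha + b*(255 - alpha)) >> 8. A scalar equivalent:
 */
#include <stdint.h>

static uint8_t blend_byte_ref(uint8_t a, uint8_t b, uint8_t alpha) {
  /* >> 8 divides by 256 rather than 255, so a full-alpha blend of 255
     comes out as 254, matching the SSE2 code's behaviour as written */
  return (uint8_t)((a * alpha + b * (255 - alpha)) >> 8);
}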
inline __m128i Convert8DigitsSSE2(uint32_t value) { assert(value <= 99999999); // abcd, efgh = abcdefgh divmod 10000 const __m128i abcdefgh = _mm_cvtsi32_si128(value); const __m128i abcd = _mm_srli_epi64(_mm_mul_epu32(abcdefgh, reinterpret_cast<const __m128i*>(kDiv10000Vector)[0]), 45); const __m128i efgh = _mm_sub_epi32(abcdefgh, _mm_mul_epu32(abcd, reinterpret_cast<const __m128i*>(k10000Vector)[0])); // v1 = [ abcd, efgh, 0, 0, 0, 0, 0, 0 ] const __m128i v1 = _mm_unpacklo_epi16(abcd, efgh); // v1a = v1 * 4 = [ abcd * 4, efgh * 4, 0, 0, 0, 0, 0, 0 ] const __m128i v1a = _mm_slli_epi64(v1, 2); // v2 = [ abcd * 4, abcd * 4, abcd * 4, abcd * 4, efgh * 4, efgh * 4, efgh * 4, efgh * 4 ] const __m128i v2a = _mm_unpacklo_epi16(v1a, v1a); const __m128i v2 = _mm_unpacklo_epi32(v2a, v2a); // v4 = v2 div 10^3, 10^2, 10^1, 10^0 = [ a, ab, abc, abcd, e, ef, efg, efgh ] const __m128i v3 = _mm_mulhi_epu16(v2, reinterpret_cast<const __m128i*>(kDivPowersVector)[0]); const __m128i v4 = _mm_mulhi_epu16(v3, reinterpret_cast<const __m128i*>(kShiftPowersVector)[0]); // v5 = v4 * 10 = [ a0, ab0, abc0, abcd0, e0, ef0, efg0, efgh0 ] const __m128i v5 = _mm_mullo_epi16(v4, reinterpret_cast<const __m128i*>(k10Vector)[0]); // v6 = v5 << 16 = [ 0, a0, ab0, abc0, 0, e0, ef0, efg0 ] const __m128i v6 = _mm_slli_epi64(v5, 16); // v7 = v4 - v6 = { a, b, c, d, e, f, g, h } const __m128i v7 = _mm_sub_epi16(v4, v6); return v7; }
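/*
 * Scalar reference for Convert8DigitsSSE2 above (sketch; the kDiv10000Vector,
 * k10000Vector, kDivPowersVector, kShiftPowersVector and k10Vector tables are
 * defined elsewhere in that project and are not reproduced here). The SIMD
 * routine spreads the eight decimal digits of 'value' over the eight uint16
 * lanes, most significant digit in lane 0.
 */
#include <stdint.h>
#include <assert.h>

static void Convert8Digits_scalar_ref(uint32_t value, uint16_t digits[8]) {
  assert(value <= 99999999);
  for (int i = 7; i >= 0; --i) {
    digits[i] = (uint16_t)(value % 10);  /* fill from the least significant end */
    value /= 10;
  }
}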
static void average_16(__m128i *sum_0_u16, __m128i *sum_1_u16, const __m128i mul_constants_0, const __m128i mul_constants_1, const int strength, const int rounding, const int weight) { const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); const __m128i rounding_u16 = _mm_set1_epi16(rounding); const __m128i weight_u16 = _mm_set1_epi16(weight); const __m128i sixteen = _mm_set1_epi16(16); __m128i input_0, input_1; input_0 = _mm_mulhi_epu16(*sum_0_u16, mul_constants_0); input_0 = _mm_adds_epu16(input_0, rounding_u16); input_1 = _mm_mulhi_epu16(*sum_1_u16, mul_constants_1); input_1 = _mm_adds_epu16(input_1, rounding_u16); input_0 = _mm_srl_epi16(input_0, strength_u128); input_1 = _mm_srl_epi16(input_1, strength_u128); input_0 = _mm_min_epu16(input_0, sixteen); input_1 = _mm_min_epu16(input_1, sixteen); input_0 = _mm_sub_epi16(sixteen, input_0); input_1 = _mm_sub_epi16(sixteen, input_1); *sum_0_u16 = _mm_mullo_epi16(input_0, weight_u16); *sum_1_u16 = _mm_mullo_epi16(input_1, weight_u16); }
void unpack_rgb5a1_sse2(const Uint8* source, const Uint32 size, Uint8* dest) { __m128i t0, t1, t2; Uint32 i; for (i = 0; i < (size / 8); i++) { t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]); t0 = _mm_unpacklo_epi16(t0, t0); t1 = _mm_unpacklo_epi16(t0, t0); t1 = _mm_and_si128(t1, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00)); t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002)); t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260)); t1 = _mm_mulhi_epu16(t1, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5)); t2 = _mm_unpackhi_epi16(t0, t0); t2 = _mm_and_si128(t2, _mm_set_epi16(0x8000, 0x001F, 0x03E0, 0x7C00, 0x8000, 0x001F, 0x03E0, 0x7C00)); t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x0800, 0x0040, 0x0002, 0x0001, 0x0800, 0x0040, 0x0002)); t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0x0200, 0x0260, 0x0260, 0x0260, 0x0200, 0x0260, 0x0260, 0x0260)); t2 = _mm_mulhi_epu16(t2, _mm_set_epi16(0xFF00, 0x6ED5, 0x6ED5, 0x6ED5, 0xFF00, 0x6ED5, 0x6ED5, 0x6ED5)); t1 = _mm_packus_epi16(t1, t2); _mm_stream_si128((__m128i*)&dest[i * 16], t1); } }
static void RescalerImportRowShrink_SSE2(WebPRescaler* const wrk, const uint8_t* src) { const int x_sub = wrk->x_sub; int accum = 0; const __m128i zero = _mm_setzero_si128(); const __m128i mult0 = _mm_set1_epi16(x_sub); const __m128i mult1 = _mm_set1_epi32(wrk->fx_scale); const __m128i rounder = _mm_set_epi32(0, ROUNDER, 0, ROUNDER); __m128i sum = zero; rescaler_t* frow = wrk->frow; const rescaler_t* const frow_end = wrk->frow + 4 * wrk->dst_width; if (wrk->num_channels != 4 || wrk->x_add > (x_sub << 7)) { WebPRescalerImportRowShrink_C(wrk, src); return; } assert(!WebPRescalerInputDone(wrk)); assert(!wrk->x_expand); for (; frow < frow_end; frow += 4) { __m128i base = zero; accum += wrk->x_add; while (accum > 0) { const __m128i A = _mm_cvtsi32_si128(WebPMemToUint32(src)); src += 4; base = _mm_unpacklo_epi8(A, zero); // To avoid overflow, we need: base * x_add / x_sub < 32768 // => x_add < x_sub << 7. That's a 1/128 reduction ratio limit. sum = _mm_add_epi16(sum, base); accum -= x_sub; } { // Emit next horizontal pixel. const __m128i mult = _mm_set1_epi16(-accum); const __m128i frac0 = _mm_mullo_epi16(base, mult); // 16b x 16b -> 32b const __m128i frac1 = _mm_mulhi_epu16(base, mult); const __m128i frac = _mm_unpacklo_epi16(frac0, frac1); // frac is 32b const __m128i A0 = _mm_mullo_epi16(sum, mult0); const __m128i A1 = _mm_mulhi_epu16(sum, mult0); const __m128i B0 = _mm_unpacklo_epi16(A0, A1); // sum * x_sub const __m128i frow_out = _mm_sub_epi32(B0, frac); // sum * x_sub - frac const __m128i D0 = _mm_srli_epi64(frac, 32); const __m128i D1 = _mm_mul_epu32(frac, mult1); // 32b x 16b -> 64b const __m128i D2 = _mm_mul_epu32(D0, mult1); const __m128i E1 = _mm_add_epi64(D1, rounder); const __m128i E2 = _mm_add_epi64(D2, rounder); const __m128i F1 = _mm_shuffle_epi32(E1, 1 | (3 << 2)); const __m128i F2 = _mm_shuffle_epi32(E2, 1 | (3 << 2)); const __m128i G = _mm_unpacklo_epi32(F1, F2); sum = _mm_packs_epi32(G, zero); _mm_storeu_si128((__m128i*)frow, frow_out); } } assert(accum == 0); }
static void MultARGBRow(uint32_t* const ptr, int width, int inverse) { int x = 0; if (!inverse) { const int kSpan = 2; const __m128i zero = _mm_setzero_si128(); const __m128i kRound = _mm_set_epi16(0, 1 << 7, 1 << 7, 1 << 7, 0, 1 << 7, 1 << 7, 1 << 7); const __m128i kMult = _mm_set_epi16(0, 0x0101, 0x0101, 0x0101, 0, 0x0101, 0x0101, 0x0101); const __m128i kOne64 = _mm_set_epi16(1u << 8, 0, 0, 0, 1u << 8, 0, 0, 0); const int w2 = width & ~(kSpan - 1); for (x = 0; x < w2; x += kSpan) { const __m128i argb0 = _mm_loadl_epi64((__m128i*)&ptr[x]); const __m128i argb1 = _mm_unpacklo_epi8(argb0, zero); const __m128i tmp0 = _mm_shufflelo_epi16(argb1, _MM_SHUFFLE(3, 3, 3, 3)); const __m128i tmp1 = _mm_shufflehi_epi16(tmp0, _MM_SHUFFLE(3, 3, 3, 3)); const __m128i tmp2 = _mm_srli_epi64(tmp1, 16); const __m128i scale0 = _mm_mullo_epi16(tmp1, kMult); const __m128i scale1 = _mm_or_si128(tmp2, kOne64); const __m128i argb2 = _mm_mulhi_epu16(argb1, scale0); const __m128i argb3 = _mm_mullo_epi16(argb1, scale1); const __m128i argb4 = _mm_adds_epu16(argb2, argb3); const __m128i argb5 = _mm_adds_epu16(argb4, kRound); const __m128i argb6 = _mm_srli_epi16(argb5, 8); const __m128i argb7 = _mm_packus_epi16(argb6, zero); _mm_storel_epi64((__m128i*)&ptr[x], argb7); } } width -= x; if (width > 0) WebPMultARGBRowC(ptr + x, width, inverse); }
static void MultRow(uint8_t* const ptr, const uint8_t* const alpha, int width, int inverse) { int x = 0; if (!inverse) { const int kSpan = 8; const __m128i zero = _mm_setzero_si128(); const __m128i kRound = _mm_set1_epi16(1 << 7); const int w2 = width & ~(kSpan - 1); for (x = 0; x < w2; x += kSpan) { const __m128i v0 = _mm_loadl_epi64((__m128i*)&ptr[x]); const __m128i v1 = _mm_unpacklo_epi8(v0, zero); const __m128i alpha0 = _mm_loadl_epi64((const __m128i*)&alpha[x]); const __m128i alpha1 = _mm_unpacklo_epi8(alpha0, zero); const __m128i alpha2 = _mm_unpacklo_epi8(alpha0, alpha0); const __m128i v2 = _mm_mulhi_epu16(v1, alpha2); const __m128i v3 = _mm_mullo_epi16(v1, alpha1); const __m128i v4 = _mm_adds_epu16(v2, v3); const __m128i v5 = _mm_adds_epu16(v4, kRound); const __m128i v6 = _mm_srli_epi16(v5, 8); const __m128i v7 = _mm_packus_epi16(v6, zero); _mm_storel_epi64((__m128i*)&ptr[x], v7); } } width -= x; if (width > 0) WebPMultRowC(ptr + x, alpha + x, width, inverse); }
__m128i test_mm_mulhi_epu16(__m128i A, __m128i B) { // DAG-LABEL: test_mm_mulhi_epu16 // DAG: call <8 x i16> @llvm.x86.sse2.pmulhu.w(<8 x i16> %{{.*}}, <8 x i16> %{{.*}}) // // ASM-LABEL: test_mm_mulhi_epu16 // ASM: pmulhuw return _mm_mulhi_epu16(A, B); }
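/*
 * Plain-C reference for _mm_mulhi_epu16 (sketch for checking the fixed-point
 * tricks in the surrounding snippets; not taken from any of those projects):
 * each 16-bit lane gets the high half of the unsigned 16x16-bit product,
 * i.e. (a * b) >> 16.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <assert.h>

static __m128i mulhi_epu16_ref(__m128i a, __m128i b) {
  uint16_t va[8], vb[8], vr[8];
  _mm_storeu_si128((__m128i*)va, a);
  _mm_storeu_si128((__m128i*)vb, b);
  for (int i = 0; i < 8; ++i) {
    vr[i] = (uint16_t)(((uint32_t)va[i] * vb[i]) >> 16);
  }
  return _mm_loadu_si128((const __m128i*)vr);
}

/* usage: compare the reference against the intrinsic for one input pair */
static void mulhi_epu16_selftest(void) {
  const __m128i a = _mm_set1_epi16((short)0xFFFF);
  const __m128i b = _mm_set1_epi16((short)0x8081);
  assert(_mm_movemask_epi8(_mm_cmpeq_epi16(_mm_mulhi_epu16(a, b),
                                           mulhi_epu16_ref(a, b))) == 0xFFFF);
}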
void blend_sse2(const Uint8* alpha, const Uint32 size, const Uint8* source0, const Uint8* source1, Uint8* dest) { __m128i t0, t1, t2, t3, t4, t5, t6, t7, t8, t9, t10; Uint32 i; for (i = 0; i < (size / 4); i++) { t0 = _mm_load_si128((__m128i*)&source0[i * 16]); t1 = _mm_load_si128((__m128i*)&source1[i * 16]); t2 = _mm_castps_si128(_mm_load_ss((const float*)&alpha[i * 4])); t2 = _mm_unpacklo_epi8(t2, t2); t2 = _mm_unpacklo_epi16(t2, t2); t3 = _mm_unpacklo_epi8(t0, t0); t4 = _mm_unpacklo_epi8(t1, t1); t5 = _mm_unpacklo_epi32(t2, t2); t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5); t7 = _mm_mulhi_epu16(t3, t6); t8 = _mm_mulhi_epu16(t4, t5); t9 = _mm_adds_epu16(t7, t8); t9 = _mm_srli_epi16(t9, 8); t3 = _mm_unpackhi_epi8(t0, t0); t4 = _mm_unpackhi_epi8(t1, t1); t5 = _mm_unpackhi_epi32(t2, t2); t6 = _mm_sub_epi16(_mm_set1_epi8(0xFF), t5); t7 = _mm_mulhi_epu16(t3, t6); t8 = _mm_mulhi_epu16(t4, t5); t10 = _mm_adds_epu16(t7, t8); t10 = _mm_srli_epi16(t10, 8); t10 = _mm_packus_epi16(t9, t10); _mm_stream_si128((__m128i*)&dest[i * 16], t10); } }
__m64 _m_pmulhuw(__m64 _MM1, __m64 _MM2) { __m128i lhs = {0}, rhs = {0}; lhs.m128i_i64[0] = _MM1.m64_i64; rhs.m128i_i64[0] = _MM2.m64_i64; lhs = _mm_mulhi_epu16(lhs, rhs); _MM1.m64_i64 = lhs.m128i_i64[0]; return _MM1; }
// These constants are 14b fixed-point version of ITU-R BT.601 constants. // R = (19077 * y + 26149 * v - 14234) >> 6 // G = (19077 * y - 6419 * u - 13320 * v + 8708) >> 6 // B = (19077 * y + 33050 * u - 17685) >> 6 static void ConvertYUV444ToRGB_SSE41(const __m128i* const Y0, const __m128i* const U0, const __m128i* const V0, __m128i* const R, __m128i* const G, __m128i* const B) { const __m128i k19077 = _mm_set1_epi16(19077); const __m128i k26149 = _mm_set1_epi16(26149); const __m128i k14234 = _mm_set1_epi16(14234); // 33050 doesn't fit in a signed short: only use this with unsigned arithmetic const __m128i k33050 = _mm_set1_epi16((short)33050); const __m128i k17685 = _mm_set1_epi16(17685); const __m128i k6419 = _mm_set1_epi16(6419); const __m128i k13320 = _mm_set1_epi16(13320); const __m128i k8708 = _mm_set1_epi16(8708); const __m128i Y1 = _mm_mulhi_epu16(*Y0, k19077); const __m128i R0 = _mm_mulhi_epu16(*V0, k26149); const __m128i R1 = _mm_sub_epi16(Y1, k14234); const __m128i R2 = _mm_add_epi16(R1, R0); const __m128i G0 = _mm_mulhi_epu16(*U0, k6419); const __m128i G1 = _mm_mulhi_epu16(*V0, k13320); const __m128i G2 = _mm_add_epi16(Y1, k8708); const __m128i G3 = _mm_add_epi16(G0, G1); const __m128i G4 = _mm_sub_epi16(G2, G3); // be careful with the saturated *unsigned* arithmetic here! const __m128i B0 = _mm_mulhi_epu16(*U0, k33050); const __m128i B1 = _mm_adds_epu16(B0, Y1); const __m128i B2 = _mm_subs_epu16(B1, k17685); // use logical shift for B2, which can be larger than 32767 *R = _mm_srai_epi16(R2, 6); // range: [-14234, 30815] *G = _mm_srai_epi16(G4, 6); // range: [-10953, 27710] *B = _mm_srli_epi16(B2, 6); // range: [0, 34238] }
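/*
 * Hedged scalar reading of ConvertYUV444ToRGB_SSE41 above, assuming the usual
 * load convention in that file: each 8-bit sample sits in the upper byte of
 * its 16-bit lane, so _mm_mulhi_epu16(sample << 8, C) is (sample * C) >> 8.
 * The [0,255] clamping done later by _mm_packus_epi16 is left to the caller.
 */
#include <stdint.h>

static void ConvertYUV444ToRGB_scalar_ref(uint8_t y, uint8_t u, uint8_t v,
                                          int* R, int* G, int* B) {
  const int y1 = (y * 19077) >> 8;
  *R = (y1 + ((v * 26149) >> 8) - 14234) >> 6;
  *G = (y1 - ((u * 6419) >> 8) - ((v * 13320) >> 8) + 8708) >> 6;
  {
    /* the SIMD path uses saturating unsigned add/sub here; max(0, .) plays that role */
    int b = y1 + ((u * 33050) >> 8) - 17685;
    if (b < 0) b = 0;
    *B = b >> 6;
  }
}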
void unpack_rgba4_sse2(const Uint8* source, const Uint32 size, Uint8* dest) { __m128i t0, t1, t2; Uint32 i; for (i = 0; i < (size / 8); i++) { t0 = _mm_loadl_epi64((__m128i*)&source[i * 8]); // converts 4 bit values to 8 bit values (multiply with 17) t0 = _mm_unpacklo_epi16(t0, t0); t1 = _mm_unpacklo_epi16(t0, t0); t1 = _mm_and_si128(t1, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00, 0xF000, 0x000F, 0x00F0, 0x0F00)); t1 = _mm_mullo_epi16(t1, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010, 0x0001, 0x1000, 0x0100, 0x0010)); t1 = _mm_mulhi_epu16(t1, _mm_set1_epi16(0x0110)); t2 = _mm_unpackhi_epi16(t0, t0); t2 = _mm_and_si128(t2, _mm_set_epi16(0xF000, 0x000F, 0x00F0, 0x0F00, 0xF000, 0x000F, 0x00F0, 0x0F00)); t2 = _mm_mullo_epi16(t2, _mm_set_epi16(0x0001, 0x1000, 0x0100, 0x0010, 0x0001, 0x1000, 0x0100, 0x0010)); t2 = _mm_mulhi_epu16(t2, _mm_set1_epi16(0x0110)); t1 = _mm_packus_epi16(t1, t2); _mm_stream_si128((__m128i*)&dest[i * 16], t1); } }
//lower - usually target //upper - usually source; its alpha decides how much of the lower color is visible //always does the full blending operation, does not optimize based on A=FF being true 90% of the time and A=00 90% of the remainder static inline uint32_t blend_8888_on_8888(uint32_t argb_lower, uint32_t argb_upper) { #ifdef __SSE2__ //no need to extend this above 128bit, it's complex enough without having to consider multiple pixels at once uint32_t spx = argb_upper; uint32_t tpx = argb_lower; //contains u16: spx.a, spx.b, spx.g, spx.r, tpx.{a,b,g,r} __m128i vals = _mm_unpacklo_epi8(_mm_set_epi32(0, 0, spx, tpx), _mm_setzero_si128()); //contains u16: {sa}*4, {255-sa}*4 __m128i alphas = _mm_xor_si128(_mm_set1_epi16(spx>>24), _mm_set_epi16(0,0,0,0, 255,255,255,255)); //contains u16: pixel contributions times 255 __m128i newcols255 = _mm_mullo_epi16(vals, alphas); //ugly magic constants: (u16)*8081>>16>>7 = (u16)/255 __m128i newcols = _mm_srli_epi16(_mm_mulhi_epu16(newcols255, _mm_set1_epi16(0x8081)), 7); //contains u8: {don't care}*8, sac (source alpha contribution), sbc, sgc, src, tac, tbc, tgc, trc __m128i newpack = _mm_packus_epi16(newcols, _mm_undefined_si128()); //contains u8: {don't care}*12, sac+tac = result alpha, sbc+tbc, sgc+tgc, src+trc //the components are known to not overflow __m128i newpacksum = _mm_add_epi8(newpack, _mm_srli_si128(newpack, 32/8)); return _mm_cvtsi128_si32(newpacksum); #else uint8_t sr = argb_upper>>0; uint8_t sg = argb_upper>>8; uint8_t sb = argb_upper>>16; uint8_t sa = argb_upper>>24; uint8_t tr = argb_lower>>0; uint8_t tg = argb_lower>>8; uint8_t tb = argb_lower>>16; uint8_t ta = argb_lower>>24; tr = (sr*sa/255) + (tr*(255-sa)/255); tg = (sg*sa/255) + (tg*(255-sa)/255); tb = (sb*sa/255) + (tb*(255-sa)/255); ta = (sa*sa/255) + (ta*(255-sa)/255); return ta<<24 | tb<<16 | tg<<8 | tr<<0; #endif }
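/*
 * Side note on the "ugly magic constants" comment above: for any product
 * p = x*a with x, a in [0, 255], ((p * 0x8081) >> 16) >> 7 equals p / 255
 * exactly. A minimal scalar check (sketch, exhaustive only over the 8-bit
 * by 8-bit range the blend actually produces):
 */
#include <stdint.h>
#include <assert.h>

static void div255_magic_selftest(void) {
  for (uint32_t x = 0; x < 256; ++x) {
    for (uint32_t a = 0; a < 256; ++a) {
      const uint32_t p = x * a;                       /* fits in 16 bits */
      const uint32_t fast = ((p * 0x8081u) >> 16) >> 7;
      assert(fast == p / 255u);
    }
  }
}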
// Average the value based on the number of values summed (9 for pixels away // from the border, 4 for pixels in corners, and 6 for other edge values). // // Add in the rounding factor and shift, clamp to 16, invert and shift. Multiply // by weight. static __m128i average_8(__m128i sum, const __m128i mul_constants, const int strength, const int rounding, const int weight) { // _mm_srl_epi16 uses the lower 64 bit value for the shift. const __m128i strength_u128 = _mm_set_epi32(0, 0, 0, strength); const __m128i rounding_u16 = _mm_set1_epi16(rounding); const __m128i weight_u16 = _mm_set1_epi16(weight); const __m128i sixteen = _mm_set1_epi16(16); // modifier * 3 / index; sum = _mm_mulhi_epu16(sum, mul_constants); sum = _mm_adds_epu16(sum, rounding_u16); sum = _mm_srl_epi16(sum, strength_u128); // The maximum input to this comparison is UINT16_MAX * NEIGHBOR_CONSTANT_4 // >> 16 (also NEIGHBOR_CONSTANT_4 -1) which is 49151 / 0xbfff / -16385 // So this needs to use the epu16 version which did not come until SSE4. sum = _mm_min_epu16(sum, sixteen); sum = _mm_sub_epi16(sixteen, sum); return _mm_mullo_epi16(sum, weight_u16); }
mlib_status mlib_VideoColorYUV2ARGB422_aligned( mlib_u8 *argb, const mlib_u8 *y, const mlib_u8 *u, const mlib_u8 *v, mlib_s32 width, mlib_s32 height, mlib_s32 argb_stride, mlib_s32 y_stride, mlib_s32 uv_stride) { /* 1.1644 * 8192 */ const __m128i c0 = _mm_set1_epi16(0x2543); const mlib_s32 ic0 = 0x2543; /* 2.0184 * 8192 */ const __m128i c1 = _mm_set1_epi16(0x4097); const mlib_s32 ic1 = 0x4097; /* abs( -0.3920 * 8192 ) */ const __m128i c4 = _mm_set1_epi16(0xc8b); const mlib_s32 ic4 = 0xc8b; /* abs( -0.8132 * 8192 ) */ const __m128i c5 = _mm_set1_epi16(0x1a06); const mlib_s32 ic5 = 0x1a06; /* 1.5966 * 8192 */ const __m128i c8 = _mm_set1_epi16(0x3317); const mlib_s32 ic8 = 0x3317; /* -276.9856 * 32 */ const __m128i coff0 = _mm_set1_epi16(0xdd60); const mlib_s32 icoff0 = (mlib_s32)0xffffdd60; /* 135.6352 * 32 */ const __m128i coff1 = _mm_set1_epi16(0x10f4); const mlib_s32 icoff1 = 0x10f4; /* -222.9952 * 32 */ const __m128i coff2 = _mm_set1_epi16(0xe420); const mlib_s32 icoff2 = (mlib_s32)0xffffe420; /* loop variable */ mlib_s32 jH, iW; /* pointers */ mlib_u8 *pY, *pU, *pV, *pD, *pdd, *ptemp; __m128i *py, *pu, *pv; /* variables */ __m128i sy1, sy2, sy3, sy4, su1, su2, sv1, sv2; __m128i du0, du1, dv1, dv2; __m128i db1, db2, db3, db4, dr1, dr2, dr3, dr4, dg1, dg2, dg3, dg4; __m128i ddy1, ddy2, ddy3, ddy4, dzrl, dzrh, dgbl, dgbh, drgbh, drgbl; __m128i db_h, db_l, dg_h, dg_l, dr_h, dr_l, temp, bak; const __m128i x_zero = _mm_setzero_si128(); const __m128i x_mask = _mm_set1_epi32(0xff); /* for 4-pixel computing */ mlib_s32 iu, iv, ig, ir, ib, iTemp; mlib_s32 iu0, iu1, iv1, iv2; pY = (mlib_u8 *)y; pU = (mlib_u8 *)u; pV = (mlib_u8 *)v; pD = (mlib_u8 *)argb; for (jH = 0; jH < height; jH++) { py = (__m128i *)pY; pu = (__m128i *)pU; pv = (__m128i *)pV; pdd = pD; iW = 0; #ifdef __SUNPRO_C #pragma pipeloop(0) #endif /* __SUNPRO_C */ /* 32 pixels */ for (; iW <= width - 32; iW += 32) { /* load y u v, and expand */ temp = _mm_load_si128(pu); su1 = _mm_unpacklo_epi8(x_zero, temp); su2 = _mm_unpackhi_epi8(x_zero, temp); pu++; temp = _mm_load_si128(pv); sv1 = _mm_unpacklo_epi8(x_zero, temp); sv2 = _mm_unpackhi_epi8(x_zero, temp); pv++; temp = _mm_load_si128(py); sy1 = _mm_unpacklo_epi8(x_zero, temp); sy2 = _mm_unpackhi_epi8(x_zero, temp); py++; temp = _mm_load_si128(py); sy3 = _mm_unpacklo_epi8(x_zero, temp); sy4 = _mm_unpackhi_epi8(x_zero, temp); py++; /* pre-calc d[r/g/b][1234] */ du0 = _mm_mulhi_epu16(su1, c1); db_l = _mm_add_epi16(du0, coff0); du0 = _mm_mulhi_epu16(su2, c1); db_h = _mm_add_epi16(du0, coff0); du1 = _mm_mulhi_epu16(su1, c4); dv1 = _mm_mulhi_epu16(sv1, c5); temp = _mm_add_epi16(du1, dv1); dg_l = _mm_sub_epi16(coff1, temp); du1 = _mm_mulhi_epu16(su2, c4); dv1 = _mm_mulhi_epu16(sv2, c5); temp = _mm_add_epi16(du1, dv1); dg_h = _mm_sub_epi16(coff1, temp); dv2 = _mm_mulhi_epu16(sv1, c8); dr_l = _mm_add_epi16(dv2, coff2); dv2 = _mm_mulhi_epu16(sv2, c8); dr_h = _mm_add_epi16(dv2, coff2); ddy1 = _mm_mulhi_epu16(sy1, c0); ddy2 = _mm_mulhi_epu16(sy2, c0); ddy3 = _mm_mulhi_epu16(sy3, c0); ddy4 = _mm_mulhi_epu16(sy4, c0); /* db1/2/3/4 */ bak = _mm_unpacklo_epi16(db_l, db_l); db1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(db_l, db_l); db2 = _mm_add_epi16(ddy2, bak); bak = _mm_unpacklo_epi16(db_h, db_h); db3 = _mm_add_epi16(ddy3, bak); bak = _mm_unpackhi_epi16(db_h, db_h); db4 = _mm_add_epi16(ddy4, bak); /* dg1/2/3/4 */ bak = _mm_unpacklo_epi16(dg_l, dg_l); dg1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(dg_l, dg_l); dg2 = _mm_add_epi16(ddy2, bak); bak = _mm_unpacklo_epi16(dg_h, dg_h); 
dg3 = _mm_add_epi16(ddy3, bak); bak = _mm_unpackhi_epi16(dg_h, dg_h); dg4 = _mm_add_epi16(ddy4, bak); /* dr1/2/3/4 */ bak = _mm_unpacklo_epi16(dr_l, dr_l); dr1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(dr_l, dr_l); dr2 = _mm_add_epi16(ddy2, bak); bak = _mm_unpacklo_epi16(dr_h, dr_h); dr3 = _mm_add_epi16(ddy3, bak); bak = _mm_unpackhi_epi16(dr_h, dr_h); dr4 = _mm_add_epi16(ddy4, bak); db1 = _mm_srai_epi16(db1, 5); db2 = _mm_srai_epi16(db2, 5); db3 = _mm_srai_epi16(db3, 5); db4 = _mm_srai_epi16(db4, 5); dg1 = _mm_srai_epi16(dg1, 5); dg2 = _mm_srai_epi16(dg2, 5); dg3 = _mm_srai_epi16(dg3, 5); dg4 = _mm_srai_epi16(dg4, 5); dr1 = _mm_srai_epi16(dr1, 5); dr2 = _mm_srai_epi16(dr2, 5); dr3 = _mm_srai_epi16(dr3, 5); dr4 = _mm_srai_epi16(dr4, 5); /* pack: 16=>8 */ db1 = _mm_packus_epi16(db1, db2); db2 = _mm_packus_epi16(db3, db4); dr1 = _mm_packus_epi16(dr1, dr2); dr2 = _mm_packus_epi16(dr3, dr4); dg1 = _mm_packus_epi16(dg1, dg2); dg2 = _mm_packus_epi16(dg3, dg4); /* create rgb sequences : db/dr/dg[1] */ dzrl = _mm_unpacklo_epi8(x_zero, dr1); dzrh = _mm_unpackhi_epi8(x_zero, dr1); dgbl = _mm_unpacklo_epi8(dg1, db1); dgbh = _mm_unpackhi_epi8(dg1, db1); drgbl = _mm_unpacklo_epi16(dzrl, dgbl); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrl, dgbl); SAVE_ARGB1(drgbh); drgbl = _mm_unpacklo_epi16(dzrh, dgbh); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrh, dgbh); SAVE_ARGB1(drgbh); /* create rgb sequences : db/dr/dg[2] */ dzrl = _mm_unpacklo_epi8(x_zero, dr2); dzrh = _mm_unpackhi_epi8(x_zero, dr2); dgbl = _mm_unpacklo_epi8(dg2, db2); dgbh = _mm_unpackhi_epi8(dg2, db2); drgbl = _mm_unpacklo_epi16(dzrl, dgbl); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrl, dgbl); SAVE_ARGB1(drgbh); drgbl = _mm_unpacklo_epi16(dzrh, dgbh); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrh, dgbh); SAVE_ARGB1(drgbh); } /* 16 pixels */ if (iW <= width - 16) { /* load y u v, and expand */ temp = _mm_loadl_epi64(pu); su1 = _mm_unpacklo_epi8(x_zero, temp); pu = (__m128i *) (((__m64 *)pu) + 1); temp = _mm_loadl_epi64(pv); sv1 = _mm_unpacklo_epi8(x_zero, temp); pv = (__m128i *) (((__m64 *)pv) + 1); temp = _mm_load_si128(py); sy1 = _mm_unpacklo_epi8(x_zero, temp); sy2 = _mm_unpackhi_epi8(x_zero, temp); py++; /* pre-calc d[r/g/b][12] */ du0 = _mm_mulhi_epu16(su1, c1); db_l = _mm_add_epi16(du0, coff0); du1 = _mm_mulhi_epu16(su1, c4); dv1 = _mm_mulhi_epu16(sv1, c5); temp = _mm_add_epi16(du1, dv1); dg_l = _mm_sub_epi16(coff1, temp); dv2 = _mm_mulhi_epu16(sv1, c8); dr_l = _mm_add_epi16(dv2, coff2); ddy1 = _mm_mulhi_epu16(sy1, c0); ddy2 = _mm_mulhi_epu16(sy2, c0); /* db1/2 */ bak = _mm_unpacklo_epi16(db_l, db_l); db1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(db_l, db_l); db2 = _mm_add_epi16(ddy2, bak); /* dg1/2 */ bak = _mm_unpacklo_epi16(dg_l, dg_l); dg1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(dg_l, dg_l); dg2 = _mm_add_epi16(ddy2, bak); /* dr1/2 */ bak = _mm_unpacklo_epi16(dr_l, dr_l); dr1 = _mm_add_epi16(ddy1, bak); bak = _mm_unpackhi_epi16(dr_l, dr_l); dr2 = _mm_add_epi16(ddy2, bak); db1 = _mm_srai_epi16(db1, 5); db2 = _mm_srai_epi16(db2, 5); dg1 = _mm_srai_epi16(dg1, 5); dg2 = _mm_srai_epi16(dg2, 5); dr1 = _mm_srai_epi16(dr1, 5); dr2 = _mm_srai_epi16(dr2, 5); /* pack: 16=>8 */ db1 = _mm_packus_epi16(db1, db2); dr1 = _mm_packus_epi16(dr1, dr2); dg1 = _mm_packus_epi16(dg1, dg2); /* create rgb sequences : db/dr/dg[1] */ dzrl = _mm_unpacklo_epi8(x_zero, dr1); dzrh = _mm_unpackhi_epi8(x_zero, dr1); dgbl = _mm_unpacklo_epi8(dg1, db1); dgbh = _mm_unpackhi_epi8(dg1, db1); drgbl = 
_mm_unpacklo_epi16(dzrl, dgbl); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrl, dgbl); SAVE_ARGB1(drgbh); drgbl = _mm_unpacklo_epi16(dzrh, dgbh); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrh, dgbh); SAVE_ARGB1(drgbh); iW += 16; } /* 8 pixels */ if (iW <= width - 8) { /* load y u v, and expand */ iTemp = *((mlib_s32 *)pu); temp = _mm_cvtsi32_si128(iTemp); su1 = _mm_unpacklo_epi8(x_zero, temp); pu = (__m128i *) (((mlib_s32 *)pu) + 1); iTemp = *((mlib_s32 *)pv); temp = _mm_cvtsi32_si128(iTemp); sv1 = _mm_unpacklo_epi8(x_zero, temp); pv = (__m128i *) (((mlib_s32 *)pv) + 1); temp = _mm_loadl_epi64(py); sy1 = _mm_unpacklo_epi8(x_zero, temp); py = (__m128i *) (((__m64 *)py) + 1); /* pre-calc d[r/g/b][1] */ du0 = _mm_mulhi_epu16(su1, c1); db_l = _mm_add_epi16(du0, coff0); du1 = _mm_mulhi_epu16(su1, c4); dv1 = _mm_mulhi_epu16(sv1, c5); temp = _mm_add_epi16(du1, dv1); dg_l = _mm_sub_epi16(coff1, temp); dv2 = _mm_mulhi_epu16(sv1, c8); dr_l = _mm_add_epi16(dv2, coff2); ddy1 = _mm_mulhi_epu16(sy1, c0); /* db1 */ bak = _mm_unpacklo_epi16(db_l, db_l); db1 = _mm_add_epi16(ddy1, bak); /* dg1 */ bak = _mm_unpacklo_epi16(dg_l, dg_l); dg1 = _mm_add_epi16(ddy1, bak); /* dr1 */ bak = _mm_unpacklo_epi16(dr_l, dr_l); dr1 = _mm_add_epi16(ddy1, bak); db1 = _mm_srai_epi16(db1, 5); dg1 = _mm_srai_epi16(dg1, 5); dr1 = _mm_srai_epi16(dr1, 5); /* pack: 16=>8 */ db1 = _mm_packus_epi16(db1, x_zero); dr1 = _mm_packus_epi16(dr1, x_zero); dg1 = _mm_packus_epi16(dg1, x_zero); /* create rgb sequences : db/dr/dg[1] */ dzrl = _mm_unpacklo_epi8(x_zero, dr1); dgbl = _mm_unpacklo_epi8(dg1, db1); drgbl = _mm_unpacklo_epi16(dzrl, dgbl); SAVE_ARGB1(drgbl); drgbh = _mm_unpackhi_epi16(dzrl, dgbl); SAVE_ARGB1(drgbh); iW += 8; } /* 4 pixels */ if (iW <= width - 4) { /* load y u v, and expand */ iTemp = *((mlib_s16 *)pu); temp = _mm_cvtsi32_si128(iTemp); su1 = _mm_unpacklo_epi8(x_zero, temp); pu = (__m128i *) (((mlib_s16 *)pu) + 1); iTemp = *((mlib_s16 *)pv); temp = _mm_cvtsi32_si128(iTemp); sv1 = _mm_unpacklo_epi8(x_zero, temp); pv = (__m128i *) (((mlib_s16 *)pv) + 1); iTemp = *((mlib_s32 *)py); temp = _mm_cvtsi32_si128(iTemp); sy1 = _mm_unpacklo_epi8(x_zero, temp); py = (__m128i *) (((mlib_s32 *)py) + 1); /* pre-calc d[r/g/b][1] */ du0 = _mm_mulhi_epu16(su1, c1); db_l = _mm_add_epi16(du0, coff0); du1 = _mm_mulhi_epu16(su1, c4); dv1 = _mm_mulhi_epu16(sv1, c5); temp = _mm_add_epi16(du1, dv1); dg_l = _mm_sub_epi16(coff1, temp); dv2 = _mm_mulhi_epu16(sv1, c8); dr_l = _mm_add_epi16(dv2, coff2); ddy1 = _mm_mulhi_epu16(sy1, c0); /* db1 */ bak = _mm_unpacklo_epi16(db_l, db_l); db1 = _mm_add_epi16(ddy1, bak); /* dg1 */ bak = _mm_unpacklo_epi16(dg_l, dg_l); dg1 = _mm_add_epi16(ddy1, bak); /* dr1 */ bak = _mm_unpacklo_epi16(dr_l, dr_l); dr1 = _mm_add_epi16(ddy1, bak); db1 = _mm_srai_epi16(db1, 5); dg1 = _mm_srai_epi16(dg1, 5); dr1 = _mm_srai_epi16(dr1, 5); /* pack: 16=>8 */ db1 = _mm_packus_epi16(db1, x_zero); dr1 = _mm_packus_epi16(dr1, x_zero); dg1 = _mm_packus_epi16(dg1, x_zero); /* create rgb sequences : db/dr/dg[1] */ dzrl = _mm_unpacklo_epi8(x_zero, dr1); dgbl = _mm_unpacklo_epi8(dg1, db1); drgbl = _mm_unpacklo_epi16(dzrl, dgbl); SAVE_ARGB1(drgbl); iW += 4; } /* 2 pixels */ if (iW <= width - 2) { /* load y u v, and expand */ iu = *((mlib_u8 *)pu); pu = (__m128i *) (((mlib_u8 *)pu) + 1); iv = *((mlib_u8 *)pv); pv = (__m128i *) (((mlib_u8 *)pv) + 1); iTemp = *((mlib_s16 *)py); temp = _mm_cvtsi32_si128(iTemp); sy1 = _mm_unpacklo_epi8(x_zero, temp); py = (__m128i *) (((mlib_s16 *)py) + 1); /* pre-calc d[r/g/b][1] */ iu0 = (iu * ic1) >> 
8; ib = icoff0 + iu0; iu1 = (iu * ic4) >> 8; iv1 = (iv * ic5) >> 8; iTemp = iu1 + iv1; ig = icoff1 - iTemp; iv2 = (iv * ic8) >> 8; ir = iv2 + icoff2; ddy1 = _mm_mulhi_epu16(sy1, c0); /* db1 */ temp = _mm_set1_epi16(ib); db1 = _mm_add_epi16(ddy1, temp); /* dg1 */ temp = _mm_set1_epi16(ig); dg1 = _mm_add_epi16(ddy1, temp); /* dr1 */ temp = _mm_set1_epi16(ir); dr1 = _mm_add_epi16(ddy1, temp); db1 = _mm_srai_epi16(db1, 5); dg1 = _mm_srai_epi16(dg1, 5); dr1 = _mm_srai_epi16(dr1, 5); /* pack: 16=>8 */ db1 = _mm_packus_epi16(db1, x_zero); dr1 = _mm_packus_epi16(dr1, x_zero); dg1 = _mm_packus_epi16(dg1, x_zero); /* create rgb sequences : db/dr/dg */ dzrl = _mm_unpacklo_epi8(x_zero, dr1); dgbl = _mm_unpacklo_epi8(dg1, db1); /* lower half of drgl & dbzl */ drgbl = _mm_unpacklo_epi16(dzrl, dgbl); ptemp = (mlib_u8*)(&drgbl); pdd += 1; ptemp += 1; *((mlib_s16*)pdd) = *((mlib_s16*)ptemp); pdd += 2; ptemp += 2; *((mlib_u8*)pdd) = *((mlib_u8*)ptemp); pdd += 2; ptemp += 2; *((mlib_s16*)pdd) = *((mlib_s16*)ptemp); pdd += 2; ptemp += 2; *((mlib_u8*)pdd) = *((mlib_u8*)ptemp); pdd += 1; iW += 2; } pY += y_stride; pU += uv_stride; pV += uv_stride; pD += argb_stride; }
// Simple quantization static int QuantizeBlockSSE2(int16_t in[16], int16_t out[16], int n, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(2047); const __m128i zero = _mm_set1_epi16(0); __m128i sign0, sign8; __m128i coeff0, coeff8; __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i sharpen0 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[0]); const __m128i sharpen8 = _mm_loadu_si128((__m128i*)&mtx->sharpen_[8]); const __m128i iq0 = _mm_loadu_si128((__m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((__m128i*)&mtx->iq_[8]); const __m128i bias0 = _mm_loadu_si128((__m128i*)&mtx->bias_[0]); const __m128i bias8 = _mm_loadu_si128((__m128i*)&mtx->bias_[8]); const __m128i q0 = _mm_loadu_si128((__m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((__m128i*)&mtx->q_[8]); const __m128i zthresh0 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[0]); const __m128i zthresh8 = _mm_loadu_si128((__m128i*)&mtx->zthresh_[8]); // sign(in) = in >> 15 (0x0000 if positive, 0xffff if negative) sign0 = _mm_srai_epi16(in0, 15); sign8 = _mm_srai_epi16(in8, 15); // coeff = abs(in) = (in ^ sign) - sign coeff0 = _mm_xor_si128(in0, sign0); coeff8 = _mm_xor_si128(in8, sign8); coeff0 = _mm_sub_epi16(coeff0, sign0); coeff8 = _mm_sub_epi16(coeff8, sign8); // coeff = abs(in) + sharpen coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); // if (coeff > 2047) coeff = 2047 coeff0 = _mm_min_epi16(coeff0, max_coeff_2047); coeff8 = _mm_min_epi16(coeff8, max_coeff_2047); // out = (coeff * iQ + B) >> QFIX; { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // expand bias from 16b to 32b __m128i bias_00 = _mm_unpacklo_epi16(bias0, zero); __m128i bias_04 = _mm_unpackhi_epi16(bias0, zero); __m128i bias_08 = _mm_unpacklo_epi16(bias8, zero); __m128i bias_12 = _mm_unpackhi_epi16(bias8, zero); // out = (coeff * iQ + B) out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = (coeff * iQ + B) >> QFIX; out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); } // get sign back (if (sign[j]) out_n = -out_n) out0 = _mm_xor_si128(out0, sign0); out8 = _mm_xor_si128(out8, sign8); out0 = _mm_sub_epi16(out0, sign0); out8 = _mm_sub_epi16(out8, sign8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); // if (coeff <= mtx->zthresh_) {in=0; out=0;} { __m128i cmp0 = _mm_cmpgt_epi16(coeff0, zthresh0); __m128i cmp8 = _mm_cmpgt_epi16(coeff8, zthresh8); in0 = _mm_and_si128(in0, cmp0); in8 = _mm_and_si128(in8, cmp8); 
_mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); out0 = _mm_and_si128(out0, cmp0); out8 = _mm_and_si128(out8, cmp8); } // zigzag the output before storing it. // // The zigzag pattern can almost be reproduced with a small sequence of // shuffles. After it, we only need to swap the 7th (ending up in third // position instead of twelfth) and 8th values. { __m128i outZ0, outZ8; outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2)); outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1)); outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0)); outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0)); _mm_storeu_si128((__m128i*)&out[0], outZ0); _mm_storeu_si128((__m128i*)&out[8], outZ8); packed_out = _mm_packs_epi16(outZ0, outZ8); } { const int16_t outZ_12 = out[12]; const int16_t outZ_3 = out[3]; out[3] = outZ_12; out[12] = outZ_3; } // detect if all 'out' values are zeroes or not { int32_t tmp[4]; _mm_storeu_si128((__m128i*)tmp, packed_out); if (n) { tmp[0] &= ~0xff; } return (tmp[3] || tmp[2] || tmp[1] || tmp[0]); } }
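/*
 * Side note (sketch, not from the WebP sources): the mullo/mulhi + unpack
 * pairs used in the quantizer above are the standard SSE2 idiom for a full
 * unsigned 16x16 -> 32-bit multiply, since no 32-bit mullo exists before
 * SSE4.1. The helper below reconstructs (hi << 16) | lo for the four low
 * lanes, which is exactly what coeff * iQ needs before the bias is added.
 */
#include <emmintrin.h>

static __m128i mul_u16_widen_lo(__m128i a, __m128i b) {
  const __m128i lo = _mm_mullo_epi16(a, b);  /* low 16 bits of each product  */
  const __m128i hi = _mm_mulhi_epu16(a, b);  /* high 16 bits of each product */
  return _mm_unpacklo_epi16(lo, hi);         /* interleave into 32-bit lanes */
}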
void FileIconDrawGlass::Text(HDC hdc, PCTCHAR pcszText, const RECT &rc, eTextColor eColor, UINT uFlags) { if (!pcszText || !*pcszText) return; // Find out actual size of text int nChars = _tcslen(pcszText); uFlags |= DT_NOCLIP; int iX = rc.left; int iY = rc.top; int iXW = (rc.right - iX); int iYH = (rc.bottom - iY); RECT rcMin = rc; if (DrawText(hdcTextDIB, pcszText, nChars, &rcMin, uFlags | DT_CALCRECT)) { int iMinXW = rcMin.right - rcMin.left; int iMinYH = rcMin.bottom - rcMin.top; if (iMinXW < iXW) { if (uFlags & DT_CENTER) { iX += (iXW - iMinXW)/2; uFlags &= ~DT_CENTER; } else if (uFlags & DT_RIGHT) { iX += (iXW - iMinXW); uFlags &= ~DT_RIGHT; } iXW = iMinXW; } if (iMinYH < iYH) { if (uFlags & DT_SINGLELINE) { if (uFlags & DT_VCENTER) { iY += (iYH - iMinYH)/2; uFlags &= ~DT_VCENTER; } else if (uFlags & DT_BOTTOM) { iY += (iYH - iMinYH); uFlags &= ~DT_BOTTOM; } } iYH = iMinYH; } } iXW += 2; // NB: +2 'cause we want an extra pixel at the border so that the font smoothing will look bette! iYH += 2; // Ensure we have a big enough DIB to draw the text to if ((iXW > iTextDIBXW) || (iYH > iTextDIBYH)) CreateTextDIB(iXW, iYH); if (!hbmpTextDIB) return; // Select color ieBGRA clr; switch (eColor) { case eFileName: clr = clrFileName; break; case eComment: clr = clrComment; break; case eFileInfo: clr = clrFileInfo; break; default: clr = ieBGRA(0,0,0); break; } clr.A = 0xFF - clrBkg.A; // Draw the text to in-memory DIB RECT rcTextDIB = { 0, 0, iXW, iYH }; FillRect(hdcTextDIB, &rcTextDIB, hbrBkg); rcTextDIB.left++; rcTextDIB.top++; DrawText(hdcTextDIB, pcszText, nChars, &rcTextDIB, uFlags); // Modify DIB: #ifndef __X64__ if (g_bSSE2) #endif { __m128i r0, r1, r2, r3, r4, r5, r6, r7; r7 = _mm_setzero_si128(); // 0 r6 = _mm_set1_epi32(clr.dw); // CA CR CG CB CA CR CG CB CA CR CG CB CA CR CG CB r6 = _mm_unpacklo_epi8(r7, r6); // CA<<8 CR<<8 CG<<8 CB<<8 CA<<8 CR<<8 CG<<8 CB<<8 r5 = _mm_set1_epi16(1); // 1 1 1 1 1 1 1 1 r4 = _mm_set1_epi32(0xFF); // FF FF FF FF r3 = _mm_set1_epi32(clrBkg.dw); // DA 0 0 0 DA 0 0 0 DA 0 0 0 DA 0 0 0 ieBGRA *py = pTextDIB; for (int y = iYH; y--; py += iTextDIBXW) { ieBGRA *px = py; for (int x_4 = (iXW+3)>>2; x_4--; px += 4) { r0 = _mm_load_si128((__m128i *)px); r1 = r0; r2 = r0; // X3 R3 G3 B3 X2 R2 G2 B2 X1 R1 G1 B1 X0 R0 G0 B0 r0 = _mm_srli_epi32(r0, 16); // 0 0 X3 R3 0 0 X2 R2 0 0 X1 R1 0 0 X0 R0 r1 = _mm_srli_epi32(r1, 8); // 0 X3 R3 G3 0 X2 R2 G2 0 X1 R1 G1 0 X0 R0 G0 r0 = _mm_max_epu8(r0, r2); r0 = _mm_max_epu8(r0, r1); // x x x A3 x x x A2 x x x A1 x x x A0 r0 = _mm_and_si128(r0, r4); // 0 A3 0 A2 0 A1 0 A0 r0 = _mm_shufflelo_epi16(r0, _MM_SHUFFLE(2,2,0,0)); r0 = _mm_shufflehi_epi16(r0, _MM_SHUFFLE(2,2,0,0)); // A3 A3 A2 A2 A1 A1 A0 A0 r1 = r0; r0 = _mm_unpacklo_epi32(r0, r0); // A1 A1 A1 A1 A0 A0 A0 A0 r1 = _mm_unpackhi_epi32(r1, r1); // A3 A3 A3 A3 A2 A2 A2 A2 r0 = _mm_add_epi16(r0, r5); // A1' A1' A1' A1' A0' A0' A0' A0' r1 = _mm_add_epi16(r1, r5); // A3' A3' A3' A3' A2' A2' A2' A2' r0 = _mm_mulhi_epu16(r0, r6); // xA1" xR1 xG1 xB1 xA0" xR0 xG0 xB0 r1 = _mm_mulhi_epu16(r1, r6); // xA3" xR3 xG3 xB3 xA2" xR2 xG2 xB2 r0 = _mm_packus_epi16(r0, r1); // xA3"xR3 xG3 xB3 xA2"xR2 xG2 xB2 xA1"xR1 xG1 xB1 xA0"xR0 xG0 xB0 r0 = _mm_adds_epu8(r0, r3); // xA3 xR3 xG3 xB3 xA2 xR2 xG2 xB2 xA1 xR1 xG1 xB1 xA0 xR0 xG0 xB0 _mm_store_si128((__m128i *)px, r0); } } } #ifndef __X64__ else {
SIMD_INLINE __m128i ShiftedWeightedSquare16(__m128i difference, __m128i weight) { return _mm_mulhi_epu16(_mm_mullo_epi16(difference, difference), weight); }
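/*
 * Scalar reading of ShiftedWeightedSquare16 above (sketch, assuming the caller
 * keeps |difference| within 8 bits, as pixel differences are, so the square
 * fits in the 16 bits kept by _mm_mullo_epi16): per lane it computes
 * (difference * difference * weight) >> 16.
 */
#include <stdint.h>

static inline uint16_t ShiftedWeightedSquare16_scalar(uint16_t difference,
                                                      uint16_t weight) {
  const uint16_t sq = (uint16_t)(difference * difference); /* low 16 bits, as _mm_mullo_epi16 keeps */
  return (uint16_t)(((uint32_t)sq * weight) >> 16);        /* high 16 bits, as _mm_mulhi_epu16 keeps */
}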
static void GF_FUNC_ALIGN VS_CC proc_16bit_sse2(convolution_hv_t *ch, uint8_t *buff, int bstride, int width, int height, int stride, uint8_t *d, const uint8_t *s) { const uint16_t *srcp = (uint16_t *)s; uint16_t *dstp = (uint16_t *)d; stride /= 2; bstride /= 2; uint16_t *p0 = (uint16_t *)buff + 8; uint16_t *p1 = p0 + bstride; uint16_t *p2 = p1 + bstride; uint16_t *p3 = p2 + bstride; uint16_t *p4 = p3 + bstride; uint16_t *orig = p0, *end = p4; line_copy16(p0, srcp + 2 * stride, width, 2); line_copy16(p1, srcp + stride, width, 2); line_copy16(p2, srcp, width, 2); srcp += stride; line_copy16(p3, srcp, width, 2); __m128i zero = _mm_setzero_si128(); __m128i all1 = _mm_cmpeq_epi32(zero, zero); __m128i one = _mm_srli_epi32(all1, 31); __m128 rdiv_h = _mm_set1_ps((float)ch->rdiv_h); __m128 rdiv_v = _mm_set1_ps((float)ch->rdiv_v); __m128 bias = _mm_set1_ps((float)ch->bias); __m128i matrix_h[5]; __m128i matrix_v[5]; int sign_h[5]; int sign_v[5]; for (int i = 0; i < 5; i++) { sign_h[i] = ch->m_h[i] < 0 ? 1 : 0; sign_v[i] = ch->m_v[i] < 0 ? 1 : 0; uint16_t val = sign_h[i] ? (uint16_t)(ch->m_h[i] * -1) : (uint16_t)ch->m_h[i]; matrix_h[i] = _mm_set1_epi16((int16_t)val); val = sign_v[i] ? (uint16_t)(ch->m_v[i] * -1) : (uint16_t)ch->m_v[i]; matrix_v[i] = _mm_set1_epi16((int16_t)val); } for (int y = 0; y < height; y++) { srcp += stride * (y < height - 2 ? 1 : -1); line_copy16(p4, srcp, width, 2); for (int x = 0; x < width; x += 8) { uint16_t *array[] = { p0 + x, p1 + x, p2 + x, p3 + x, p4 + x, p2 + x - 2, p2 + x - 1, dstp + x, p2 + x + 1, p2 + x + 2 }; for (int j = 0; j < 2; j++) { __m128i *matrix = j == 0 ? matrix_v : matrix_h; int *sign = j == 0 ? sign_v : sign_h; __m128 rdiv = j == 0 ? rdiv_v : rdiv_h; __m128i sum[2]; sum[0] = _mm_setzero_si128(); sum[1] = _mm_setzero_si128(); for (int i = 0; i < 5; i++) { __m128i xmm0, xmm1, xmm2; xmm0 = _mm_loadu_si128((__m128i *)array[i + j * 5]); xmm1 = _mm_mullo_epi16(xmm0, matrix[i]); xmm0 = _mm_mulhi_epu16(xmm0, matrix[i]); xmm2 = _mm_unpacklo_epi16(xmm1, xmm0); xmm0 = _mm_unpackhi_epi16(xmm1, xmm0); if (sign[i]) { xmm2 = _mm_add_epi32(one, _mm_xor_si128(xmm2, all1)); xmm0 = _mm_add_epi32(one, _mm_xor_si128(xmm0, all1)); } sum[0] = _mm_add_epi32(sum[0], xmm2); sum[1] = _mm_add_epi32(sum[1], xmm0); } for (int i = 0; i < 2; i++) { __m128 sumfp; __m128i mask, temp; sumfp = _mm_cvtepi32_ps(sum[i]); sumfp = _mm_mul_ps(sumfp, rdiv); if (j == 1) { sumfp = _mm_add_ps(sumfp, bias); } sum[i] = _mm_cvttps_epi32(sumfp); temp = _mm_srli_epi32(all1, 16); mask = _mm_cmplt_epi32(sum[i], temp); sum[i] = _mm_or_si128(_mm_and_si128(sum[i], mask), _mm_andnot_si128(mask, temp)); mask = _mm_cmpgt_epi32(sum[i], zero); if (ch->saturate) { sum[i] = _mm_and_si128(mask, sum[i]); } else { temp = _mm_add_epi32(one, _mm_xor_si128(sum[i], all1)); sum[i] = _mm_or_si128(_mm_and_si128(mask, sum[i]), _mm_andnot_si128(mask, temp)); } } sum[0] = mm_cast_epi32(sum[0], sum[1]); _mm_store_si128((__m128i *)(dstp + x), sum[0]); } } dstp += stride; p0 = p1; p1 = p2; p2 = p3; p3 = p4; p4 = (p4 == end) ? orig : p4 + bstride; } }
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], const uint16_t* const sharpen, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]); const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]); // coeff = abs(in) __m128i coeff0 = _mm_abs_epi16(in0); __m128i coeff8 = _mm_abs_epi16(in8); // coeff = abs(in) + sharpen if (sharpen != NULL) { const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]); const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]); coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); } // out = (coeff * iQ + B) >> QFIX { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // out = (coeff * iQ + B) const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]); const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]); const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]); const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]); out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = QUANTDIV(coeff, iQ, B, QFIX) out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); // if (coeff > 2047) coeff = 2047 out0 = _mm_min_epi16(out0, max_coeff_2047); out8 = _mm_min_epi16(out8, max_coeff_2047); } // put sign back out0 = _mm_sign_epi16(out0, in0); out8 = _mm_sign_epi16(out8, in8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); // zigzag the output before storing it. The re-ordering is: // 0 1 2 3 4 5 6 7 | 8 9 10 11 12 13 14 15 // -> 0 1 4[8]5 2 3 6 | 9 12 13 10 [7]11 14 15 // There's only two misplaced entries ([8] and [7]) that are crossing the // reg's boundaries. // We use pshufb instead of pshuflo/pshufhi. 
{ const __m128i kCst_lo = PSHUFB_CST(0, 1, 4, -1, 5, 2, 3, 6); const __m128i kCst_7 = PSHUFB_CST(-1, -1, -1, -1, 7, -1, -1, -1); const __m128i tmp_lo = _mm_shuffle_epi8(out0, kCst_lo); const __m128i tmp_7 = _mm_shuffle_epi8(out0, kCst_7); // extract #7 const __m128i kCst_hi = PSHUFB_CST(1, 4, 5, 2, -1, 3, 6, 7); const __m128i kCst_8 = PSHUFB_CST(-1, -1, -1, 0, -1, -1, -1, -1); const __m128i tmp_hi = _mm_shuffle_epi8(out8, kCst_hi); const __m128i tmp_8 = _mm_shuffle_epi8(out8, kCst_8); // extract #8 const __m128i out_z0 = _mm_or_si128(tmp_lo, tmp_8); const __m128i out_z8 = _mm_or_si128(tmp_hi, tmp_7); _mm_storeu_si128((__m128i*)&out[0], out_z0); _mm_storeu_si128((__m128i*)&out[8], out_z8); packed_out = _mm_packs_epi16(out_z0, out_z8); } // detect if all 'out' values are zeroes or not return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); }
static WEBP_INLINE int DoQuantizeBlock(int16_t in[16], int16_t out[16], const uint16_t* const sharpen, const VP8Matrix* const mtx) { const __m128i max_coeff_2047 = _mm_set1_epi16(MAX_LEVEL); const __m128i zero = _mm_setzero_si128(); __m128i coeff0, coeff8; __m128i out0, out8; __m128i packed_out; // Load all inputs. // TODO(cduvivier): Make variable declarations and allocations aligned so that // we can use _mm_load_si128 instead of _mm_loadu_si128. __m128i in0 = _mm_loadu_si128((__m128i*)&in[0]); __m128i in8 = _mm_loadu_si128((__m128i*)&in[8]); const __m128i iq0 = _mm_loadu_si128((const __m128i*)&mtx->iq_[0]); const __m128i iq8 = _mm_loadu_si128((const __m128i*)&mtx->iq_[8]); const __m128i q0 = _mm_loadu_si128((const __m128i*)&mtx->q_[0]); const __m128i q8 = _mm_loadu_si128((const __m128i*)&mtx->q_[8]); // extract sign(in) (0x0000 if positive, 0xffff if negative) const __m128i sign0 = _mm_cmpgt_epi16(zero, in0); const __m128i sign8 = _mm_cmpgt_epi16(zero, in8); // coeff = abs(in) = (in ^ sign) - sign coeff0 = _mm_xor_si128(in0, sign0); coeff8 = _mm_xor_si128(in8, sign8); coeff0 = _mm_sub_epi16(coeff0, sign0); coeff8 = _mm_sub_epi16(coeff8, sign8); // coeff = abs(in) + sharpen if (sharpen != NULL) { const __m128i sharpen0 = _mm_loadu_si128((const __m128i*)&sharpen[0]); const __m128i sharpen8 = _mm_loadu_si128((const __m128i*)&sharpen[8]); coeff0 = _mm_add_epi16(coeff0, sharpen0); coeff8 = _mm_add_epi16(coeff8, sharpen8); } // out = (coeff * iQ + B) >> QFIX { // doing calculations with 32b precision (QFIX=17) // out = (coeff * iQ) const __m128i coeff_iQ0H = _mm_mulhi_epu16(coeff0, iq0); const __m128i coeff_iQ0L = _mm_mullo_epi16(coeff0, iq0); const __m128i coeff_iQ8H = _mm_mulhi_epu16(coeff8, iq8); const __m128i coeff_iQ8L = _mm_mullo_epi16(coeff8, iq8); __m128i out_00 = _mm_unpacklo_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_04 = _mm_unpackhi_epi16(coeff_iQ0L, coeff_iQ0H); __m128i out_08 = _mm_unpacklo_epi16(coeff_iQ8L, coeff_iQ8H); __m128i out_12 = _mm_unpackhi_epi16(coeff_iQ8L, coeff_iQ8H); // out = (coeff * iQ + B) const __m128i bias_00 = _mm_loadu_si128((const __m128i*)&mtx->bias_[0]); const __m128i bias_04 = _mm_loadu_si128((const __m128i*)&mtx->bias_[4]); const __m128i bias_08 = _mm_loadu_si128((const __m128i*)&mtx->bias_[8]); const __m128i bias_12 = _mm_loadu_si128((const __m128i*)&mtx->bias_[12]); out_00 = _mm_add_epi32(out_00, bias_00); out_04 = _mm_add_epi32(out_04, bias_04); out_08 = _mm_add_epi32(out_08, bias_08); out_12 = _mm_add_epi32(out_12, bias_12); // out = QUANTDIV(coeff, iQ, B, QFIX) out_00 = _mm_srai_epi32(out_00, QFIX); out_04 = _mm_srai_epi32(out_04, QFIX); out_08 = _mm_srai_epi32(out_08, QFIX); out_12 = _mm_srai_epi32(out_12, QFIX); // pack result as 16b out0 = _mm_packs_epi32(out_00, out_04); out8 = _mm_packs_epi32(out_08, out_12); // if (coeff > 2047) coeff = 2047 out0 = _mm_min_epi16(out0, max_coeff_2047); out8 = _mm_min_epi16(out8, max_coeff_2047); } // get sign back (if (sign[j]) out_n = -out_n) out0 = _mm_xor_si128(out0, sign0); out8 = _mm_xor_si128(out8, sign8); out0 = _mm_sub_epi16(out0, sign0); out8 = _mm_sub_epi16(out8, sign8); // in = out * Q in0 = _mm_mullo_epi16(out0, q0); in8 = _mm_mullo_epi16(out8, q8); _mm_storeu_si128((__m128i*)&in[0], in0); _mm_storeu_si128((__m128i*)&in[8], in8); // zigzag the output before storing it. // // The zigzag pattern can almost be reproduced with a small sequence of // shuffles. After it, we only need to swap the 7th (ending up in third // position instead of twelfth) and 8th values. 
{ __m128i outZ0, outZ8; outZ0 = _mm_shufflehi_epi16(out0, _MM_SHUFFLE(2, 1, 3, 0)); outZ0 = _mm_shuffle_epi32 (outZ0, _MM_SHUFFLE(3, 1, 2, 0)); outZ0 = _mm_shufflehi_epi16(outZ0, _MM_SHUFFLE(3, 1, 0, 2)); outZ8 = _mm_shufflelo_epi16(out8, _MM_SHUFFLE(3, 0, 2, 1)); outZ8 = _mm_shuffle_epi32 (outZ8, _MM_SHUFFLE(3, 1, 2, 0)); outZ8 = _mm_shufflelo_epi16(outZ8, _MM_SHUFFLE(1, 3, 2, 0)); _mm_storeu_si128((__m128i*)&out[0], outZ0); _mm_storeu_si128((__m128i*)&out[8], outZ8); packed_out = _mm_packs_epi16(outZ0, outZ8); } { const int16_t outZ_12 = out[12]; const int16_t outZ_3 = out[3]; out[3] = outZ_12; out[12] = outZ_3; } // detect if all 'out' values are zeroes or not return (_mm_movemask_epi8(_mm_cmpeq_epi8(packed_out, zero)) != 0xffff); }
void SWModelRenderer::RenderInner(spades::draw::SWModel *model, const client::ModelRenderParam &param) { auto& mat = param.matrix; auto origin = mat.GetOrigin(); auto axis1 = mat.GetAxis(0); auto axis2 = mat.GetAxis(1); auto axis3 = mat.GetAxis(2); auto *rawModel = model->GetRawModel(); auto rawModelOrigin = rawModel->GetOrigin(); rawModelOrigin += 0.1f; origin += axis1 * rawModelOrigin.x; origin += axis2 * rawModelOrigin.y; origin += axis3 * rawModelOrigin.z; int w = rawModel->GetWidth(); int h = rawModel->GetHeight(); //int d = rawModel->GetDepth(); // evaluate brightness for each normal uint8_t brights[3*3*3]; { auto lightVec = MakeVector3(0.f, -0.707f, -0.707f); float dot1 = Vector3::Dot(axis1, lightVec) * fastRSqrt(axis1.GetPoweredLength()); float dot2 = Vector3::Dot(axis2, lightVec) * fastRSqrt(axis2.GetPoweredLength()); float dot3 = Vector3::Dot(axis3, lightVec) * fastRSqrt(axis3.GetPoweredLength()); for(int x = 0; x < 3; x++){ float d; int cnt; switch(x){ case 0: d = -dot1; cnt = 1; break; case 1: d = 0.f; cnt = 0; break; case 2: d = dot1; cnt = 1; break; } for(int y = 0; y < 3; y++){ auto d2 = d; auto cnt2 = cnt; switch(y){ case 0: d2 -= dot2; cnt2++; break; case 1: break; case 2: d2 += dot2; cnt2++; break; } for(int z = 0; z < 3; z++) { auto d3 = d2; auto cnt3 = cnt2; switch(z){ case 0: d3 -= dot3; cnt3++; break; case 1: break; case 2: d3 += dot3; cnt3++; break; } switch(cnt3){ case 2: d3 *= 0.707f; break; case 3: d3 *= 0.57735f; break; } d3 = 192.f + d3 * 62.f; brights[x + y * 3 + z * 9] = static_cast<uint8_t>(d3); } } } } // compute center coord. for culling { auto center = origin; auto localCenter = model->GetCenter(); center += axis1 * localCenter.x; center += axis2 * localCenter.y; center += axis3 * localCenter.z; float largestAxis = axis1.GetPoweredLength(); largestAxis = std::max(largestAxis, axis2.GetPoweredLength()); largestAxis = std::max(largestAxis, axis3.GetPoweredLength()); if(!r->SphereFrustrumCull(center, model->GetRadius() * sqrtf(largestAxis))) return; } Bitmap *fbmp = r->fb; auto *fb = fbmp->GetPixels(); int fw = fbmp->GetWidth(); int fh = fbmp->GetHeight(); auto *db = r->depthBuffer.data(); Matrix4 viewproj = r->GetProjectionViewMatrix(); Vector4 ndc2scrscale = {fw * 0.5f, -fh * 0.5f, 1.f, 1.f}; //Vector4 ndc2scroff = {fw * 0.5f, fh * 0.5f, 0.f, 0.f}; int ndc2scroffX = fw >> 1; int ndc2scroffY = fh >> 1; // render each point auto tOrigin = viewproj * MakeVector4(origin.x, origin.y, origin.z, 1.f); auto tAxis1 = viewproj * MakeVector4(axis1.x, axis1.y, axis1.z, 0.f); auto tAxis2 = viewproj * MakeVector4(axis2.x, axis2.y, axis2.z, 0.f); auto tAxis3 = viewproj * MakeVector4(axis3.x, axis3.y, axis3.z, 0.f); tOrigin *= ndc2scrscale; tAxis1 *= ndc2scrscale; tAxis2 *= ndc2scrscale; tAxis3 *= ndc2scrscale; float pointDiameter;// = largestAxis * 0.55f * fh * 0.5f; { float largestAxis = tAxis1.GetPoweredLength(); largestAxis = std::max(largestAxis, tAxis2.GetPoweredLength()); largestAxis = std::max(largestAxis, tAxis3.GetPoweredLength()); pointDiameter = sqrtf(largestAxis); } uint32_t customColor; customColor = ToFixed8(param.customColor.z) | (ToFixed8(param.customColor.y) << 8) | (ToFixed8(param.customColor.x) << 16); auto v1 = tOrigin; float zNear = r->sceneDef.zNear; for(int x = 0; x < w; x++) { auto v2 = v1; for(int y = 0; y < h; y++) { auto *mp = &model->renderData [model->renderDataAddr[x + y * w]]; while(*mp != -1) { uint32_t data = *(mp++); uint32_t normal = *(mp++); int z = static_cast<int>(data >> 24); //SPAssert(z < d); SPAssert(z >= 0); auto vv = v2 + tAxis3 * 
zvals[z]; if(vv.z < zNear) continue; // save Z value (don't divide this by W!) float zval = vv.z; // use vv.z for point radius to be divided by W vv.z = pointDiameter; // perspective division float scl = fastRcp(vv.w); vv *= scl; int ix = static_cast<int>(vv.x) + ndc2scroffX; int iy = static_cast<int>(vv.y) + ndc2scroffY; int idm = static_cast<int>(vv.z + .99f); idm = std::max(1, idm); int minX = ix - (idm >> 1); int minY = iy - (idm >> 1); if(minX >= fw || minY >= fh) continue; int maxX = ix + idm; int maxY = iy + idm; if(maxX <= 0 || maxY <= 0) continue; minX = std::max(minX, 0); minY = std::max(minY, 0); maxX = std::min(maxX, fw); maxY = std::min(maxY, fh); auto *fb2 = fb + (minX + minY * fw); auto *db2 = db + (minX + minY * fw); int w = maxX - minX; uint32_t color = data & 0xffffff; if(color == 0) color = customColor; SPAssert(normal < 27); int bright = brights[normal]; #if ENABLE_SSE2 if(lvl == SWFeatureLevel::SSE2) { auto m = _mm_setr_epi32(color, 0, 0, 0); auto f = _mm_set1_epi16(bright << 8); m = _mm_unpacklo_epi8(m, _mm_setzero_si128()); m = _mm_mulhi_epu16(m, f); m = _mm_packus_epi16(m, m); _mm_store_ss(reinterpret_cast<float*>(&color), _mm_castsi128_ps(m)); }else #endif { uint32_t c1 = color & 0xff00; uint32_t c2 = color & 0xff00ff; c1 *= bright; c2 *= bright; color = ((c1&0xff0000) | (c2&0xff00ff00)) >> 8; } for(int yy = minY; yy < maxY; yy++){ auto *fb3 = fb2; auto *db3 = db2; for(int xx = w; xx > 0; xx--) { if(zval < *db3) { *db3 = zval; *fb3 = color; } fb3++; db3++; } fb2 += fw; db2 += fw; } } v2 += tAxis2; } v1 += tAxis1; } }
void mlib_s_ImageBlendLine( mlib_work_image * param, mlib_u8 *dp, __m128i * buffz, __m128i * buffd) { mlib_blend blend = param->blend; mlib_s32 chan_d = param->chan_d; mlib_s32 chan_s = param->channels; mlib_d64 alp = (param->alpha) * (1.0 / 255); mlib_s32 width = GetElemSubStruct(current, width); mlib_u8 *tdp = dp; mlib_s32 width2, y_step, next_step = 2; mlib_s32 alp_ind = param->alp_ind, mask255; __m128i aa, dalp, done; __m128i mzero, mask_7fff, mask_8000, amask, amask256, amaskffff; __m128i d_rnd; mlib_s32 i, j; if (!alp_ind) { d_rnd = _mm_set1_epi16(0x0080); tdp = (void *)dp; if (chan_d == 3) tdp = (void *)buffd; for (i = 0; i < width / 2; i++) { __m128i dd; dd = buffz[i]; dd = _mm_adds_epu16(dd, d_rnd); dd = _mm_srli_epi16(dd, 8); dd = _mm_packus_epi16(dd, dd); _mm_storel_epi64((void *)(tdp + 8 * i), dd); } if (width & 1) { __m128i dd; dd = buffz[i]; dd = _mm_adds_epu16(dd, d_rnd); dd = _mm_srli_epi16(dd, 8); dd = _mm_packus_epi16(dd, dd); *(mlib_s32 *)(tdp + 8 * i) = *(mlib_s32 *)&dd; } if (chan_d == 3) { mlib_s_ImageChannelExtract_U8_43L_D1((void *)buffd, dp, width); } return; } width2 = (width + 1) / 2; mzero = _mm_setzero_si128(); mask_7fff = _mm_set1_epi16(0x7FFF); mask_8000 = _mm_set1_epi16(0x8000); done = _mm_set1_epi16(1 << 15); if (alp_ind == -1) { mask255 = 0xFF; amask = _mm_setr_epi32(0xff00, 0, 0xff00, 0); amaskffff = _mm_setr_epi32(0xffff, 0, 0xffff, 0); amask256 = _mm_setr_epi32(0x0100, 0, 0x0100, 0); } else { mask255 = 0xFF000000; amask = _mm_setr_epi32(0, 0xff000000, 0, 0xff000000); amaskffff = _mm_setr_epi32(0, 0xffff0000, 0, 0xffff0000); amask256 = _mm_setr_epi32(0, 0x01000000, 0, 0x01000000); } dalp = _mm_set1_epi16((1 << 15) * alp + 0.5); if (chan_s == 3) { if (chan_d == 3) { mlib_d64 alp = (param->alpha) * (1.0 / 255); mlib_s32 ialp; mlib_u8 *pz; __m128i emask; __m128i dalp, ralp, ss, dd, s0, s1, d0, d1, dr; mlib_s_ImageChannelExtract_S16_43L_D1((void *)buffz, (void *)buffd, width); ialp = alp * (1 << 15); dalp = _mm_set1_epi16(ialp); ralp = _mm_set1_epi16((1 << 15) - ialp); emask = mlib_emask_m128i[(3 * width) & 15].m128i; pz = (void *)buffd; tdp = dp; for (i = 0; i <= 3 * width - 16; i += 16) { s0 = _mm_load_si128((__m128i *) (pz + 2 * i)); s1 = _mm_load_si128((__m128i *) (pz + 2 * i + 16)); dd = _mm_loadu_si128((__m128i *) (tdp + i)); d0 = _mm_unpacklo_epi8(mzero, dd); d1 = _mm_unpackhi_epi8(mzero, dd); d0 = _mm_add_epi16(_mm_mulhi_epu16(s0, dalp), _mm_mulhi_epu16(d0, ralp)); d1 = _mm_add_epi16(_mm_mulhi_epu16(s1, dalp), _mm_mulhi_epu16(d1, ralp)); d0 = _mm_srli_epi16(d0, 7); d1 = _mm_srli_epi16(d1, 7); dr = _mm_packus_epi16(d0, d1); _mm_storeu_si128((__m128i *) (tdp + i), dr); } if (i < 3 * width) { s0 = _mm_load_si128((__m128i *) (pz + 2 * i)); s1 = _mm_load_si128((__m128i *) (pz + 2 * i + 16)); dd = _mm_loadu_si128((__m128i *) (tdp + i)); d0 = _mm_unpacklo_epi8(mzero, dd); d1 = _mm_unpackhi_epi8(mzero, dd); d0 = _mm_add_epi16(_mm_mulhi_epu16(s0, dalp), _mm_mulhi_epu16(d0, ralp)); d1 = _mm_add_epi16(_mm_mulhi_epu16(s1, dalp), _mm_mulhi_epu16(d1, ralp)); d0 = _mm_srli_epi16(d0, 7); d1 = _mm_srli_epi16(d1, 7); dr = _mm_packus_epi16(d0, d1); dr = _mm_or_si128(_mm_and_si128(emask, dr), _mm_andnot_si128(emask, dd)); _mm_storeu_si128((__m128i *) (tdp + i), dr); } } else if (blend == MLIB_BLEND_GTK_SRC) { mlib_u8 *buffi = (mlib_u8 *)buffz + 1; for (i = 0; i < width; i++) { tdp[0] = buffi[0]; tdp[1] = buffi[2]; tdp[2] = buffi[4]; tdp[alp_ind] = 255; tdp += 4; buffi += 8; } } else { mlib_d64 _w0 = param->alpha; mlib_d64 _w1s = 1.0 - _w0 * (1.0 / 255); __m128i buff[1]; 
__m128i done; __m128i dalp, ralp, ss, dd, s0, s1, d0, d1, a0, a1, r0, r1, rr, dr; __m128i wi, aa, amask; __m128 af, w0, w1, w1s, w, rw, w0r, w1r, scale; done = _mm_set1_epi16(1 << 15); amask = _mm_set1_epi32(mask255); w0 = _mm_set_ps1(_w0); w1s = _mm_set_ps1(_w1s); scale = _mm_set_ps1(1 << 15); if (alp_ind == -1) { tdp--; for (i = 0; i < width / 4; i++) { BLEND34_SRC_OVER(0); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND34_SRC_OVER(0); buff[0] = dr; } } else { for (i = 0; i < width / 4; i++) { BLEND34_SRC_OVER(3); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND34_SRC_OVER(3); buff[0] = dr; } } for (i = 0; i < (width & 3); i++) { ((mlib_s32 *)tdp)[i] = ((mlib_s32 *)buff)[i]; } } } else if (chan_d == 3) { if (blend != MLIB_BLEND_GTK_SRC) { if (alp_ind == -1) { tdp--; } for (i = 0; i < width; i++) { ((mlib_s32 *)buffd)[i] = *(mlib_s32 *)(tdp + 3 * i); } if (alp_ind == -1) { for (i = 0; i < width2; i++) { __m128i a0, s0, d0, dd; BLEND43_SRC_OVER(0); } mlib_s_ImageChannelExtract_U8_43R_D1((void *) buffd, dp, width); } else { for (i = 0; i < width2; i++) { __m128i a0, s0, d0, dd; BLEND43_SRC_OVER(0xff); } mlib_s_ImageChannelExtract_U8_43L_D1((void *) buffd, dp, width); } } else { mlib_u8 *buffi = (mlib_u8 *)buffz + 1; if (alp_ind == -1) buffi += 2; for (i = 0; i < width; i++) { tdp[0] = buffi[0]; tdp[1] = buffi[2]; tdp[2] = buffi[4]; tdp += 3; buffi += 8; } } } else { /* if (chan_d == 4) */ if (alp_ind == -1) { tdp--; } if (blend == MLIB_BLEND_GTK_SRC) { mlib_u8 *p_alp = (mlib_u8 *)buffz + 1; mlib_s32 tail = ((mlib_s32 *)tdp)[width]; if (alp_ind != -1) p_alp += 6; for (i = 0; i < width2; i++) { __m128i a0, a1, aa, ss, d0, dd; ss = buffz[i]; a0 = _mm_loadl_epi64((void *)((mlib_d64 *) mlib_m_tbl_255DivAlpha + p_alp[0])); a1 = _mm_loadl_epi64((void *)((mlib_d64 *) mlib_m_tbl_255DivAlpha + p_alp[8])); aa = _mm_unpacklo_epi64(a0, a1); aa = _mm_or_si128(amask256, _mm_andnot_si128(amaskffff, aa)); d0 = _mm_mulhi_epu16(ss, aa); dd = _mm_packus_epi16(d0, d0); _mm_storel_epi64((void *)(tdp + 8 * i), dd); p_alp += 16; } ((mlib_s32 *)tdp)[width] = tail; } else { mlib_blend blend = param->blend; mlib_d64 alp = (param->alpha) * (1.0 / 255); __m128i buff[1]; __m128i done; __m128i ss, dd, s0, s1, d0, d1, a0, a1, r0, r1, rr, dr; __m128i wi, aa, amask, a16mask, zero_mask_i; __m128 dalp, div255, alpha, fone; __m128 af, sf, w0, w1, w1s, w, rw, w0r, w1r, scale; __m128 zero_mask, f_rnd; mlib_m128 s0u, s1u, s2u, s3u; done = _mm_set1_epi16(1 << 14); amask = _mm_set1_epi32(mask255); a16mask = _mm_set1_epi32(0xFFFF); dalp = _mm_set_ps1(alp * (1.0 / 256)); fone = _mm_set_ps1(1.0); div255 = _mm_set_ps1(1.0 / 255); scale = _mm_set_ps1(1 << 8); alpha = _mm_set_ps1((float)(param->alpha) + 0.5); f_rnd = _mm_set_ps1(0.6); if (blend == MLIB_BLEND_GTK_SRC_OVER2) { if (alp_ind == -1) { for (i = 0; i < width / 4; i++) { BLEND44(SRC_OVER2, 0); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND44(SRC_OVER2, 0); buff[0] = dr; } } else { for (i = 0; i < width / 4; i++) { BLEND44(SRC_OVER2, 3); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND44(SRC_OVER2, 3); buff[0] = dr; } } } else { if (alp_ind == -1) { for (i = 0; i < width / 4; i++) { BLEND44(SRC_OVER, 0); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND44(SRC_OVER, 0); buff[0] = dr; } } else { for (i = 0; i < width / 4; i++) { BLEND44(SRC_OVER, 3); _mm_storeu_si128((__m128i *) tdp, dr); tdp += 16; } if (width & 3) { BLEND44(SRC_OVER, 3); buff[0] = dr; 
} } } for (i = 0; i < (width & 3); i++) { ((mlib_s32 *)tdp)[i] = ((mlib_s32 *)buff)[i]; } } } }
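/*
 * Note: the BLEND34_SRC_OVER / BLEND43_SRC_OVER / BLEND44 macros used in the
 * fragment above are defined elsewhere in mediaLib and are not shown here.
 * Purely as a hedged sketch of the general pattern such code relies on --
 * pre-shifting the blend weights into the high byte of each 16-bit lane so
 * that _mm_mulhi_epu16 acts as a multiply-and-shift -- the hypothetical helper
 * below blends one 16-byte row of 8-bit channels as
 * dst = (src * a + dst * (255 - a)) >> 8, with a in [0, 255].  It illustrates
 * the technique only; it is not mediaLib code.
 */
#include <emmintrin.h>
#include <stdint.h>

static void blend_row_src_over(const uint8_t *src, uint8_t *dst, unsigned a)
{
    const __m128i zero = _mm_setzero_si128();
    const __m128i wa = _mm_set1_epi16((short)(a << 8));          /* a       in 8.8 */
    const __m128i wd = _mm_set1_epi16((short)((255 - a) << 8));  /* 255 - a in 8.8 */

    __m128i s = _mm_loadu_si128((const __m128i *)src);
    __m128i d = _mm_loadu_si128((const __m128i *)dst);

    /* widen bytes to 16 bits, low half then high half */
    __m128i s0 = _mm_unpacklo_epi8(s, zero);
    __m128i d0 = _mm_unpacklo_epi8(d, zero);
    __m128i s1 = _mm_unpackhi_epi8(s, zero);
    __m128i d1 = _mm_unpackhi_epi8(d, zero);

    /* (x * (w << 8)) >> 16 == (x * w) >> 8 */
    __m128i r0 = _mm_add_epi16(_mm_mulhi_epu16(s0, wa), _mm_mulhi_epu16(d0, wd));
    __m128i r1 = _mm_add_epi16(_mm_mulhi_epu16(s1, wa), _mm_mulhi_epu16(d1, wd));

    _mm_storeu_si128((__m128i *)dst, _mm_packus_epi16(r0, r1));
}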
__m128i test (__m128i s1, __m128i s2) { return _mm_mulhi_epu16 (s1, s2); }
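/*
 * For reference, a small standalone check of the semantics the one-line
 * wrapper above exposes: per 16-bit lane, _mm_mulhi_epu16 returns the high
 * half of the 32-bit unsigned product, i.e. (uint16_t)(((uint32_t)a * b) >> 16).
 * The test values are arbitrary.
 */
#include <emmintrin.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    uint16_t a[8] = { 0, 1, 255, 256, 1000, 12345, 54321, 65535 };
    uint16_t b[8] = { 0, 65535, 65535, 257, 1000, 54321, 12345, 65535 };
    uint16_t r[8];
    int i;
    __m128i va = _mm_loadu_si128((const __m128i *)a);
    __m128i vb = _mm_loadu_si128((const __m128i *)b);

    _mm_storeu_si128((__m128i *)r, _mm_mulhi_epu16(va, vb));

    for (i = 0; i < 8; i++) {
        uint16_t ref = (uint16_t)(((uint32_t)a[i] * b[i]) >> 16);
        printf("%5u * %5u -> hi16 = %5u (scalar %5u)\n", a[i], b[i], r[i], ref);
    }
    return 0;
}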
mlib_status
mlib_VideoColorJFIFYCC2ABGR444_naligned(
    mlib_u8 *abgr,
    const mlib_u8 *y,
    const mlib_u8 *cb,
    const mlib_u8 *cr,
    mlib_s32 n)
{
    /* 1.402 * 8192 */
    const __m128i x_c13 = _mm_set1_epi16(0x2cdd);
    const mlib_s32 c13 = 0x2cdd;

    /* abs(-0.34414) * 8192 */
    const __m128i x_c22 = _mm_set1_epi16(0xb03);
    const mlib_s32 c22 = 0xb03;

    /* abs(-0.71414) * 8192 */
    const __m128i x_c23 = _mm_set1_epi16(0x16da);
    const mlib_s32 c23 = 0x16da;

    /* 1.772 * 8192 */
    const __m128i x_c32 = _mm_set1_epi16(0x38b4);
    const mlib_s32 c32 = 0x38b4;

    /* -179.456 * 32 */
    const __m128i x_coff0 = _mm_set1_epi16(0xe991);
    const mlib_s32 coff0 = (mlib_s32)0xffffe991;

    /* 135.45984 * 32 */
    const __m128i x_coff1 = _mm_set1_epi16(0x10ef);
    const mlib_s32 coff1 = 0x10ef;

    /* -226.816 * 32 */
    const __m128i x_coff2 = _mm_set1_epi16(0xe3a6);
    const mlib_s32 coff2 = (mlib_s32)0xffffe3a6;

    const __m128i x_a = _mm_set1_epi8(0xff);
    const __m128i x_zero = _mm_setzero_si128();

    /* __m128i variables */
    __m128i x_y, x_cb, x_cr, x_r, x_g, x_b, x_temp;
    __m128i x_y1, x_cb1, x_cr1, x_y2, x_cb2, x_cr2;
    __m128i x_r1, x_r2, x_g1, x_g2, x_b1, x_b2;
    __m128i x_abgrl, x_abgrh, x_grl, x_grh, x_abl, x_abh;

    /* pointers */
    __m128i *px_y, *px_cb, *px_cr, *px_abgr;
    mlib_u8 *pabgr;

    /* other var */
    mlib_s32 i, iTemp, iy1, icb1, icr1, ir1, ig1, ib1;

    px_y = (__m128i *)y;
    px_cb = (__m128i *)cb;
    px_cr = (__m128i *)cr;
    px_abgr = (__m128i *)abgr;

    i = 0;

#ifdef __SUNPRO_C
#pragma pipeloop(0)
#endif /* __SUNPRO_C */
    for (; i <= n - 16; i += 16) {
        x_y = _mm_loadu_si128(px_y);
        px_y++;
        x_y1 = _mm_unpacklo_epi8(x_y, x_zero);
        x_y2 = _mm_unpackhi_epi8(x_y, x_zero);
        x_cb = _mm_loadu_si128(px_cb);
        px_cb++;
        x_cb1 = _mm_unpacklo_epi8(x_zero, x_cb);
        x_cb2 = _mm_unpackhi_epi8(x_zero, x_cb);
        x_cr = _mm_loadu_si128(px_cr);
        px_cr++;
        x_cr1 = _mm_unpacklo_epi8(x_zero, x_cr);
        x_cr2 = _mm_unpackhi_epi8(x_zero, x_cr);

        /* lower half */
        x_temp = _mm_mulhi_epu16(x_cr1, x_c13);
        x_r1 = _mm_add_epi16(x_temp, x_coff0);
        x_temp = _mm_srai_epi16(x_r1, 5);
        x_r1 = _mm_add_epi16(x_temp, x_y1);

        x_temp = _mm_mulhi_epu16(x_cb1, x_c22);
        x_g1 = _mm_mulhi_epu16(x_cr1, x_c23);
        x_temp = _mm_add_epi16(x_temp, x_g1);
        x_g1 = _mm_sub_epi16(x_coff1, x_temp);
        x_temp = _mm_srai_epi16(x_g1, 5);
        x_g1 = _mm_add_epi16(x_temp, x_y1);

        x_temp = _mm_mulhi_epu16(x_cb1, x_c32);
        x_b1 = _mm_add_epi16(x_temp, x_coff2);
        x_temp = _mm_srai_epi16(x_b1, 5);
        x_b1 = _mm_add_epi16(x_temp, x_y1);

        /* upper half */
        x_temp = _mm_mulhi_epu16(x_cr2, x_c13);
        x_r2 = _mm_add_epi16(x_temp, x_coff0);
        x_temp = _mm_srai_epi16(x_r2, 5);
        x_r2 = _mm_add_epi16(x_temp, x_y2);

        x_temp = _mm_mulhi_epu16(x_cb2, x_c22);
        x_g2 = _mm_mulhi_epu16(x_cr2, x_c23);
        x_temp = _mm_add_epi16(x_temp, x_g2);
        x_g2 = _mm_sub_epi16(x_coff1, x_temp);
        x_temp = _mm_srai_epi16(x_g2, 5);
        x_g2 = _mm_add_epi16(x_temp, x_y2);

        x_temp = _mm_mulhi_epu16(x_cb2, x_c32);
        x_b2 = _mm_add_epi16(x_temp, x_coff2);
        x_temp = _mm_srai_epi16(x_b2, 5);
        x_b2 = _mm_add_epi16(x_temp, x_y2);

        /* pack */
        x_b = _mm_packus_epi16(x_b1, x_b2);
        x_r = _mm_packus_epi16(x_r1, x_r2);
        x_g = _mm_packus_epi16(x_g1, x_g2);

        /* create rgb sequences */
        x_abl = _mm_unpacklo_epi8(x_a, x_b);
        x_abh = _mm_unpackhi_epi8(x_a, x_b);
        x_grl = _mm_unpacklo_epi8(x_g, x_r);
        x_grh = _mm_unpackhi_epi8(x_g, x_r);

        /* save */
        x_abgrl = _mm_unpacklo_epi16(x_abl, x_grl);
        _mm_storeu_si128(px_abgr++, x_abgrl);
        x_abgrh = _mm_unpackhi_epi16(x_abl, x_grl);
        _mm_storeu_si128(px_abgr++, x_abgrh);
        x_abgrl = _mm_unpacklo_epi16(x_abh, x_grh);
        _mm_storeu_si128(px_abgr++, x_abgrl);
        x_abgrh = _mm_unpackhi_epi16(x_abh, x_grh);
        _mm_storeu_si128(px_abgr++, x_abgrh);
    }

    if (i <= n - 8) {
        x_y = _mm_loadl_epi64(px_y);
        px_y = (__m128i *) (((__m64 *)px_y) + 1);
        x_y1 = _mm_unpacklo_epi8(x_y, x_zero);
        x_cb = _mm_loadl_epi64(px_cb);
        px_cb = (__m128i *) (((__m64 *)px_cb) + 1);
        x_cb1 = _mm_unpacklo_epi8(x_zero, x_cb);
        x_cr = _mm_loadl_epi64(px_cr);
        px_cr = (__m128i *) (((__m64 *)px_cr) + 1);
        x_cr1 = _mm_unpacklo_epi8(x_zero, x_cr);

        /* lower half only */
        x_temp = _mm_mulhi_epu16(x_cr1, x_c13);
        x_r1 = _mm_add_epi16(x_temp, x_coff0);
        x_temp = _mm_srai_epi16(x_r1, 5);
        x_r1 = _mm_add_epi16(x_temp, x_y1);

        x_temp = _mm_mulhi_epu16(x_cb1, x_c22);
        x_g1 = _mm_mulhi_epu16(x_cr1, x_c23);
        x_temp = _mm_add_epi16(x_temp, x_g1);
        x_g1 = _mm_sub_epi16(x_coff1, x_temp);
        x_temp = _mm_srai_epi16(x_g1, 5);
        x_g1 = _mm_add_epi16(x_temp, x_y1);

        x_temp = _mm_mulhi_epu16(x_cb1, x_c32);
        x_b1 = _mm_add_epi16(x_temp, x_coff2);
        x_temp = _mm_srai_epi16(x_b1, 5);
        x_b1 = _mm_add_epi16(x_temp, x_y1);

        /* pack */
        x_b = _mm_packus_epi16(x_b1, x_zero);
        x_r = _mm_packus_epi16(x_r1, x_zero);
        x_g = _mm_packus_epi16(x_g1, x_zero);

        /* create rgb sequences */
        x_abl = _mm_unpacklo_epi8(x_a, x_b);
        x_grl = _mm_unpacklo_epi8(x_g, x_r);

        /* save */
        x_abgrl = _mm_unpacklo_epi16(x_abl, x_grl);
        _mm_storeu_si128(px_abgr++, x_abgrl);
        x_abgrh = _mm_unpackhi_epi16(x_abl, x_grl);
        _mm_storeu_si128(px_abgr++, x_abgrh);

        i += 8;
    }

    if (i <= n - 4) {
        iTemp = *((mlib_s32 *)px_y);
        x_y = _mm_cvtsi32_si128(iTemp);
        px_y = (__m128i *) (((mlib_s32 *)px_y) + 1);
        x_y1 = _mm_unpacklo_epi8(x_y, x_zero);
        iTemp = *((mlib_s32 *)px_cb);
        x_cb = _mm_cvtsi32_si128(iTemp);
        px_cb = (__m128i *) (((mlib_s32 *)px_cb) + 1);
        x_cb1 = _mm_unpacklo_epi8(x_zero, x_cb);
        iTemp = *((mlib_s32 *)px_cr);
        x_cr = _mm_cvtsi32_si128(iTemp);
        px_cr = (__m128i *) (((mlib_s32 *)px_cr) + 1);
        x_cr1 = _mm_unpacklo_epi8(x_zero, x_cr);

        /* 64 of lower half only */
        x_temp = _mm_mulhi_epu16(x_cr1, x_c13);
        x_r1 = _mm_add_epi16(x_temp, x_coff0);
        x_temp = _mm_srai_epi16(x_r1, 5);
        x_r1 = _mm_add_epi16(x_temp, x_y1);

        x_temp = _mm_mulhi_epu16(x_cb1, x_c22);
        x_g1 = _mm_mulhi_epu16(x_cr1, x_c23);
        x_temp = _mm_add_epi16(x_temp, x_g1);
        x_g1 = _mm_sub_epi16(x_coff1, x_temp);
        x_temp = _mm_srai_epi16(x_g1, 5);
        x_g1 = _mm_add_epi16(x_temp, x_y1);

        x_temp = _mm_mulhi_epu16(x_cb1, x_c32);
        x_b1 = _mm_add_epi16(x_temp, x_coff2);
        x_temp = _mm_srai_epi16(x_b1, 5);
        x_b1 = _mm_add_epi16(x_temp, x_y1);

        /* pack */
        x_b = _mm_packus_epi16(x_b1, x_zero);
        x_r = _mm_packus_epi16(x_r1, x_zero);
        x_g = _mm_packus_epi16(x_g1, x_zero);

        /* create rgb sequences */
        x_abl = _mm_unpacklo_epi8(x_a, x_b);
        x_grl = _mm_unpacklo_epi8(x_g, x_r);

        /* save */
        x_abgrl = _mm_unpacklo_epi16(x_abl, x_grl);
        _mm_storeu_si128(px_abgr++, x_abgrl);

        i += 4;
    }

    /* pure C implementation */
    pabgr = (mlib_u8 *)px_abgr;
    for (; i < n; i++) {
        iy1 = y[i];
        icb1 = cb[i];
        icr1 = cr[i];

        iTemp = (icr1 * c13) >> 8;
        ir1 = (iTemp + coff0) >> 5;
        ir1 += iy1;

        iTemp = (icb1 * c22) >> 8;
        ig1 = (icr1 * c23) >> 8;
        iTemp += ig1;
        ig1 = coff1 - iTemp;
        iTemp = ig1 >> 5;
        ig1 = iTemp + iy1;

        iTemp = (icb1 * c32) >> 8;
        ib1 = iTemp + coff2;
        iTemp = ib1 >> 5;
        ib1 = iTemp + iy1;

        pabgr[0] = 0xff;
        CLAMP_U8(ib1, pabgr[1]);
        CLAMP_U8(ig1, pabgr[2]);
        CLAMP_U8(ir1, pabgr[3]);
        pabgr += 4;
    }

    return (MLIB_SUCCESS);
}
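/*
 * A hedged sanity check of the fixed-point scheme used above, based on the
 * comments in the source: the chroma coefficients are stored as round(c * 8192)
 * and the offsets as round(off * 32).  Because cb/cr are unpacked into the high
 * byte of each 16-bit lane, _mm_mulhi_epu16 yields (value * coeff) >> 8, which
 * is what the scalar tail loop also computes; after the additional ">> 5" the
 * result approximates the JFIF equation R = Y + 1.402 * (Cr - 128).  The values
 * below are only a spot check, not part of the original function.
 */
#include <stdio.h>

int main(void)
{
    const int c13 = 0x2cdd;             /* 1.402 * 8192 */
    const int coff0 = (int)0xffffe991;  /* -179.456 * 32 == -1.402 * 128 * 32 */
    int y = 128, cr = 200;

    int r_fixed = ((((cr * c13) >> 8) + coff0) >> 5) + y;  /* as in the tail loop */
    double r_float = y + 1.402 * (cr - 128);

    printf("fixed-point R = %d, floating-point R = %.2f\n", r_fixed, r_float);
    return 0;
}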
template<bool align> SIMD_INLINE __m128i AverageRow16(const Buffer & buffer, size_t offset) { return _mm_mulhi_epu16(K16_DIVISION_BY_9_FACTOR, _mm_add_epi16( _mm_add_epi16(K16_0005, Load<align>((__m128i*)(buffer.src0 + offset))), _mm_add_epi16(Load<align>((__m128i*)(buffer.src1 + offset)), Load<align>((__m128i*)(buffer.src2 + offset))))); }
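/*
 * Hedged sketch of the trick AverageRow16 relies on: dividing a sum of nine
 * 8-bit samples by 9 with a multiply-high instead of an integer divide.  The
 * exact K16_DIVISION_BY_9_FACTOR and K16_0005 constants are defined elsewhere
 * in the Simd library; the values below (a factor close to 65536/9 and a small
 * rounding bias of 5) are assumptions used only to show the idea.  The scalar
 * equivalent of _mm_mulhi_epu16(factor, sum + bias) is
 * (uint16_t)(((sum + bias) * factor) >> 16).
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
    const uint32_t factor = 7282;  /* assumed ~= 65536 / 9 */
    const uint32_t bias = 5;       /* assumed rounding bias */
    uint32_t sum, max_err = 0;

    for (sum = 0; sum <= 9 * 255; sum++) {
        uint32_t approx = ((sum + bias) * factor) >> 16;
        uint32_t exact = sum / 9;
        uint32_t err = approx > exact ? approx - exact : exact - approx;

        if (err > max_err)
            max_err = err;
    }
    printf("max |approx - sum/9| over all 3x3 sums: %u\n", (unsigned)max_err);
    return 0;
}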
static uint32_t * ssse3_fetch_bilinear_cover (pixman_iter_t *iter, const uint32_t *mask) { pixman_fixed_t fx, ux; bilinear_info_t *info = iter->data; line_t *line0, *line1; int y0, y1; int32_t dist_y; __m128i vw; int i; fx = info->x; ux = iter->image->common.transform->matrix[0][0]; y0 = pixman_fixed_to_int (info->y); y1 = y0 + 1; line0 = &info->lines[y0 & 0x01]; line1 = &info->lines[y1 & 0x01]; if (line0->y != y0) { ssse3_fetch_horizontal ( &iter->image->bits, line0, y0, fx, ux, iter->width); } if (line1->y != y1) { ssse3_fetch_horizontal ( &iter->image->bits, line1, y1, fx, ux, iter->width); } dist_y = pixman_fixed_to_bilinear_weight (info->y); dist_y <<= (16 - BILINEAR_INTERPOLATION_BITS); vw = _mm_set_epi16 ( dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y, dist_y); for (i = 0; i + 3 < iter->width; i += 4) { __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); __m128i top1 = _mm_load_si128 ((__m128i *)(line0->buffer + i + 2)); __m128i bot1 = _mm_load_si128 ((__m128i *)(line1->buffer + i + 2)); __m128i r0, r1, tmp, p; r0 = _mm_mulhi_epu16 ( _mm_sub_epi16 (bot0, top0), vw); tmp = _mm_cmplt_epi16 (bot0, top0); tmp = _mm_and_si128 (tmp, vw); r0 = _mm_sub_epi16 (r0, tmp); r0 = _mm_add_epi16 (r0, top0); r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ r1 = _mm_mulhi_epu16 ( _mm_sub_epi16 (bot1, top1), vw); tmp = _mm_cmplt_epi16 (bot1, top1); tmp = _mm_and_si128 (tmp, vw); r1 = _mm_sub_epi16 (r1, tmp); r1 = _mm_add_epi16 (r1, top1); r1 = _mm_srli_epi16 (r1, BILINEAR_INTERPOLATION_BITS); r1 = _mm_shuffle_epi32 (r1, _MM_SHUFFLE (2, 0, 3, 1)); /* r1: A3 R3 G3 B3 A2 R2 G2 B2 */ p = _mm_packus_epi16 (r0, r1); _mm_storeu_si128 ((__m128i *)(iter->buffer + i), p); } while (i < iter->width) { __m128i top0 = _mm_load_si128 ((__m128i *)(line0->buffer + i)); __m128i bot0 = _mm_load_si128 ((__m128i *)(line1->buffer + i)); __m128i r0, tmp, p; r0 = _mm_mulhi_epu16 ( _mm_sub_epi16 (bot0, top0), vw); tmp = _mm_cmplt_epi16 (bot0, top0); tmp = _mm_and_si128 (tmp, vw); r0 = _mm_sub_epi16 (r0, tmp); r0 = _mm_add_epi16 (r0, top0); r0 = _mm_srli_epi16 (r0, BILINEAR_INTERPOLATION_BITS); /* r0: A0 R0 A1 R1 G0 B0 G1 B1 */ r0 = _mm_shuffle_epi32 (r0, _MM_SHUFFLE (2, 0, 3, 1)); /* r0: A1 R1 G1 B1 A0 R0 G0 B0 */ p = _mm_packus_epi16 (r0, r0); if (iter->width - i == 1) { *(uint32_t *)(iter->buffer + i) = _mm_cvtsi128_si32 (p); i++; } else { _mm_storel_epi64 ((__m128i *)(iter->buffer + i), p); i += 2; } } info->y += iter->image->common.transform->matrix[1][1]; return iter->buffer; }
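/*
 * Hedged scalar model of the correction step in the loops above.  The vertical
 * interpolation wants top + floor((bot - top) * w / 65536) with a *signed*
 * difference, but _mm_mulhi_epu16 is unsigned: when bot < top the 16-bit
 * difference wraps to (bot - top + 65536), whose unsigned high half is exactly
 * w too large.  The _mm_cmplt_epi16 / _mm_and_si128 / _mm_sub_epi16 sequence
 * therefore subtracts w again, but only in the lanes where bot < top.  The
 * helper and test values below are illustrative, not pixman code; the signed
 * reference assumes an arithmetic right shift.
 */
#include <stdint.h>
#include <stdio.h>

static uint16_t lerp_lane(uint16_t top, uint16_t bot, uint16_t w)
{
    uint16_t diff = (uint16_t)(bot - top);                 /* wraps when bot < top */
    uint16_t hi = (uint16_t)(((uint32_t)diff * w) >> 16);  /* _mm_mulhi_epu16 */

    if ((int16_t)bot < (int16_t)top)                       /* _mm_cmplt_epi16 mask */
        hi = (uint16_t)(hi - w);                           /* remove the +w excess */
    return (uint16_t)(top + hi);
}

int main(void)
{
    int16_t tops[4] = { 0, 100, 4000, 255 };
    int16_t bots[4] = { 50, 20, 1000, 255 };
    uint16_t w = 12345;
    int i;

    for (i = 0; i < 4; i++) {
        /* reference: plain signed arithmetic */
        int32_t ref = tops[i] + ((((int32_t)bots[i] - tops[i]) * (int32_t)w) >> 16);

        printf("top=%d bot=%d -> simd-style %u, signed ref %d\n",
            tops[i], bots[i],
            (unsigned)lerp_lane((uint16_t)tops[i], (uint16_t)bots[i], w), (int)ref);
    }
    return 0;
}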