unsigned int mmx_hash_bucket_data(unsigned char *key, int size, int NoOfItems) { char *p, *end; __m64 v1, v2, s; int val; if (size < 8) return(fnv_data2bucket(key, size, NoOfItems)); p=key; end=key+size; _mm_empty(); // emms v1=_mm_set1_pi32(FNV_INIT_VAL); while ((end-p) > 7) { v2=_mm_setr_pi32(*p,*(p+4)); v1=_mm_add_pi16(v1, v2); v1=_mm_slli_pi32(v1, 3); p+=8; } val=_mm_cvtsi64_si32(v1); _mm_empty(); // emms if (val < 0) val=1-val; val =val % NoOfItems; return(val); }
/* Driver for the vector subtraction under test: computes the expected
   per-lane result with scalar arithmetic and aborts on any mismatch
   with what test() produced.  */
static void
TEST (void)
{
  __m64_union res, a, b, expect;
  int lane;

  a.as_m64 = _mm_setr_pi32 (30, 90);
  b.as_m64 = _mm_setr_pi32 (76, -100);

  res.as_m64 = test (a.as_m64, b.as_m64);

  for (lane = 0; lane < 2; lane++)
    expect.as_int[lane] = a.as_int[lane] - b.as_int[lane];

  if (res.as_m64 != expect.as_m64)
    abort ();
}
/* Driver for the two-__m64-to-float conversion under test: feeds the
   halves {1000, -20000} and {43, 546} and verifies the four resulting
   floats against the precomputed expected vector.  */
static void
TEST (void)
{
  __m64_union lo, hi;
  union128 got;
  /* Expected floats for signed inputs {1000, -20000, 43, 546}.  */
  float expect[4] = {1000.0, -20000.0, 43.0, 546.0};

  lo.as_m64 = _mm_setr_pi32 (1000, -20000);
  hi.as_m64 = _mm_setr_pi32 (43, 546);

  got.x = test (lo.as_m64, hi.as_m64);

  if (check_union128 (got, expect))
    abort ();
}
void r_dimpatchD_MMX(const DCanvas *const cvs, argb_t color, int alpha, int x1, int y1, int w, int h) { int x, y, i; argb_t *line; int invAlpha = 256 - alpha; int dpitch = cvs->pitch / sizeof(DWORD); line = (argb_t *)cvs->buffer + y1 * dpitch; int batches = w / 2; int remainder = w & 1; // MMX temporaries: const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff); const __m64 blendAlpha = _mm_set_pi16(0, alpha, alpha, alpha); const __m64 blendInvAlpha = _mm_set_pi16(0, invAlpha, invAlpha, invAlpha); const __m64 blendColor = _mm_set_pi16(0, RPART(color), GPART(color), BPART(color)); const __m64 blendMult = _mm_mullo_pi16(blendColor, blendAlpha); for (y = y1; y < y1 + h; y++) { // MMX optimize the bulk in batches of 2 colors: for (i = 0, x = x1; i < batches; ++i, x += 2) { #if 1 const __m64 input = _mm_setr_pi32(line[x + 0], line[x + 1]); #else // NOTE(jsd): No guarantee of 64-bit alignment; cannot use. const __m64 input = *((__m64 *)line[x]); #endif const __m64 output = blend2vs1_mmx(input, blendMult, blendInvAlpha, upper8mask); #if 1 line[x+0] = _mm_cvtsi64_si32(_mm_srli_si64(output, 32*0)); line[x+1] = _mm_cvtsi64_si32(_mm_srli_si64(output, 32*1)); #else // NOTE(jsd): No guarantee of 64-bit alignment; cannot use. *((__m64 *)line[x]) = output; #endif } if (remainder) { // Pick up the remainder: for (; x < x1 + w; x++) { line[x] = alphablend1a(line[x], color, alpha); } } line += dpitch; } // Required to reset FP: _mm_empty(); }
// Blend four translucent columns: reads 4 source palette indices, maps
// them to ARGB via the current colormap, and alpha-composites them over
// the 4 existing destination pixels using the weights fga (foreground)
// and bga (background). Pixels are processed two at a time in MMX
// registers.
void rtv_lucent4cols_MMX(byte *source, argb_t *dest, int bga, int fga)
{
	// MMX temporaries (these are 64-bit MMX registers, not SSE2):
	// channel mask and the two per-channel alpha weight vectors.
	const __m64 upper8mask = _mm_set_pi16(0, 0xff, 0xff, 0xff);
	const __m64 fgAlpha = _mm_set_pi16(0, fga, fga, fga);
	const __m64 bgAlpha = _mm_set_pi16(0, bga, bga, bga);
#if 1
	// Scalar 32-bit loads: dest has no 64-bit alignment guarantee.
	const __m64 bgColors01 = _mm_setr_pi32(dest[0], dest[1]);
#else
	const __m64 bgColors01 = *((__m64 *)&dest[0]);
#endif
	// Foreground pixels 0-1, mapped through the colormap.
	const __m64 fgColors01 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[0]),
		rt_mapcolor<argb_t>(dcol.colormap, source[1])
	);
	// For each pixel pair: unpack bytes to 16-bit channels (lo half =
	// pixel 0, hi half = pixel 1), mask to 8-bit channel values, weight
	// bg/fg by their alphas, sum, shift back down by 8, then saturate-pack
	// the two results into 8 bytes.
	const __m64 finalColors01 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors01, bgColors01), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors01, fgColors01), upper8mask), fgAlpha)
			),
			8
		)
	);
#if 1
	const __m64 bgColors23 = _mm_setr_pi32(dest[2], dest[3]);
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	const __m64 bgColors23 = *((__m64 *)&dest[2]);
#endif
	// Foreground pixels 2-3, mapped through the colormap.
	const __m64 fgColors23 = _mm_setr_pi32(
		rt_mapcolor<argb_t>(dcol.colormap, source[2]),
		rt_mapcolor<argb_t>(dcol.colormap, source[3])
	);
	// Same blend pipeline as pixels 0-1, applied to pixels 2-3.
	const __m64 finalColors23 = _mm_packs_pu16(
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpacklo_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		),
		_mm_srli_pi16(
			_mm_adds_pi16(
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(bgColors23, bgColors23), upper8mask), bgAlpha),
				_mm_mullo_pi16(_mm_and_si64(_mm_unpackhi_pi8(fgColors23, fgColors23), upper8mask), fgAlpha)
			),
			8
		)
	);
#if 1
	// Store the four blended pixels with scalar 32-bit extracts
	// (pixel = low dword after shifting the wanted lane down).
	dest[0] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*0));
	dest[1] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors01, 32*1));
	dest[2] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*0));
	dest[3] = _mm_cvtsi64_si32(_mm_srli_si64(finalColors23, 32*1));
#else
	// NOTE(jsd): No guarantee of 64-bit alignment; cannot use.
	*((__m64 *)&dest[0]) = finalColors01;
	*((__m64 *)&dest[2]) = finalColors23;
#endif
	// Required to reset FP:
	_mm_empty();
}
// Codegen test: building an __m64 from two ints with _mm_setr_pi32
// should lower to two insertelement ops on a <2 x i32> vector; the
// FileCheck directives below verify the emitted IR. (Do not edit the
// CHECK lines — the test harness matches them literally.)
__m64 test_mm_setr_pi32(int a, int b) {
  // CHECK-LABEL: test_mm_setr_pi32
  // CHECK: insertelement <2 x i32>
  // CHECK: insertelement <2 x i32>
  return _mm_setr_pi32(a, b);
}