#include <stdio.h>
#include <mmintrin.h>

__m64 a;
int k = 0x12345678;   /* example value; the original snippet leaves k uninitialised */

int main(void)
{
    a = _m_from_int(k);
    /* print both 32-bit halves of the __m64; the high half is zeroed */
    printf("int=%8.8x m=%8.8x%8.8x\n",
           k, ((unsigned *)&a)[1], ((unsigned *)&a)[0]);
    _m_empty();
    return 0;
}
__m64 test90(int a) {
  // CHECK: movd
  return _m_from_int(a);
}
__m64 test_m_from_int(int a) {
  // CHECK-LABEL: test_m_from_int
  // CHECK: insertelement <2 x i32>
  return _m_from_int(a);
}
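For context, a minimal standalone sketch (not part of the tests above) of the behaviour these checks target: _m_from_int zero-extends its 32-bit argument into the low half of an __m64, and _mm_cvtsi32_si64 is the same operation under its alternate name, so both lower to a movd.

#include <assert.h>
#include <mmintrin.h>

int main(void)
{
    __m64 a = _m_from_int(0x12345678);
    __m64 b = _mm_cvtsi32_si64(0x12345678);   /* alias for the same operation */

    assert(_m_to_int(a) == 0x12345678);       /* low 32 bits round-trip */
    assert(_m_to_int(a) == _m_to_int(b));     /* both names agree */

    _m_empty();   /* clear MMX state before any x87 floating point */
    return 0;
}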
/*
 * Fill a surface with a gradient which is generated by bilinearly
 * interpolating between four corner color values.  Can take a source
 * surface and multiply it into the gradient, but if 'src' is NULL, it
 * will generate the gradient without multiplying.
 */
static void fillBlend(SDL_Surface *dst, SDL_Surface *src, BROGUE_DRAW_COLOR *color)
{
    int x, y;
    int lr, lg, lb, rr, rg, rb;
    int ldr, ldg, ldb, rdr, rdg, rdb;
    int w, h;

    BROGUE_DRAW_COLOR ul = color[0];
    BROGUE_DRAW_COLOR ur = color[1];
    BROGUE_DRAW_COLOR bl = color[2];
    BROGUE_DRAW_COLOR br = color[3];

#if defined(__MMX__)
    int mmx = SDL_HasMMX();
#endif

    w = dst->w;
    h = dst->h;

    if (src != NULL) {
        assert(dst->w == src->w);
        assert(dst->h == src->h);
    }

    lr = clamp(ul.red * 0xFFFF, 0, 0xFFFF);
    lg = clamp(ul.green * 0xFFFF, 0, 0xFFFF);
    lb = clamp(ul.blue * 0xFFFF, 0, 0xFFFF);

    rr = clamp(ur.red * 0xFFFF, 0, 0xFFFF);
    rg = clamp(ur.green * 0xFFFF, 0, 0xFFFF);
    rb = clamp(ur.blue * 0xFFFF, 0, 0xFFFF);

    ldr = (clamp(bl.red * 0xFFFF, 0, 0xFFFF) - lr) / h;
    ldg = (clamp(bl.green * 0xFFFF, 0, 0xFFFF) - lg) / h;
    ldb = (clamp(bl.blue * 0xFFFF, 0, 0xFFFF) - lb) / h;

    rdr = (clamp(br.red * 0xFFFF, 0, 0xFFFF) - rr) / h;
    rdg = (clamp(br.green * 0xFFFF, 0, 0xFFFF) - rg) / h;
    rdb = (clamp(br.blue * 0xFFFF, 0, 0xFFFF) - rb) / h;

    for (y = 0; y < h; y++) {
        unsigned char *pix;
        int dr, dg, db;
        int rpp, gpp, bpp, raccum, gaccum, baccum;

        pix = (unsigned char *)dst->pixels + dst->pitch * y;

        dr = rr - lr;
        dg = rg - lg;
        db = rb - lb;

        rpp = dr / w;
        gpp = dg / w;
        bpp = db / w;

        raccum = lr;
        gaccum = lg;
        baccum = lb;

        lr += ldr;
        lg += ldg;
        lb += ldb;

        rr += rdr;
        rg += rdg;
        rb += rdb;

        if (src != NULL) {
            unsigned char *src_pix = (unsigned char *)src->pixels + src->pitch * y;

            x = w;

#if defined(__MMX__)
            /* MMX is significantly faster.  Use it if the CPU supports it */
            if (mmx) {
                __m64 mmx_zero = _m_from_int(0);
                long long ll_color = ((long long)0xFFFF << 48)
                    | ((long long)raccum << 32)
                    | ((long long)gaccum << 16)
                    | ((long long)baccum);
                __m64 mmx_color = *(__m64 *)&ll_color;
                long long ll_pp = ((long long)(rpp & 0xFFFF) << 32)
                    | ((long long)(gpp & 0xFFFF) << 16)
                    | ((long long)(bpp & 0xFFFF));
                __m64 mmx_pp = *(__m64 *)&ll_pp;

                while (x >= 2) {
                    __m64 src_pair = *(__m64 *)src_pix;

                    /* Separate the left pixel and right pixel */
                    __m64 left_pix = _mm_unpacklo_pi8(src_pair, mmx_zero);
                    __m64 right_pix = _mm_unpackhi_pi8(src_pair, mmx_zero);

                    /* Multiply the left source by the gradient color */
                    left_pix = _mm_mullo_pi16(left_pix, _mm_srli_pi16(mmx_color, 8));

                    /* Advance the gradient color for the next pixel */
                    mmx_color = _mm_add_pi16(mmx_color, mmx_pp);

                    /* Multiply the right source by the gradient color */
                    right_pix = _mm_mullo_pi16(right_pix, _mm_srli_pi16(mmx_color, 8));

                    /* Advance the gradient */
                    mmx_color = _mm_add_pi16(mmx_color, mmx_pp);

                    /* Recombine the pixels */
                    __m64 result_pix = _mm_packs_pu16(
                        _mm_srli_pi16(left_pix, 8),
                        _mm_srli_pi16(right_pix, 8));

                    *(__m64 *)pix = result_pix;

                    src_pix += 8;
                    pix += 8;
                    x -= 2;
                }

                /* Extract the accumulated gradient value for the
                   potential odd remaining pixel */
                short *s_color = (short *)&mmx_color;
                raccum = s_color[2];
                gaccum = s_color[1];
                baccum = s_color[0];
            }
#endif

            /* The equivalent slow loop for odd pixels or CPUs without MMX */
            while (x > 0) {
#if SDL_BYTEORDER == SDL_LIL_ENDIAN
                pix[3] = src_pix[3];
                pix[2] = (src_pix[2] * raccum) >> 16;
                pix[1] = (src_pix[1] * gaccum) >> 16;
                pix[0] = (src_pix[0] * baccum) >> 16;
#else
                pix[0] = src_pix[0];
                pix[1] = (src_pix[1] * raccum) >> 16;
                pix[2] = (src_pix[2] * gaccum) >> 16;
                pix[3] = (src_pix[3] * baccum) >> 16;
#endif
                raccum += rpp;
                gaccum += gpp;
                baccum += bpp;

                src_pix += 4;
                pix += 4;
                x--;
            }
        } else {
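As a hedged aside (not from the Brogue source; the helper name is mine): the long-long punning above packs four 16-bit words in the order alpha, red, green, blue from high to low, which could equivalently be written with _mm_set_pi16, whose arguments run from the highest word to the lowest.

#include <mmintrin.h>

/* Sketch only: build the packed gradient colour used by the MMX loop above
   with _mm_set_pi16 instead of punning a long long.  Word order is
   alpha | red | green | blue, matching ll_color. */
static __m64 pack_gradient(int raccum, int gaccum, int baccum)
{
    return _mm_set_pi16((short)0xFFFF, (short)raccum,
                        (short)gaccum, (short)baccum);
}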
void lines_scale2(const unsigned char *src, unsigned y,
                  unsigned char *dst1, unsigned char *dst2, unsigned nPix)
{
    const unsigned char *u = src + ((y-1) & 7)*sc2lines_width,
                        *m = src + ((y+0) & 7)*sc2lines_width,
                        *l = src + ((y+1) & 7)*sc2lines_width;

    for (unsigned i = 0; i < nPix; i += 4) {
        if (*(unsigned*)(u+i) ^ *(unsigned*)(l+i)) {
            __m64 mm = *(__m64*)(m+i-2);
            __m64 uu = *(__m64*)(u+i-2);
            __m64 ll = *(__m64*)(l+i-2);
            __m64 md = _mm_slli_si64(mm, 8);
            __m64 mf = _mm_srli_si64(mm, 8);
            __m64 maskall = _mm_or_si64(_mm_cmpeq_pi8(md, mf), _mm_cmpeq_pi8(uu, ll));
            __m64 e0, e1, v1, v2;

            e0 = _mm_cmpeq_pi8(md, uu);
            e0 = _mm_andnot_si64(maskall, e0);
            e0 = _mm_srli_si64(e0, 16);
            e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());
            e1 = _mm_cmpeq_pi8(mf, uu);
            e1 = _mm_andnot_si64(maskall, e1);
            e1 = _mm_srli_si64(e1, 16);
            e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);
            e0 = _mm_or_si64(e0, e1);

            v1 = _m_from_int(*(unsigned*)(m+i));
            v2 = _m_from_int(*(unsigned*)(u+i));
            v1 = _mm_unpacklo_pi8(v1, v1);
            v2 = _mm_unpacklo_pi8(v2, v2);
            *(__m64*)(dst1 + 2*i) = _mm_or_si64(_mm_and_si64(e0, v2), _mm_andnot_si64(e0, v1));

            e0 = _mm_cmpeq_pi8(md, ll);
            e0 = _mm_andnot_si64(maskall, e0);
            e0 = _mm_srli_si64(e0, 16);
            e0 = _mm_unpacklo_pi8(e0, _mm_setzero_si64());
            e1 = _mm_cmpeq_pi8(mf, ll);
            e1 = _mm_andnot_si64(maskall, e1);
            e1 = _mm_srli_si64(e1, 16);
            e1 = _mm_unpacklo_pi8(_mm_setzero_si64(), e1);
            e0 = _mm_or_si64(e0, e1);

            v1 = _m_from_int(*(unsigned*)(m+i));
            v2 = _m_from_int(*(unsigned*)(l+i));
            v1 = _mm_unpacklo_pi8(v1, v1);
            v2 = _mm_unpacklo_pi8(v2, v2);
            *(__m64*)(dst2 + 2*i) = _mm_or_si64(_mm_and_si64(e0, v2), _mm_andnot_si64(e0, v1));
        } else {
            __m64 v1 = _m_from_int(*(unsigned*)(m+i));
            v1 = _mm_unpacklo_pi8(v1, v1);
            *(__m64*)(dst1 + 2*i) = v1;
            *(__m64*)(dst2 + 2*i) = v1;
        }
    }
}
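For readability, a scalar restatement of the per-pixel Scale2x rule that the MMX code above vectorizes (the function and its names are illustrative, not from the original source): each input pixel becomes a 2x2 block, and a corner takes the vertical neighbour only when it matches the corresponding horizontal neighbour while the two horizontal and the two vertical neighbours differ from each other.

/* Scalar sketch of the Scale2x decision applied per pixel: c is the centre,
   u/d/l/r its up/down/left/right neighbours; out[0..1] is the top output
   pair, out[2..3] the bottom pair. */
static void scale2x_pixel(unsigned char c,
                          unsigned char u, unsigned char d,
                          unsigned char l, unsigned char r,
                          unsigned char out[4])
{
    if (u != d && l != r) {
        out[0] = (l == u) ? u : c;   /* top-left     */
        out[1] = (r == u) ? u : c;   /* top-right    */
        out[2] = (l == d) ? d : c;   /* bottom-left  */
        out[3] = (r == d) ? d : c;   /* bottom-right */
    } else {
        out[0] = out[1] = out[2] = out[3] = c;
    }
}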