/*
 * syscall_gate - stub that contains the SYSTEM_CALL assembly label.
 *
 * Sequence, exactly as written in the body:
 *   1. Emits the assembler label `SYSTEM_CALL:` inside this function.
 *   2. Pushes a 32-bit zero onto the stack (`pushl $0`).
 *   3. Calls store_reg(), then svc(), then restore_reg().
 *   4. Adds 4 to %esp, which discards the zero pushed in step 2.
 *   5. Calls iret().
 *
 * store_reg(), svc(), restore_reg() and iret() are defined elsewhere; their
 * behaviour is not visible from this function.
 *
 * NOTE(review): presumably the interrupt/trap vector points at the
 * SYSTEM_CALL label rather than at syscall_gate itself, so any
 * compiler-generated prologue for this function is skipped - confirm
 * against the code that installs the handler.
 * NOTE(review): the pushed zero looks like a placeholder slot (e.g. a dummy
 * error code) so the stack layout matches what store_reg()/restore_reg()
 * expect - TODO confirm.
 * NOTE(review): the statements rely on being emitted in this exact order
 * with no stack adjustment in between; do not reorder or add locals
 * without checking the generated assembly.
 */
void syscall_gate() { __asm__("SYSTEM_CALL:"); __asm__("pushl $0"); store_reg(); svc(); restore_reg(); __asm__("add $4, %esp"); iret(); }
// Transposes an image made of 32-bit pixels: the pixel at (column x, row y)
// of the source ends up at (column y, row x) of the destination.
//
//   srcp      - first byte of the source image
//   dstp      - first byte of the destination image
//   rowsize   - length of one source row in bytes (4 bytes per pixel)
//   height    - number of source rows
//   src_pitch - byte distance between consecutive source rows
//   dst_pitch - byte distance between consecutive destination rows
//
// The image is first walked in tiles of 8 pixels x 4 source rows; each tile
// is read with eight 16-byte loads and written as eight 16-byte stores, one
// per destination row. Whatever the tiled pass cannot cover (height not a
// multiple of 4, rowsize not a multiple of 32) is copied pixel by pixel.
//
// NOTE(review): load_reg/store_reg/unpacklo32/unpackhi32/unpacklo64/
// unpackhi64 are defined elsewhere; presumably thin wrappers around the
// corresponding SSE2 load/store/unpack intrinsics - confirm, including any
// alignment requirement on the pointers and pitches.
// NOTE(review): the scalar path indexes in 32-bit units via src_pitch / 4
// and dst_pitch / 4, so it assumes both pitches are multiples of 4 - confirm.
static void __stdcall transpose_bgra2(const uint8_t* srcp, uint8_t* dstp, const int rowsize, const int height, const int src_pitch, const int dst_pitch) noexcept
{
    const int tiled_rows = height / 4 * 4;     // rows covered by full 4-row bands
    const int tiled_bytes = rowsize / 32 * 32; // row bytes covered by full 8-pixel tiles

    const uint8_t* band_src = srcp;
    for (int y = 0; y < tiled_rows; y += 4, band_src += 4 * src_pitch) {
        // Source row y becomes destination column y (4 bytes per pixel).
        uint8_t* tile_dst = dstp + 4 * y;

        for (int x = 0; x < tiled_bytes; x += 32, tile_dst += 8 * dst_pitch) {
            const uint8_t* row0 = band_src + x;
            const uint8_t* row1 = row0 + 1 * src_pitch;
            const uint8_t* row2 = row0 + 2 * src_pitch;
            const uint8_t* row3 = row0 + 3 * src_pitch;

            // Left half = pixels 0..3 of the tile, right half = pixels 4..7.
            __m128i row0_left = load_reg(row0);
            __m128i row0_right = load_reg(row0 + 16);
            __m128i row1_left = load_reg(row1);
            __m128i row1_right = load_reg(row1 + 16);
            __m128i row2_left = load_reg(row2);
            __m128i row2_right = load_reg(row2 + 16);
            __m128i row3_left = load_reg(row3);
            __m128i row3_right = load_reg(row3 + 16);

            // Stage 1: interleave 32-bit lanes of row pairs (0,1) and (2,3).
            __m128i r01_px01 = unpacklo32(row0_left, row1_left);
            __m128i r23_px01 = unpacklo32(row2_left, row3_left);
            __m128i r01_px23 = unpackhi32(row0_left, row1_left);
            __m128i r23_px23 = unpackhi32(row2_left, row3_left);
            __m128i r01_px45 = unpacklo32(row0_right, row1_right);
            __m128i r23_px45 = unpacklo32(row2_right, row3_right);
            __m128i r01_px67 = unpackhi32(row0_right, row1_right);
            __m128i r23_px67 = unpackhi32(row2_right, row3_right);

            // Stage 2: combine 64-bit halves into one register per source column.
            __m128i col0 = unpacklo64(r01_px01, r23_px01);
            __m128i col1 = unpackhi64(r01_px01, r23_px01);
            __m128i col2 = unpacklo64(r01_px23, r23_px23);
            __m128i col3 = unpackhi64(r01_px23, r23_px23);
            __m128i col4 = unpacklo64(r01_px45, r23_px45);
            __m128i col5 = unpackhi64(r01_px45, r23_px45);
            __m128i col6 = unpacklo64(r01_px67, r23_px67);
            __m128i col7 = unpackhi64(r01_px67, r23_px67);

            store_reg(tile_dst + 0 * dst_pitch, col0);
            store_reg(tile_dst + 1 * dst_pitch, col1);
            store_reg(tile_dst + 2 * dst_pitch, col2);
            store_reg(tile_dst + 3 * dst_pitch, col3);
            store_reg(tile_dst + 4 * dst_pitch, col4);
            store_reg(tile_dst + 5 * dst_pitch, col5);
            store_reg(tile_dst + 6 * dst_pitch, col6);
            store_reg(tile_dst + 7 * dst_pitch, col7);
        }
    }

    if (tiled_bytes != rowsize || tiled_rows != height) {
        // Scalar clean-up, working in whole 32-bit pixels.
        const int width = rowsize / 4;
        const uint32_t* src_px = (const uint32_t*)srcp;
        uint32_t* dst_px = (uint32_t*)dstp;
        const int src_stride = src_pitch / 4;
        const int dst_stride = dst_pitch / 4;

        // Rows below the last full 4-row band: every pixel of those rows.
        for (int y = tiled_rows; y < height; ++y) {
            for (int x = 0; x < width; ++x) {
                dst_px[y + x * dst_stride] = src_px[x + y * src_stride];
            }
        }

        // Pixels to the right of the last full tile, for every row. Rows
        // already handled by the loop above are simply written again.
        if (tiled_bytes < rowsize) {
            const int first_px = tiled_bytes / 4;
            for (int y = 0; y < height; ++y) {
                for (int x = first_px; x < width; ++x) {
                    dst_px[y + x * dst_stride] = src_px[x + y * src_stride];
                }
            }
        }
    }
}
// Transposes an image made of 32-bit pixels: the pixel at (column x, row y)
// of the source ends up at (column y, row x) of the destination.
//
//   srcp      - first byte of the source image
//   dstp      - first byte of the destination image
//   rowsize   - length of one source row in bytes (4 bytes per pixel)
//   height    - number of source rows
//   src_pitch - byte distance between consecutive source rows
//   dst_pitch - byte distance between consecutive destination rows
//
// The image is first walked in tiles of 4 pixels x 4 source rows; each tile
// is read with four 16-byte loads and written as four 16-byte stores, one
// per destination row. Whatever the tiled pass cannot cover (height not a
// multiple of 4, rowsize not a multiple of 16) is copied pixel by pixel.
//
// NOTE(review): load_reg/store_reg/unpacklo32/unpackhi32/unpacklo64/
// unpackhi64 are defined elsewhere; presumably thin wrappers around the
// corresponding SSE2 load/store/unpack intrinsics - confirm, including any
// alignment requirement on the pointers and pitches.
// NOTE(review): the scalar path indexes in 32-bit units via src_pitch / 4
// and dst_pitch / 4, so it assumes both pitches are multiples of 4 - confirm.
static void __stdcall transpose_bgra(const uint8_t* srcp, uint8_t* dstp, const int rowsize, const int height, const int src_pitch, const int dst_pitch) noexcept
{
    const int tiled_rows = height / 4 * 4;     // rows covered by full 4-row bands
    const int tiled_bytes = rowsize / 16 * 16; // row bytes covered by full 4-pixel tiles

    const uint8_t* band_src = srcp;
    for (int y = 0; y < tiled_rows; y += 4, band_src += 4 * src_pitch) {
        // Source row y becomes destination column y (4 bytes per pixel).
        uint8_t* tile_dst = dstp + 4 * y;

        for (int x = 0; x < tiled_bytes; x += 16, tile_dst += 4 * dst_pitch) {
            const uint8_t* tile_src = band_src + x;

            __m128i row0 = load_reg(tile_src + 0 * src_pitch);
            __m128i row1 = load_reg(tile_src + 1 * src_pitch);
            __m128i row2 = load_reg(tile_src + 2 * src_pitch);
            __m128i row3 = load_reg(tile_src + 3 * src_pitch);

            // Stage 1: interleave 32-bit lanes of row pairs (0,1) and (2,3).
            __m128i r01_px01 = unpacklo32(row0, row1);
            __m128i r01_px23 = unpackhi32(row0, row1);
            __m128i r23_px01 = unpacklo32(row2, row3);
            __m128i r23_px23 = unpackhi32(row2, row3);

            // Stage 2: combine 64-bit halves and write one register per
            // destination row.
            store_reg(tile_dst + 0 * dst_pitch, unpacklo64(r01_px01, r23_px01));
            store_reg(tile_dst + 1 * dst_pitch, unpackhi64(r01_px01, r23_px01));
            store_reg(tile_dst + 2 * dst_pitch, unpacklo64(r01_px23, r23_px23));
            store_reg(tile_dst + 3 * dst_pitch, unpackhi64(r01_px23, r23_px23));
        }
    }

    if (tiled_bytes != rowsize || tiled_rows != height) {
        // Scalar clean-up, working in whole 32-bit pixels.
        const int width = rowsize / 4;
        const uint32_t* src_px = (const uint32_t*)srcp;
        uint32_t* dst_px = (uint32_t*)dstp;
        const int src_stride = src_pitch / 4;
        const int dst_stride = dst_pitch / 4;

        // Rows below the last full 4-row band: every pixel of those rows.
        for (int y = tiled_rows; y < height; ++y) {
            for (int x = 0; x < width; ++x) {
                dst_px[y + x * dst_stride] = src_px[x + y * src_stride];
            }
        }

        // Pixels to the right of the last full tile, for every row. Rows
        // already handled by the loop above are simply written again.
        if (tiled_bytes < rowsize) {
            const int first_px = tiled_bytes / 4;
            for (int y = 0; y < height; ++y) {
                for (int x = first_px; x < width; ++x) {
                    dst_px[y + x * dst_stride] = src_px[x + y * src_stride];
                }
            }
        }
    }
}