Пример #1
0
/*
 * Emits the SYSTEM_CALL assembly label followed by the system-call entry
 * sequence: push a 32-bit zero, call store_reg(), svc(), restore_reg(),
 * release the pushed word again, then call iret().
 *
 * NOTE(review): store_reg, svc, restore_reg and iret are defined outside this
 * block. Judging by their names they presumably save the registers, run the
 * requested service, restore the registers and return from the interrupt --
 * confirm against their definitions.
 * NOTE(review): the label is placed inside the function body, i.e. after any
 * prologue the compiler generates for syscall_gate(). Code that jumps to
 * SYSTEM_CALL directly presumably relies on that prologue not touching the
 * stack -- verify the generated assembly and the compiler flags.
 */
void syscall_gate() {
    /* Global assembly label; presumably the address installed as the
     * system-call interrupt handler -- TODO confirm where it is referenced. */
    __asm__("SYSTEM_CALL:");
        /* Push a 4-byte zero. Presumably a placeholder slot (e.g. a dummy
         * error code) so the stack frame matches what the helpers expect --
         * TODO confirm. */
        __asm__("pushl $0");
        store_reg();
        svc();
        restore_reg();
        /* Add 4 to %esp: drops the 4-byte word pushed above. */
        __asm__("add $4, %esp");
        iret();
}
Пример #2
0
// Transposes an image made of 4-byte pixels: the 32-bit element found at
// column x, row y of the source is written to column y, row x of the
// destination.
//
// rowsize, src_pitch and dst_pitch are byte counts (the scalar path divides
// each of them by 4 to get 32-bit element counts); height is a row count.
//
// Most of the image is processed in tiles of 4 rows x 32 bytes (8 pixels)
// using the load_reg / store_reg / unpack* helpers. Whatever the tiles do not
// cover is copied one pixel at a time afterwards.
//
// NOTE(review): load_reg and store_reg are defined elsewhere; they presumably
// perform 128-bit loads/stores. Any alignment requirement they impose on
// srcp, dstp or the pitches should be confirmed at their definition.
static void __stdcall
transpose_bgra2(const uint8_t* srcp, uint8_t* dstp, const int rowsize,
                const int height, const int src_pitch, const int dst_pitch) noexcept
{
    // Extent of the image that complete tiles can cover.
    const int tiled_rows = height - height % 4;
    const int tiled_bytes = rowsize - rowsize % 32;

    const uint8_t* band = srcp;  // first row of the current 4-row band
    for (int row = 0; row < tiled_rows; row += 4) {
        const uint8_t* row_a = band;
        const uint8_t* row_b = band + src_pitch;
        const uint8_t* row_c = band + 2 * src_pitch;
        const uint8_t* row_d = band + 3 * src_pitch;

        // Source rows row..row+3 become destination columns row..row+3.
        uint8_t* out = dstp + 4 * row;

        for (int col = 0; col < tiled_bytes; col += 32) {
            // Eight pixels from each of the four rows, as two halves per row.
            __m128i a_lo = load_reg(row_a + col);
            __m128i a_hi = load_reg(row_a + col + 16);
            __m128i b_lo = load_reg(row_b + col);
            __m128i b_hi = load_reg(row_b + col + 16);
            __m128i c_lo = load_reg(row_c + col);
            __m128i c_hi = load_reg(row_c + col + 16);
            __m128i d_lo = load_reg(row_d + col);
            __m128i d_hi = load_reg(row_d + col + 16);

            // Interleave 32-bit pixels of rows a/b and of rows c/d.
            __m128i ab_01 = unpacklo32(a_lo, b_lo);
            __m128i cd_01 = unpacklo32(c_lo, d_lo);
            __m128i ab_23 = unpackhi32(a_lo, b_lo);
            __m128i cd_23 = unpackhi32(c_lo, d_lo);
            __m128i ab_45 = unpacklo32(a_hi, b_hi);
            __m128i cd_45 = unpacklo32(c_hi, d_hi);
            __m128i ab_67 = unpackhi32(a_hi, b_hi);
            __m128i cd_67 = unpackhi32(c_hi, d_hi);

            // Combine the 64-bit halves: one vector per source column, each
            // holding that column's pixel from rows a, b, c, d. Every vector
            // goes to its own destination row, so `out` walks down one
            // destination row per store (eight rows per tile in total).
            __m128i px0 = unpacklo64(ab_01, cd_01);
            store_reg(out, px0);
            out += dst_pitch;

            __m128i px1 = unpackhi64(ab_01, cd_01);
            store_reg(out, px1);
            out += dst_pitch;

            __m128i px2 = unpacklo64(ab_23, cd_23);
            store_reg(out, px2);
            out += dst_pitch;

            __m128i px3 = unpackhi64(ab_23, cd_23);
            store_reg(out, px3);
            out += dst_pitch;

            __m128i px4 = unpacklo64(ab_45, cd_45);
            store_reg(out, px4);
            out += dst_pitch;

            __m128i px5 = unpackhi64(ab_45, cd_45);
            store_reg(out, px5);
            out += dst_pitch;

            __m128i px6 = unpacklo64(ab_67, cd_67);
            store_reg(out, px6);
            out += dst_pitch;

            __m128i px7 = unpackhi64(ab_67, cd_67);
            store_reg(out, px7);
            out += dst_pitch;
        }

        band += 4 * src_pitch;
    }

    // Nothing left over: the tiles covered the whole image.
    if (tiled_rows == height && tiled_bytes == rowsize) {
        return;
    }

    // Scalar clean-up works on whole 32-bit pixels.
    const int width = rowsize / 4;
    const uint32_t* src_px = reinterpret_cast<const uint32_t*>(srcp);
    uint32_t* dst_px = reinterpret_cast<uint32_t*>(dstp);
    const int src_stride = src_pitch / 4;
    const int dst_stride = dst_pitch / 4;

    // Rows below the last complete 4-row band, across the full width.
    for (int row = tiled_rows; row < height; ++row) {
        const uint32_t* in = src_px + row * src_stride;
        uint32_t* column = dst_px + row;
        for (int col = 0; col < width; ++col) {
            column[col * dst_stride] = in[col];
        }
    }

    // Columns to the right of the last complete tile, for every row. This
    // revisits the bottom-right corner already written by the loop above.
    if (tiled_bytes < rowsize) {
        const int first_col = tiled_bytes / 4;
        for (int row = 0; row < height; ++row) {
            const uint32_t* in = src_px + row * src_stride;
            uint32_t* column = dst_px + row;
            for (int col = first_col; col < width; ++col) {
                column[col * dst_stride] = in[col];
            }
        }
    }
}
Пример #3
0
// Transposes an image made of 4-byte pixels: the 32-bit element found at
// column x, row y of the source is written to column y, row x of the
// destination.
//
// rowsize, src_pitch and dst_pitch are byte counts (the scalar path divides
// each of them by 4 to get 32-bit element counts); height is a row count.
//
// Most of the image is processed in tiles of 4 rows x 16 bytes (4 pixels)
// using the load_reg / store_reg / unpack* helpers. Whatever the tiles do not
// cover is copied one pixel at a time afterwards.
//
// NOTE(review): load_reg and store_reg are defined elsewhere; they presumably
// perform 128-bit loads/stores. Any alignment requirement they impose on
// srcp, dstp or the pitches should be confirmed at their definition.
static void __stdcall
transpose_bgra(const uint8_t* srcp, uint8_t* dstp, const int rowsize,
               const int height, const int src_pitch, const int dst_pitch) noexcept
{
    // Extent of the image that complete 4x4-pixel tiles can cover.
    const int tiled_rows = height - height % 4;
    const int tiled_bytes = rowsize - rowsize % 16;

    const uint8_t* band = srcp;  // first row of the current 4-row band
    for (int row = 0; row < tiled_rows; row += 4) {
        const uint8_t* row_a = band;
        const uint8_t* row_b = band + src_pitch;
        const uint8_t* row_c = band + 2 * src_pitch;
        const uint8_t* row_d = band + 3 * src_pitch;

        // Source rows row..row+3 become destination columns row..row+3.
        uint8_t* out = dstp + 4 * row;

        for (int col = 0; col < tiled_bytes; col += 16) {
            // Four pixels from each of the four rows.
            __m128i va = load_reg(row_a + col);
            __m128i vb = load_reg(row_b + col);
            __m128i vc = load_reg(row_c + col);
            __m128i vd = load_reg(row_d + col);

            // Interleave 32-bit pixels of rows a/b and of rows c/d.
            __m128i ab_lo = unpacklo32(va, vb);
            __m128i ab_hi = unpackhi32(va, vb);
            __m128i cd_lo = unpacklo32(vc, vd);
            __m128i cd_hi = unpackhi32(vc, vd);

            // Combine the 64-bit halves: one vector per source column, each
            // holding that column's pixel from rows a, b, c, d.
            __m128i column0 = unpacklo64(ab_lo, cd_lo);
            __m128i column1 = unpackhi64(ab_lo, cd_lo);
            __m128i column2 = unpacklo64(ab_hi, cd_hi);
            __m128i column3 = unpackhi64(ab_hi, cd_hi);

            // Each source column lands in its own destination row, so `out`
            // walks down one destination row per store (four per tile).
            store_reg(out, column0);
            out += dst_pitch;
            store_reg(out, column1);
            out += dst_pitch;
            store_reg(out, column2);
            out += dst_pitch;
            store_reg(out, column3);
            out += dst_pitch;
        }

        band += 4 * src_pitch;
    }

    // Nothing left over: the tiles covered the whole image.
    if (tiled_rows == height && tiled_bytes == rowsize) {
        return;
    }

    // Scalar clean-up works on whole 32-bit pixels.
    const int width = rowsize / 4;
    const uint32_t* src_px = reinterpret_cast<const uint32_t*>(srcp);
    uint32_t* dst_px = reinterpret_cast<uint32_t*>(dstp);
    const int src_stride = src_pitch / 4;
    const int dst_stride = dst_pitch / 4;

    // Rows below the last complete 4-row band, across the full width.
    for (int row = tiled_rows; row < height; ++row) {
        const uint32_t* in = src_px + row * src_stride;
        uint32_t* column = dst_px + row;
        for (int col = 0; col < width; ++col) {
            column[col * dst_stride] = in[col];
        }
    }

    // Columns to the right of the last complete tile, for every row. This
    // revisits the bottom-right corner already written by the loop above.
    if (tiled_bytes < rowsize) {
        const int first_col = tiled_bytes / 4;
        for (int row = 0; row < height; ++row) {
            const uint32_t* in = src_px + row * src_stride;
            uint32_t* column = dst_px + row;
            for (int col = first_col; col < width; ++col) {
                column[col * dst_stride] = in[col];
            }
        }
    }
}