예제 #1
0
// Halve a 32-bit-per-pixel image in both dimensions using NEON
// (about twice as fast as generic).
//
//   width, height  source dimensions in pixels
//   source         source pixels, read as rows of width*4 bytes
//   target         destination; output pixels are written back to back
//                  with no per-row padding
//
// Every output pixel is derived from a 2x2 source block: the two scan
// lines are combined first with a truncating halving add, then the even
// and odd pixels of the combined line are combined the same way.
//
// The vector loop consumes 8 source pixels (32 bytes per scan line) per
// step.  Widths below 8, or not a multiple of 8, are delegated to
// MipMap_32_generic: with such a width the last step of a row would read
// and write past the row end and leave the row pointers out of step for
// every following row.
void MipMap_32_neon( int width, int height, unsigned char *source, unsigned char *target )
{
    if(width < 8 || (width % 8) != 0) {
        MipMap_32_generic(width, height, source, target);
        return;
    }

    int newwidth = width / 2;
    int newheight = height / 2;
    int stride = width * 4;

    unsigned char *s = target;      // write cursor in the destination
    unsigned char *t = source;      // upper scan line of the current pair
    unsigned char *u = t+stride;    // lower scan line of the current pair

    int y, x;
    for( y = 0; y < newheight; y++ ) {
        for( x = 0; x < newwidth; x+=4 ) {
            uint8x16_t a0, a1, a2, a3;

            // memcpy keeps the loads free of alignment requirements
            memcpy(&a0, t,    16);
            memcpy(&a1, t+16, 16);
            memcpy(&a2, u,    16);
            memcpy(&a3, u+16, 16);

            // average first and second scan lines
            a0 = vhaddq_u8(a0, a2);
            a1 = vhaddq_u8(a1, a3);

            // repack: val[0] receives the even pixels, val[1] the odd ones
            uint32x4x2_t z = vuzpq_u32(vreinterpretq_u32_u8(a0), vreinterpretq_u32_u8(a1));

            // average even and odd x pixels
            a0 = vhaddq_u8(vreinterpretq_u8_u32(z.val[0]), vreinterpretq_u8_u32(z.val[1]));

            memcpy(s, &a0, 16);

            s+=16;
            t+=32;
            u+=32;
        }
        // t and u have each walked one full scan line; skip the line the
        // other pointer just covered to reach the next pair
        t += stride;
        u += stride;
    }
}
예제 #2
0
파일: vhaddQu8.c 프로젝트: AlexMioMio/gcc
/* Calls vhaddq_u8 once on two local 128-bit unsigned-byte vectors and
   stores the result in a third local.  Neither operand is initialised and
   the result is never read.
   NOTE(review): presumably deliberate so the call cannot be folded away
   at compile time -- confirm against the test's directives.  */
void test_vhaddQu8 (void)
{
  uint8x16_t lhs, rhs;
  uint8x16_t result;

  result = vhaddq_u8 (lhs, rhs);
}
예제 #3
0
// 16x16 predictor built from the row above the block.
//
//   dst, stride  output block and the distance in bytes between its rows
//   above        16 bytes are read as the reference row; above[15] is also
//                replicated and used as the fill value
//   left         unused
//
// Row 0 is vrhadd(vhadd(above[i], above[i+2]), above[i+1]); every further
// row is the previous one shifted left by one byte with the fill value
// shifted in on the right.
void vpx_d45_predictor_16x16_neon(uint8_t *dst, ptrdiff_t stride,
                                  const uint8_t *above, const uint8_t *left) {
  const uint8x16_t top = vld1q_u8(above);
  const uint8x16_t fill = vld1q_dup_u8(above + 15);
  const uint8x16_t top_plus1 = vextq_u8(top, fill, 1);
  const uint8x16_t top_plus2 = vextq_u8(top, fill, 2);
  const uint8x16_t outer_avg = vhaddq_u8(top, top_plus2);
  uint8x16_t row = vrhaddq_u8(outer_avg, top_plus1);
  uint8_t *out = dst;
  int remaining = 15;
  (void)left;
  // First 15 rows: store, then slide the row one byte to the left.
  while (remaining-- > 0) {
    vst1q_u8(out, row);
    out += stride;
    row = vextq_u8(row, fill, 1);
  }
  // Last row needs no further shift.
  vst1q_u8(out, row);
}
예제 #4
0
// Halve a 24-bit-per-pixel image in both dimensions using NEON
// (about twice as fast as generic).
//
//   width, height  source dimensions in pixels
//   source         source pixels, read as rows of width*3 bytes
//   target         destination; output pixels are written back to back
//                  with no per-row padding
//
// The two scan lines of each pair are combined with a truncating halving
// add; the horizontally adjacent 3-byte pixels are then gathered with
// table lookups and combined the same way.
//
// The vector loop consumes 32 source pixels (96 bytes per scan line) per
// step.  Widths below 32, or not a multiple of 32, are delegated to
// MipMap_24_generic: with such a width the last step of a row would read
// and write past the row end and leave the row pointers out of step for
// every following row.
void MipMap_24_neon( int width, int height, unsigned char *source, unsigned char *target )
{
    if(width < 32 || (width % 32) != 0) {
        MipMap_24_generic(width, height, source, target);
        return;
    }

    int newwidth = width / 2;
    int newheight = height / 2;
    int stride = width * 3;
    
    unsigned char *s = target;      // write cursor in the destination
    unsigned char *t = source;      // upper scan line of the current pair
    unsigned char *u = t+stride;    // lower scan line of the current pair

    int y, x;
    for( y = 0; y < newheight; y++ ) {
        for( x = 0; x < newwidth; x+=16 ) {
            uint8x16_t a0, a1, a2, a3, a4, a5, a6, a7, a8, a9, aa, ab;

            memcpy(&a0, t,    16);
            memcpy(&a1, t+16, 16);
            memcpy(&a2, t+32, 16);
            memcpy(&a3, t+48, 16);
            memcpy(&a4, t+64, 16);
            memcpy(&a5, t+80, 16);
            memcpy(&a6, u,    16);
            memcpy(&a7, u+16, 16);
            memcpy(&a8, u+32, 16);
            memcpy(&a9, u+48, 16);
            memcpy(&aa, u+64, 16);
            memcpy(&ab, u+80, 16);

            // average first and second scan lines
            a0 = vhaddq_u8(a0, a6);
            a1 = vhaddq_u8(a1, a7);
            a2 = vhaddq_u8(a2, a8);
            a3 = vhaddq_u8(a3, a9);
            a4 = vhaddq_u8(a4, aa);
            a5 = vhaddq_u8(a5, ab);

            // at this point, we have averaged the two scanlines,
            // in 24bit it's a bit of a pain to average the pixels
            // because they are aligned to 3 bytes
            
#if 0  // simple readable version finishing without neon (slower)
            uint8_t b[96], c[96];
            memcpy(b,    &a0, 16);
            memcpy(b+16, &a1, 16);
            memcpy(b+32, &a2, 16);
            memcpy(b+48, &a3, 16);
            memcpy(b+64, &a4, 16);
            memcpy(b+80, &a5, 16);

            int i,j;
            for( i=0; i<16; i++)
                for( j=0; j<3; j++)
                    s[3*i+j] = (b[3*2*i+j] + b[3*(2*i+1)+j]) / 2;
#else
            // full neon version with swizzle (ugly but fast)
            //
            // z is a 32-byte lookup table whose halves are refilled from
            // a0..a5 as the gather proceeds; z2 views its first 16 bytes
            // and z3 its last 16 bytes.  Table indices below 16 select
            // from the first half and 16+n from the second half.  The
            // layout comments list, for each output vector pair, which
            // averaged bytes (hex offsets into a0..a5) feed the left and
            // right pixel operands.
            uint8x8_t r0, r1;
            
            uint8x8x4_t z;
            uint8x8x2_t *z2 = (uint8x8x2_t*)&z, *z3 = (uint8x8x2_t*)&z+1;

//          a00 a01 a02 a06 a07 a08 a0c a0d a0e a12 a13 a14 a18 a19 a1a a1e
//          a03 a04 a05 a09 a0a a0b a0f a10 a11 a15 a16 a17 a1b a1c a1d a21
#define int8x16_to_8x8x2(v) ((int8x8x2_t) { vget_low_s8(v), vget_high_s8(v) })
            
            uint8x8_t s0l = {0, 1, 2, 6, 7, 8, 12, 13};
            memcpy(&z, &a0, sizeof a0);
            r0 = vtbl2_u8(*z2, s0l);

            memcpy(z3, &a1, sizeof a1);
            uint8x8_t s0h = {14, 16+2, 16+3, 16+4, 16+8, 16+9, 16+10, 16+14};
            r1 = vtbl4_u8(z, s0h);
            a6 = vcombine_u8 (r0, r1);

            uint8x8_t s1l = {3, 4, 5, 9, 10, 11, 15, 16+0};
            r0 = vtbl4_u8(z, s1l);
            memcpy(&z, &a2, sizeof a2);
            uint8x8_t s1h = {16+1, 16+5, 16+6, 16+7, 16+11, 16+12, 16+13, 1};
            r1 = vtbl4_u8(z, s1h);
            a7 = vcombine_u8 (r0, r1);

            a0 = vhaddq_u8(a6, a7);

//          a1f a20 a24 a25 a26 a2a a2b a2c a30 a31 a32 a36 a37 a38 a3c a3d
//          a22 a23 a27 a28 a29 a2d a2e a2f a33 a34 a35 a39 a3a a3b a3f a40

            uint8x8_t s2l = {16+15, 0, 4, 5, 6, 10, 11, 12};
            r0 = vtbl4_u8(z, s2l);
            uint8x8_t s2h = {0, 1, 2, 6, 7, 8, 12, 13};
            memcpy(z3, &a3, sizeof a3);
            r1 = vtbl2_u8(*z3, s2h);
            a6 = vcombine_u8 (r0, r1);

            uint8x8_t s3l = {2, 3, 7, 8, 9, 13, 14, 15};
            r0 = vtbl2_u8(*z2, s3l);
            memcpy(&z, &a4, sizeof a4);
            uint8x8_t s3h = {16+3, 16+4, 16+5, 16+9, 16+10, 16+11, 16+15, 0};
            r1 = vtbl4_u8(z, s3h);
            a7 = vcombine_u8 (r0, r1);

            a1 = vhaddq_u8(a6, a7);

//          a3e a42 a43 a44 a48 a49 a4a a4e a4f a50 a54 a55 a56 a5a a5b a5c
//          a41 a45 a46 a47 a4b a4c a4d a51 a52 a53 a57 a58 a59 a5d a5e a5f

            uint8x8_t s4l = {16+14, 2, 3, 4, 8, 9, 10, 14};
            r0 = vtbl4_u8(z, s4l);
            memcpy(z3, &a5, sizeof a5);
            uint8x8_t s4h = {15, 16+0, 16+4, 16+5, 16+6, 16+10, 16+11, 16+12};
            r1 = vtbl4_u8(z, s4h);
            a6 = vcombine_u8 (r0, r1);

            uint8x8_t s5l = {1, 5, 6, 7, 11, 12, 13, 16+1};
            r0 = vtbl4_u8(z, s5l);
            uint8x8_t s5h = {2, 3, 7, 8, 9, 13, 14, 15};
            r1 = vtbl2_u8(*z3, s5h);
            a7 = vcombine_u8 (r0, r1);

            a2 = vhaddq_u8(a6, a7);

            memcpy(s,    &a0, 16);
            memcpy(s+16, &a1, 16);
            memcpy(s+32, &a2, 16);
#endif
            s+=48;
            t+=96;
            u+=96;
        }
        // t and u have each walked one full scan line; skip the line the
        // other pointer just covered to reach the next pair
        t += stride;
        u += stride;
    }
}
예제 #5
0
    }

    return out;
}

#ifdef __ARM_NEON__
#include <arm_neon.h>

// Decimate an 8-bit image by two in each direction.
//
//   dest, deststride  output image and the distance in bytes between rows
//   destwidth         output pixels per row
//   destheight        number of output rows
//   src, srcstride    input image and the distance in bytes between rows
//   srcwidth,
//   srcheight         not used by this function
//
// Each output byte is built from a 2x2 source block with three truncating
// halving adds: the two diagonals are averaged first, then the two
// diagonal averages are averaged.
//
// One step loads 32 source bytes from each of two rows and stores 16
// output bytes, so the column index advances by 16.  When destwidth is
// not a multiple of 16 the last step of a row still stores a full 16
// bytes (and loads a full 32), so both buffers must be readable/writable
// up to the next multiple of 16 output pixels.
void neon_decimate2(uint8_t * __restrict dest, int destwidth, int destheight, int deststride,
               uint8_t * __restrict src, int srcwidth, int srcheight, int srcstride)
{
    (void)srcwidth;
    (void)srcheight;

    for (int y = 0; y < destheight; y++) {
        for (int x = 0; x < destwidth; x += 16) {
            // De-interleaving loads: val[0] holds the even columns and
            // val[1] the odd columns of each source row.
            uint8x16x2_t row0 = vld2q_u8(src + 2*x);
            uint8x16x2_t row1 = vld2q_u8(src + 2*x + srcstride);
            uint8x16_t sum0 = vhaddq_u8(row0.val[0], row1.val[1]);
            uint8x16_t sum1 = vhaddq_u8(row1.val[0], row0.val[1]);
            uint8x16_t sum = vhaddq_u8(sum0, sum1);
            vst1q_u8(dest + x, sum);
        }
        src += 2*srcstride;
        dest += deststride;
    }
}

void neon_decimate3(uint8_t * __restrict dest, int destwidth, int destheight, int deststride,
                    uint8_t * __restrict src, int srcwidth, int srcheight, int srcstride)
{
    for (int y = 0; y < destheight; y++) {
        for (int x = 0; x < destwidth; x+=8) {
            uint8x16x3_t row0 = vld3q_u8(src + 3*x);