Esempio n. 1
0
File: ops.c Progetto: petecoup/argon
void ar_stride2_vmaxall_u8_neon(const uint8_t* a,
                                uint32_t n,
                                uint8_t* line_results)
{
   uint8x16x2_t a_loaded;
   uint8x16_t line0_max = vdupq_n_u8(0);
   uint8x16_t line1_max = vdupq_n_u8(0);
   line_results[0] = 0;
   line_results[1] = 0;

   uint8_t line0_array[16];
   uint8_t line1_array[16];

   for (uint32_t i = 0; i < n; i += 32) {
      a_loaded = vld2q_u8(&(a[i]));
      line0_max = vmaxq_u8(a_loaded.val[0], line0_max);
      line1_max = vmaxq_u8(a_loaded.val[1], line1_max);
   }

   vst1q_u8(line0_array, line0_max);
   vst1q_u8(line1_array, line1_max);
   
   for (uint32_t i = 0; i < 16; i++) {
      line_results[0] = ar_max_u8(line_results[0], line0_array[i]);
      line_results[1] = ar_max_u8(line_results[1], line1_array[i]);
   }
}
Esempio n. 2
0
/* Routine optimized for shuffling a buffer for a type size of 2 bytes. */
static void
shuffle2_neon(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements) {
  size_t i, j, k;
  static const size_t bytesoftype = 2;
  uint8x16x2_t r0;

  for (i = 0, k = 0; i < vectorizable_elements * bytesoftype; i += 32, k++) {
    /* Load (and permute) 32 bytes to the structure r0 */
    r0 = vld2q_u8(src + i);
    /* Store the results in the destination vector */
    for (j = 0; j < 2; j++) {
      vst1q_u8(dest + total_elements * j + k * 16, r0.val[j]);
    }
  }
}
Esempio n. 3
0
/* Calls vld2q_u8 once and keeps the result in a local that is never read.
   NOTE(review): the pointer argument is the literal 0, so this looks like a
   compile-only check of the intrinsic rather than code meant to be run --
   confirm before executing it. */
void test_vld2Qu8 (void)
{
  uint8x16x2_t loaded = vld2q_u8 (0);

  (void) loaded;
}
Esempio n. 4
0
                out->buf[oy*out->stride+ox] = pad;
        }
    }

    return out;
}

#ifdef __ARM_NEON__
#include <arm_neon.h>

/**
 * Halve an 8-bit image in both dimensions by averaging 2x2 source blocks.
 *
 * Output pixel (x, y) is built from source pixels (2x, 2y), (2x+1, 2y),
 * (2x, 2y+1) and (2x+1, 2y+1): the two diagonals are each averaged with a
 * truncating halving add, and the two intermediate values are averaged the
 * same way.
 *
 * Exactly `destwidth` bytes are written per output row and exactly
 * `2 * destwidth` bytes are read from each of the two source rows involved,
 * so neither buffer needs padding beyond those extents.
 *
 * @param dest        Output image, `destheight` rows of `destwidth` pixels.
 * @param destwidth   Output width in pixels (any value; no multiple required).
 * @param destheight  Output height in rows.
 * @param deststride  Distance in bytes between consecutive output rows.
 * @param src         Input image; rows 2y and 2y+1 feed output row y.
 * @param srcwidth    Not used by this routine.
 * @param srcheight   Not used by this routine.
 * @param srcstride   Distance in bytes between consecutive input rows.
 */
void neon_decimate2(uint8_t * __restrict dest, int destwidth, int destheight, int deststride,
               uint8_t * __restrict src, int srcwidth, int srcheight, int srcstride)
{
    (void)srcwidth;
    (void)srcheight;

    for (int y = 0; y < destheight; y++) {
        const uint8_t *row0_ptr = src;
        const uint8_t *row1_ptr = src + srcstride;
        int x = 0;

        /* Vector part: each pass turns 32 source bytes per row into 16 output
           pixels, so x must advance by 16 (not 8) and only whole vectors are
           processed to stay inside both rows. */
        for (; destwidth - x >= 16; x += 16) {
            /* vld2q_u8 splits even and odd columns into val[0] and val[1]. */
            uint8x16x2_t row0 = vld2q_u8(row0_ptr + 2*x);
            uint8x16x2_t row1 = vld2q_u8(row1_ptr + 2*x);
            uint8x16_t sum0 = vhaddq_u8(row0.val[0], row1.val[1]);
            uint8x16_t sum1 = vhaddq_u8(row1.val[0], row0.val[1]);
            uint8x16_t sum = vhaddq_u8(sum0, sum1);
            vst1q_u8(dest + x, sum);
        }

        /* Scalar tail for the last (destwidth % 16) pixels; same truncating
           (a + b) >> 1 arithmetic as vhaddq_u8 so results match bit-for-bit. */
        for (; x < destwidth; x++) {
            unsigned sum0 = ((unsigned)row0_ptr[2*x] + row1_ptr[2*x + 1]) >> 1;
            unsigned sum1 = ((unsigned)row1_ptr[2*x] + row0_ptr[2*x + 1]) >> 1;
            dest[x] = (uint8_t)((sum0 + sum1) >> 1);
        }

        src += 2*srcstride;
        dest += deststride;
    }
}

void neon_decimate3(uint8_t * __restrict dest, int destwidth, int destheight, int deststride,
                    uint8_t * __restrict src, int srcwidth, int srcheight, int srcstride)
{
    for (int y = 0; y < destheight; y++) {
Esempio n. 5
0
// Overload-friendly spelling of vld2q_u8: forwards `ptr` unchanged and
// returns whatever the intrinsic returns.
inline uint8x16x2_t vld2q(const u8 * ptr)
{
    return vld2q_u8(ptr);
}
Esempio n. 6
0
/*
 * Packs separate Y, U and V planes into one interleaved buffer whose bytes
 * repeat in the order U, Y, V, Y.
 *
 * For every row, `size.width` bytes are taken from the U row and from the V
 * row, `2 * size.width` bytes from the Y row, and `4 * size.width` bytes are
 * written to the destination row. Row pointers are obtained through
 * internal::getRowPtr from the respective base pointer and stride.
 *
 * When CAROTENE_NEON is not defined the function only calls
 * internal::assertSupportedConfiguration() and touches none of its arguments.
 */
void combineUYVY(const Size2D &size,
                 const u8 * srcyBase, ptrdiff_t srcyStride,
                 const u8 * srcuBase, ptrdiff_t srcuStride,
                 const u8 * srcvBase, ptrdiff_t srcvStride,
                 u8 * dstBase, ptrdiff_t dstStride)
{
    internal::assertSupportedConfiguration();
#ifdef CAROTENE_NEON
#ifndef __ANDROID__
    // Exclusive bound for the 32-sample path: it may start only where 32
    // more U/V samples are still available.
    const size_t wideLimit = size.width >= 31 ? size.width - 31 : 0;
#endif
    // Same idea for the 8-sample path.
    const size_t narrowLimit = size.width >= 7 ? size.width - 7 : 0;

    for (size_t row = 0u; row < size.height; ++row)
    {
        const u8 * srcy = internal::getRowPtr(srcyBase, srcyStride, row);
        const u8 * srcu = internal::getRowPtr(srcuBase, srcuStride, row);
        const u8 * srcv = internal::getRowPtr(srcvBase, srcvStride, row);
        u8 * dst = internal::getRowPtr(dstBase, dstStride, row);

        size_t uvPos = 0u;   // index into the U and V rows
        size_t yPos = 0u;    // index into the Y row (advances twice as fast)
        size_t outPos = 0u;  // index into the packed output row

#ifndef __ANDROID__
        // Wide path: 32 U/V samples, 64 Y samples and 128 output bytes per
        // pass, handled as two consecutive 16-sample halves.
        while (uvPos < wideLimit)
        {
            internal::prefetch(srcy + yPos);
            internal::prefetch(srcu + uvPos);
            internal::prefetch(srcv + uvPos);

            for (size_t half = 0u; half < 2u; ++half)
            {
                const size_t uvOff = uvPos + 16u * half;

                // De-interleave Y so even samples land in val[0] and odd
                // samples in val[1].
                const uint8x16x2_t yPairs = vld2q_u8(srcy + yPos + 32u * half);

                uint8x16x4_t packed;
                packed.val[0] = vld1q_u8(srcu + uvOff);
                packed.val[1] = yPairs.val[0];
                packed.val[2] = vld1q_u8(srcv + uvOff);
                packed.val[3] = yPairs.val[1];
                vst4q_u8(dst + outPos + 64u * half, packed);
            }

            uvPos += 32;
            yPos += 64;
            outPos += 128;
        }
#endif

        // Narrow path: 8 U/V samples, 16 Y samples and 32 output bytes per pass.
        while (uvPos < narrowLimit)
        {
            const uint8x8x2_t yPairs = vld2_u8(srcy + yPos);

            uint8x8x4_t packed;
            packed.val[0] = vld1_u8(srcu + uvPos);
            packed.val[1] = yPairs.val[0];
            packed.val[2] = vld1_u8(srcv + uvPos);
            packed.val[3] = yPairs.val[1];
            vst4_u8(dst + outPos, packed);

            uvPos += 8;
            yPos += 16;
            outPos += 32;
        }

        // Scalar remainder: one U/V sample (two Y samples) at a time.
        while (uvPos < size.width)
        {
            dst[outPos] = srcu[uvPos];
            dst[outPos + 1] = srcy[yPos];
            dst[outPos + 2] = srcv[uvPos];
            dst[outPos + 3] = srcy[yPos + 1];

            ++uvPos;
            yPos += 2;
            outPos += 4;
        }
    }
#else
    (void)size;
    (void)srcyBase;
    (void)srcyStride;
    (void)srcuBase;
    (void)srcuStride;
    (void)srcvBase;
    (void)srcvStride;
    (void)dstBase;
    (void)dstStride;
#endif
}