Exemplo n.º 1
0
static void ConvertARGBToUV_NEON(const uint32_t* argb, uint8_t* u, uint8_t* v,
                                 int src_width, int do_store) {
  int i;
  for (i = 0; i + 16 <= src_width; i += 16, u += 8, v += 8) {
    const uint8x16x4_t RGB = vld4q_u8((const uint8_t*)&argb[i]);
    const uint16x8_t R = vpaddlq_u8(RGB.val[2]);  // pair-wise adds
    const uint16x8_t G = vpaddlq_u8(RGB.val[1]);
    const uint16x8_t B = vpaddlq_u8(RGB.val[0]);
    int16x8_t U_tmp, V_tmp;
    CONVERT_RGB_TO_UV(R, G, B, 1, U_tmp, V_tmp);
    {
      const uint8x8_t U = vqrshrun_n_s16(U_tmp, 1);
      const uint8x8_t V = vqrshrun_n_s16(V_tmp, 1);
      if (do_store) {
        vst1_u8(u, U);
        vst1_u8(v, V);
      } else {
        const uint8x8_t prev_u = vld1_u8(u);
        const uint8x8_t prev_v = vld1_u8(v);
        vst1_u8(u, vrhadd_u8(U, prev_u));
        vst1_u8(v, vrhadd_u8(V, prev_v));
      }
    }
  }
  if (i < src_width) {  // left-over
    WebPConvertARGBToUV_C(argb + i, u, v, src_width - i, do_store);
  }
}
Exemplo n.º 2
0
// Extracts the first byte of every 4-byte pixel in `source` into `dest`.
//
//   source: interleaved input, read at byte offsets 0, 4, 8, ... (at least
//           4 * size bytes are accessed by the NEON path).
//   dest:   output buffer receiving exactly `size` bytes.
//   size:   number of pixels to process; values <= 0 write nothing.
//
// `size` may be any non-negative pixel count: it does not have to be a
// multiple of 4, 8 or 16, and nothing is written past dest[size - 1].
void dmz_deinterleave_RGBA_to_R(uint8_t *source, uint8_t *dest, int size) {
#if DMZ_HAS_NEON_COMPILETIME
  // The overlapping-tail trick below re-processes the last 16 pixels, so it
  // needs at least one full vector. Shorter inputs take the scalar path
  // instead of relying on an assert that disappears in release builds.
  if (size >= 16 && dmz_has_neon_runtime()) {
    for (int offset = 0; offset + 15 < size; offset += 16) {
      uint8x16x4_t r1 = vld4q_u8(&source[offset * 4]);
      vst1q_u8(&dest[offset], r1.val[0]);
    }

    // use "overlapping" to process the remaining bytes
    // See http://community.arm.com/groups/processors/blog/2010/05/10/coding-for-neon--part-2-dealing-with-leftovers
    if (size % 16 > 0) {
      int offset = size - 16;
      uint8x16x4_t r1 = vld4q_u8(&source[offset * 4]);
      vst1q_u8(&dest[offset], r1.val[0]);
    }
    return;
  }
#endif

  int offset = 0;

  // Main scalar loop, unrolled eight pixels at a time.
  for (; offset + 7 < size; offset += 8) {
    int bufferOffset = offset * 4;
    dest[offset] = source[bufferOffset];
    dest[offset + 1] = source[bufferOffset + (1 * 4)];
    dest[offset + 2] = source[bufferOffset + (2 * 4)];
    dest[offset + 3] = source[bufferOffset + (3 * 4)];
    dest[offset + 4] = source[bufferOffset + (4 * 4)];
    dest[offset + 5] = source[bufferOffset + (5 * 4)];
    dest[offset + 6] = source[bufferOffset + (6 * 4)];
    dest[offset + 7] = source[bufferOffset + (7 * 4)];
  }

  // Tail: `size` counts pixels, so up to seven may remain and the remainder is
  // not necessarily a multiple of four. Copy them one at a time so that
  // neither buffer is accessed beyond `size` pixels.
  for (; offset < size; offset++) {
    dest[offset] = source[offset * 4];
  }
}
Exemplo n.º 3
0
// Counts the set bits in data[0 .. size).
//
// The bulk of the input is processed in 128-byte chunks: two de-interleaving
// 64-byte NEON loads, a per-byte population count (vcnt) on each of the eight
// resulting registers, and a widening accumulation into four 32-bit lanes.
// The remaining (size % 128) bytes are counted through the lookup8bit table.
//
// Returns the total number of set bits as a 64-bit value.
//
// NOTE: the vector accumulator still uses 32-bit lanes. Each chunk adds at
// most 256 to a lane, so the lanes cannot wrap for inputs below 2 GiB.
uint64_t popcnt_neon_vcnt(const uint8_t* data, const size_t size)
{
    const size_t chunk_size = 16 * 4 * 2;

    // The data is only read, so no const_cast is required.
    const uint8_t* ptr = data;

    const size_t n = size / chunk_size;
    const size_t k = size % chunk_size;

    uint32x4_t sum = vcombine_u32(vcreate_u32(0), vcreate_u32(0));

    for (size_t i=0; i < n; i++, ptr += chunk_size) {

        uint8x16x4_t input0 = vld4q_u8(ptr + 0 * 16 * 4);
        uint8x16x4_t input1 = vld4q_u8(ptr + 1 * 16 * 4);

        // Eight per-byte counts of at most 8 each: the byte-wise sum is <= 64
        // and therefore cannot overflow an 8-bit lane.
        uint8x16_t t0   = vcntq_u8(input0.val[0]);
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input0.val[3]));

        t0 = vaddq_u8(t0, vcntq_u8(input1.val[0]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[1]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[2]));
        t0 = vaddq_u8(t0, vcntq_u8(input1.val[3]));

        // Widen 8 -> 16 bits, then pair-wise add-accumulate into 32-bit lanes.
        const uint16x8_t t1 = vpaddlq_u8(t0);

        sum = vpadalq_u16(sum, t1);
    }

    // Accumulate the final result in 64 bits so that the sum of the four
    // lanes (and of the tail) matches the width of the return type instead of
    // wrapping at 2^32.
    uint64_t scalar = 0;
    uint32_t tmp[4];

    vst1q_u32(tmp, sum);
    for (int i=0; i < 4; i++) {
        scalar += tmp[i];
    }

    // Tail bytes that did not fill a whole chunk.
    for (size_t j=0; j < k; j++) {
        scalar += lookup8bit[ptr[j]];
    }

    return scalar;
}
Exemplo n.º 4
0
static void ConvertBGRAToRGB(const uint32_t* src,
                             int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    const uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    const uint8x16x3_t tmp = { { pixel.val[2], pixel.val[1], pixel.val[0] } };
    vst3q_u8(dst, tmp);
    dst += 48;
  }
  VP8LConvertBGRAToRGB_C(src, num_pixels & 15, dst);  // left-overs
}
Exemplo n.º 5
0
// gcc 4.6.0 had some trouble (NDK-r9) with this code. We only use it for
// gcc-4.8.x at least.
static void ConvertBGRAToRGBA(const uint32_t* src,
                              int num_pixels, uint8_t* dst) {
  const uint32_t* const end = src + (num_pixels & ~15);
  for (; src < end; src += 16) {
    uint8x16x4_t pixel = vld4q_u8((uint8_t*)src);
    // swap B and R. (VSWP d0,d2 has no intrinsics equivalent!)
    const uint8x16_t tmp = pixel.val[0];
    pixel.val[0] = pixel.val[2];
    pixel.val[2] = tmp;
    vst4q_u8(dst, pixel);
    dst += 64;
  }
  VP8LConvertBGRAToRGBA_C(src, num_pixels & 15, dst);  // left-overs
}
Exemplo n.º 6
0
void FilterEffect::forceValidPreMultipliedPixels()
{
    // Must operate on pre-multiplied results; other formats cannot have invalid pixels.
    if (!m_premultipliedImageResult)
        return;

    Uint8ClampedArray* imageArray = m_premultipliedImageResult.get();
    unsigned char* pixelData = imageArray->data();
    int pixelArrayLength = imageArray->length();

    // We must have four bytes per pixel, and complete pixels
    ASSERT(!(pixelArrayLength % 4));

#if HAVE(ARM_NEON_INTRINSICS)
    if (pixelArrayLength >= 64) {
        unsigned char* lastPixel = pixelData + (pixelArrayLength & ~0x3f);
        do {
            // Increments pixelData by 64.
            uint8x16x4_t sixteenPixels = vld4q_u8(pixelData);
            sixteenPixels.val[0] = vminq_u8(sixteenPixels.val[0], sixteenPixels.val[3]);
            sixteenPixels.val[1] = vminq_u8(sixteenPixels.val[1], sixteenPixels.val[3]);
            sixteenPixels.val[2] = vminq_u8(sixteenPixels.val[2], sixteenPixels.val[3]);
            vst4q_u8(pixelData, sixteenPixels);
            pixelData += 64;
        } while (pixelData < lastPixel);

        pixelArrayLength &= 0x3f;
        if (!pixelArrayLength)
            return;
    }
#endif

    int numPixels = pixelArrayLength / 4;

    // Iterate over each pixel, checking alpha and adjusting color components if necessary
    while (--numPixels >= 0) {
        // Alpha is the 4th byte in a pixel
        unsigned char a = *(pixelData + 3);
        // Clamp each component to alpha, and increment the pixel location
        for (int i = 0; i < 3; ++i) {
            if (*pixelData > a)
                *pixelData = a;
            ++pixelData;
        }
        // Increment for alpha
        ++pixelData;
    }
}
Exemplo n.º 7
0
/* Routine optimized for shuffling a buffer for a type size of 4 bytes. */
static void
shuffle4_neon(uint8_t* const dest, const uint8_t* const src,
              const size_t vectorizable_elements, const size_t total_elements) {
  size_t i, j, k;
  static const size_t bytesoftype = 4;
  uint8x16x4_t r0;

  for (i = 0, k = 0; i < vectorizable_elements * bytesoftype; i += 64, k++) {
    /* Load (and permute) 64 bytes to the structure r0 */
    r0 = vld4q_u8(src + i);
    /* Store the results in the destination vector */
    for (j = 0; j < 4; j++) {
      vst1q_u8(dest + total_elements * j + k * 16, r0.val[j]);
    }
  }
}
Exemplo n.º 8
0
// Suffix-free wrapper that forwards a u8 pointer to the vld4q_u8 intrinsic
// and returns its four-register result unchanged.
inline uint8x16x4_t vld4q(const u8* ptr)
{
    return vld4q_u8(ptr);
}
Exemplo n.º 9
0
/* Invokes the vld4q_u8 intrinsic with a literal 0 address and keeps the
   result in a local variable that is not used afterwards. */
void test_vld4Qu8 (void)
{
  uint8x16x4_t out_uint8x16x4_t = vld4q_u8 (0);
}