void test_vpaddls16 (void)
{
    int32x2_t out_int32x2_t;
    int16x4_t arg0_int16x4_t;

    out_int32x2_t = vpaddl_s16 (arg0_int16x4_t);
}
Пример #2
0
/* return the sum of all elements in an array. This works by calculating 4 totals (one for each lane) and adding those at the end to get the final total */
int sum_array(int16_t *array, int size)
{
     /* initialize the accumulator vector to zero */
     int16x4_t acc = vdup_n_s16(0);
     int32x2_t acc1;
     int64x1_t acc2;
     /* this implementation assumes the size of the array is a multiple of 4 */
     assert((size % 4) == 0);
     /* counting backwards gives better code */
     for (; size != 0; size -= 4)
     {
          int16x4_t vec;
          /* load 4 values in parallel from the array */
          vec = vld1_s16(array);
          /* increment the array pointer to the next element */
          array += 4;
          /* add the vector to the accumulator vector */
          acc = vadd_s16(acc, vec);
      }
      /* calculate the total */
      acc1 = vpaddl_s16(acc);
      acc2 = vpaddl_s32(acc1);
      /* return the total as an integer */
      return (int)vget_lane_s64(acc2, 0);
}
Пример #3
0
static WEBP_INLINE uint32_t Select(const uint32_t* const c0,
                                   const uint32_t* const c1,
                                   const uint32_t* const c2) {
  const uint8x8_t p0 = vreinterpret_u8_u64(vcreate_u64(*c0));
  const uint8x8_t p1 = vreinterpret_u8_u64(vcreate_u64(*c1));
  const uint8x8_t p2 = vreinterpret_u8_u64(vcreate_u64(*c2));
  const uint8x8_t bc = vabd_u8(p1, p2);   // |b-c|
  const uint8x8_t ac = vabd_u8(p0, p2);   // |a-c|
  const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc));
  const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac));
  const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac));
  const int32_t pa_minus_pb = vget_lane_s32(diff, 0);
  return (pa_minus_pb <= 0) ? *c0 : *c1;
}
Пример #4
0
static WEBP_INLINE uint32_t Select(const uint32_t* const c0,
                                   const uint32_t* const c1,
                                   const uint32_t* const c2) {
    const uint64x1_t C0 = { *c0, 0 }, C1 = { *c1, 0 }, C2 = { *c2, 0 };
    const uint8x8_t p0 = vreinterpret_u8_u64(C0);
    const uint8x8_t p1 = vreinterpret_u8_u64(C1);
    const uint8x8_t p2 = vreinterpret_u8_u64(C2);
    const uint8x8_t bc = vabd_u8(p1, p2);   // |b-c|
    const uint8x8_t ac = vabd_u8(p0, p2);   // |a-c|
    const int16x4_t sum_bc = vreinterpret_s16_u16(vpaddl_u8(bc));
    const int16x4_t sum_ac = vreinterpret_s16_u16(vpaddl_u8(ac));
    const int32x2_t diff = vpaddl_s16(vsub_s16(sum_bc, sum_ac));
    int32_t pa_minus_pb;
    vst1_lane_s32(&pa_minus_pb, diff, 0);
    return (pa_minus_pb <= 0) ? *c0 : *c1;
}