示例#1
0
文件: vcvtQs32_f32.c 项目: pjump/gcc
void test_vcvtQs32_f32 (void)
{
    int32x4_t out_int32x4_t;
    float32x4_t arg0_float32x4_t;

    out_int32x4_t = vcvtq_s32_f32 (arg0_float32x4_t);
}
示例#2
0
文件: base.hpp 项目: AdLantis/opencv
inline int32x4_t cv_vrndq_s32_f32(float32x4_t v)
{
    static int32x4_t v_sign = vdupq_n_s32(1 << 31),
        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v)));
    return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition)));
}
示例#3
0
inline v_int32x4 v_round(const v_float32x4& a)
{
    static const int32x4_t v_sign = vdupq_n_s32(1 << 31),
        v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    int32x4_t v_addition = vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));
    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
示例#4
0
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
  size_t i;
  unsigned char alpha;
  float32_t (*mat)[4] = transform->matrix;

  const float32_t *igtbl_r = (float32_t*)transform->input_gamma_table_r;
  const float32_t *igtbl_g = (float32_t*)transform->input_gamma_table_g;
  const float32_t *igtbl_b = (float32_t*)transform->input_gamma_table_b;

  const uint8_t *otdata_r = &transform->output_table_r->data[0];
  const uint8_t *otdata_g = &transform->output_table_g->data[0];
  const uint8_t *otdata_b = &transform->output_table_b->data[0];

  const float32x4_t mat0 = vld1q_f32(mat[0]);
  const float32x4_t mat1 = vld1q_f32(mat[1]);
  const float32x4_t mat2 = vld1q_f32(mat[2]);

  const float32x4_t max   = vld1q_dup_f32(&clampMaxValue);
  const float32x4_t min   = vld1q_dup_f32(&zero);
  const float32x4_t scale = vld1q_dup_f32(&floatScale);

  float32x4_t vec_r, vec_g, vec_b;
  int32x4_t result;

  /* CYA */
  if (!length)
    return;

  for (i = 0; i < length; i++) {
    /* setup for transforming the pixel */
    vec_r = vld1q_dup_f32(&igtbl_r[*src++]);
    vec_g = vld1q_dup_f32(&igtbl_g[*src++]);
    vec_b = vld1q_dup_f32(&igtbl_b[*src++]);
    alpha = *src++;

    /* gamma * matrix */
    vec_r = vmulq_f32(vec_r, mat0);
    vec_g = vmulq_f32(vec_g, mat1);
    vec_b = vmulq_f32(vec_b, mat2);

    /* crunch, crunch, crunch */
    vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b));
    vec_r = vmaxq_f32(min, vec_r);
    vec_r = vminq_f32(max, vec_r);
    result = vcvtq_s32_f32(vmulq_f32(vec_r, scale));

    /* use calc'd indices to output RGB values */
    *dest++ = otdata_r[vgetq_lane_s32(result, 0)];
    *dest++ = otdata_g[vgetq_lane_s32(result, 1)];
    *dest++ = otdata_b[vgetq_lane_s32(result, 2)];
    *dest++ = alpha;
  }
}
static inline uint8x16_t condense_float_rgbas(float32x4_t rgba0,
					      float32x4_t rgba1,  
					      float32x4_t rgba2,
					      float32x4_t rgba3)
{
  uint8x16_t retval = {0};  /* 16 bytes as 4 4-byte RGBAs  */
  int32x4_t i32pixels0, i32pixels1, i32pixels2, i32pixels3;
  int16x4_t i16pixels0, i16pixels1, i16pixels2, i16pixels3;
  int16x8_t i16pixels01, i16pixels23;
  uint8x8_t u8pixels0, u8pixels1;
  
  /* the choice of saturating conversions here will turn the elements  */
  /* of the rgbaN vectors into unsigned chars (0 - 255), so no max/min  */
  /* is required here.   */


  /* first float to int  */
  i32pixels0 = vcvtq_s32_f32(rgba0);
  i32pixels1 = vcvtq_s32_f32(rgba1);
  i32pixels2 = vcvtq_s32_f32(rgba2);
  i32pixels3 = vcvtq_s32_f32(rgba3);

  /* then int to short  */
  i16pixels0 = vqmovn_s32(i32pixels0);
  i16pixels1 = vqmovn_s32(i32pixels1);
  i16pixels2 = vqmovn_s32(i32pixels2);
  i16pixels3 = vqmovn_s32(i32pixels3);
  
  i16pixels01 = vcombine_s16(i16pixels0, i16pixels1);
  i16pixels23 = vcombine_s16(i16pixels2, i16pixels3);

  /* now short to unsigned int. saturation takes care of the boundary cases  */
  u8pixels0 = vqmovun_s16(i16pixels01);
  u8pixels1 = vqmovun_s16(i16pixels23);
  
  retval = vcombine_u8(u8pixels0, u8pixels1);

  return(retval);
}
/* Performs one rotation/translation */
static void 
neon_coord_4(
    float32x4_t a_4, 
    float32x4_t b_4,
    float32x4_t x_4, 
    float32x4_t y_4,
    float32x4_t pos_4f, 
    float32x4_t point5_4, 
    int * result)
{
    float32x4_t tmp1 = vmulq_f32(a_4, x_4);
    float32x4_t tmp2 = vmulq_f32(b_4, y_4);
    tmp2 = vaddq_f32(tmp1, tmp2);
    tmp2 = vaddq_f32(tmp2, pos_4f);
    tmp2 = vaddq_f32(tmp2, point5_4);
    int32x4_t c_4 = vcvtq_s32_f32(tmp2);
    vst1q_s32(result, c_4);
}
示例#7
0
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  // a^b = exp2(b * log2(a))
  //   exp2(x) and log2(x) are calculated using polynomial approximations.
  float32x4_t log2_a, b_log2_a, a_exp_b;

  // Calculate log2(x), x = a.
  {
    // To calculate log2(x), we decompose x like this:
    //   x = y * 2^n
    //     n is an integer
    //     y is in the [1.0, 2.0) range
    //
    //   log2(x) = log2(y) + n
    //     n       can be evaluated by playing with float representation.
    //     log2(y) in a small range can be approximated, this code uses an order
    //             five polynomial approximation. The coefficients have been
    //             estimated with the Remez algorithm and the resulting
    //             polynomial has a maximum relative error of 0.00086%.

    // Compute n.
    //    This is done by masking the exponent, shifting it into the top bit of
    //    the mantissa, putting eight into the biased exponent (to shift/
    //    compensate the fact that the exponent has been shifted in the top/
    //    fractional part and finally getting rid of the implicit leading one
    //    from the mantissa by substracting it out.
    const uint32x4_t vec_float_exponent_mask = vdupq_n_u32(0x7F800000);
    const uint32x4_t vec_eight_biased_exponent = vdupq_n_u32(0x43800000);
    const uint32x4_t vec_implicit_leading_one = vdupq_n_u32(0x43BF8000);
    const uint32x4_t two_n = vandq_u32(vreinterpretq_u32_f32(a),
                                       vec_float_exponent_mask);
    const uint32x4_t n_1 = vshrq_n_u32(two_n, kShiftExponentIntoTopMantissa);
    const uint32x4_t n_0 = vorrq_u32(n_1, vec_eight_biased_exponent);
    const float32x4_t n =
        vsubq_f32(vreinterpretq_f32_u32(n_0),
                  vreinterpretq_f32_u32(vec_implicit_leading_one));
    // Compute y.
    const uint32x4_t vec_mantissa_mask = vdupq_n_u32(0x007FFFFF);
    const uint32x4_t vec_zero_biased_exponent_is_one = vdupq_n_u32(0x3F800000);
    const uint32x4_t mantissa = vandq_u32(vreinterpretq_u32_f32(a),
                                          vec_mantissa_mask);
    const float32x4_t y =
        vreinterpretq_f32_u32(vorrq_u32(mantissa,
                                        vec_zero_biased_exponent_is_one));
    // Approximate log2(y) ~= (y - 1) * pol5(y).
    //    pol5(y) = C5 * y^5 + C4 * y^4 + C3 * y^3 + C2 * y^2 + C1 * y + C0
    const float32x4_t C5 = vdupq_n_f32(-3.4436006e-2f);
    const float32x4_t C4 = vdupq_n_f32(3.1821337e-1f);
    const float32x4_t C3 = vdupq_n_f32(-1.2315303f);
    const float32x4_t C2 = vdupq_n_f32(2.5988452f);
    const float32x4_t C1 = vdupq_n_f32(-3.3241990f);
    const float32x4_t C0 = vdupq_n_f32(3.1157899f);
    float32x4_t pol5_y = C5;
    pol5_y = vmlaq_f32(C4, y, pol5_y);
    pol5_y = vmlaq_f32(C3, y, pol5_y);
    pol5_y = vmlaq_f32(C2, y, pol5_y);
    pol5_y = vmlaq_f32(C1, y, pol5_y);
    pol5_y = vmlaq_f32(C0, y, pol5_y);
    const float32x4_t y_minus_one =
        vsubq_f32(y, vreinterpretq_f32_u32(vec_zero_biased_exponent_is_one));
    const float32x4_t log2_y = vmulq_f32(y_minus_one, pol5_y);

    // Combine parts.
    log2_a = vaddq_f32(n, log2_y);
  }

  // b * log2(a)
  b_log2_a = vmulq_f32(b, log2_a);

  // Calculate exp2(x), x = b * log2(a).
  {
    // To calculate 2^x, we decompose x like this:
    //   x = n + y
    //     n is an integer, the value of x - 0.5 rounded down, therefore
    //     y is in the [0.5, 1.5) range
    //
    //   2^x = 2^n * 2^y
    //     2^n can be evaluated by playing with float representation.
    //     2^y in a small range can be approximated, this code uses an order two
    //         polynomial approximation. The coefficients have been estimated
    //         with the Remez algorithm and the resulting polynomial has a
    //         maximum relative error of 0.17%.
    // To avoid over/underflow, we reduce the range of input to ]-127, 129].
    const float32x4_t max_input = vdupq_n_f32(129.f);
    const float32x4_t min_input = vdupq_n_f32(-126.99999f);
    const float32x4_t x_min = vminq_f32(b_log2_a, max_input);
    const float32x4_t x_max = vmaxq_f32(x_min, min_input);
    // Compute n.
    const float32x4_t half = vdupq_n_f32(0.5f);
    const float32x4_t x_minus_half = vsubq_f32(x_max, half);
    const int32x4_t x_minus_half_floor = vcvtq_s32_f32(x_minus_half);

    // Compute 2^n.
    const int32x4_t float_exponent_bias = vdupq_n_s32(127);
    const int32x4_t two_n_exponent =
        vaddq_s32(x_minus_half_floor, float_exponent_bias);
    const float32x4_t two_n =
        vreinterpretq_f32_s32(vshlq_n_s32(two_n_exponent, kFloatExponentShift));
    // Compute y.
    const float32x4_t y = vsubq_f32(x_max, vcvtq_f32_s32(x_minus_half_floor));

    // Approximate 2^y ~= C2 * y^2 + C1 * y + C0.
    const float32x4_t C2 = vdupq_n_f32(3.3718944e-1f);
    const float32x4_t C1 = vdupq_n_f32(6.5763628e-1f);
    const float32x4_t C0 = vdupq_n_f32(1.0017247f);
    float32x4_t exp2_y = C2;
    exp2_y = vmlaq_f32(C1, y, exp2_y);
    exp2_y = vmlaq_f32(C0, y, exp2_y);

    // Combine parts.
    a_exp_b = vmulq_f32(exp2_y, two_n);
  }

  return a_exp_b;
}
示例#8
0
inline v_int32x4 v_trunc(const v_float32x4& a)
{ return v_int32x4(vcvtq_s32_f32(a.val)); }
示例#9
0
inline v_int32x4 v_ceil(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(a.val, vcvtq_f32_s32(a1));
    return v_int32x4(vsubq_s32(a1, vreinterpretq_s32_u32(mask)));
}
示例#10
0
inline v_int32x4 v_floor(const v_float32x4& a)
{
    int32x4_t a1 = vcvtq_s32_f32(a.val);
    uint32x4_t mask = vcgtq_f32(vcvtq_f32_s32(a1), a.val);
    return v_int32x4(vaddq_s32(a1, vreinterpretq_s32_u32(mask)));
}