/*
 * Exercises the vcvtq_s32_f32 intrinsic: passes a float32x4_t to it and
 * assigns the returned int32x4_t to a local.
 *
 * NOTE(review): arg0_float32x4_t is read without ever being initialised and
 * out_int32x4_t is never used afterwards. Presumably this is a compile-only
 * test whose sole purpose is to make the compiler emit the conversion, so
 * the exact shape may matter; confirm before "fixing" either point.
 */
void test_vcvtQs32_f32 (void) { int32x4_t out_int32x4_t; float32x4_t arg0_float32x4_t; out_int32x4_t = vcvtq_s32_f32 (arg0_float32x4_t); }
/*
 * Rounds each lane of v to a 32-bit signed integer.
 *
 * A bias of 0.5 carrying the lane's own sign is added to every lane, and the
 * sum is then converted with vcvtq_s32_f32 (which rounds toward zero), so
 * the result is approximately "round to nearest, halves away from zero".
 *
 * v: four single-precision lanes to round.
 * Returns the four rounded values as int32 lanes.
 */
inline int32x4_t cv_vrndq_s32_f32(float32x4_t v)
{
    // Sign-bit mask. Built from an unsigned literal because `1 << 31`
    // overflows a signed int. Plain const locals replace the former mutable
    // function-local statics, which needed a dynamic-initialisation guard.
    const int32x4_t v_sign = vreinterpretq_s32_u32(vdupq_n_u32(0x80000000u));
    const int32x4_t v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    // Bit pattern of 0.5f with each lane's sign bit copied from v.
    const int32x4_t v_addition =
        vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(v)));

    return vcvtq_s32_f32(vaddq_f32(v, vreinterpretq_f32_s32(v_addition)));
}
/*
 * Rounds each lane of a to a 32-bit signed integer.
 *
 * Adds 0.5 carrying the lane's own sign, then converts with vcvtq_s32_f32
 * (which rounds toward zero); the net effect is approximately "round to
 * nearest, halves away from zero".
 *
 * a: wrapper whose .val member holds the four float lanes.
 * Returns a v_int32x4 built from the four rounded lanes.
 */
inline v_int32x4 v_round(const v_float32x4& a)
{
    // Sign-bit mask. Built from an unsigned literal because `1 << 31`
    // overflows a signed int. Ordinary const locals are used so that no
    // static-initialisation guard is needed on each call.
    const int32x4_t v_sign = vreinterpretq_s32_u32(vdupq_n_u32(0x80000000u));
    const int32x4_t v_05 = vreinterpretq_s32_f32(vdupq_n_f32(0.5f));

    // Bit pattern of 0.5f with each lane's sign bit copied from the input.
    const int32x4_t v_addition =
        vorrq_s32(v_05, vandq_s32(v_sign, vreinterpretq_s32_f32(a.val)));

    return v_int32x4(vcvtq_s32_f32(vaddq_f32(a.val, vreinterpretq_f32_s32(v_addition))));
}
/*
 * Transforms `length` 4-byte pixels from src into dest.
 *
 * For each pixel the first three source bytes index the per-channel input
 * gamma tables; each looked-up value is broadcast and multiplied by one row
 * of transform->matrix, and the three products are summed. The sum is
 * clamped between `zero` and `clampMaxValue`, scaled by `floatScale` and
 * converted to integers, whose lanes 0..2 index the three output tables.
 * The fourth source byte is copied to the output unchanged.
 *
 * transform: supplies the matrix, input gamma tables and output tables.
 * src:       input pixels, 4 bytes each.
 * dest:      output pixels, 4 bytes each.
 * length:    number of pixels to convert; 0 writes nothing.
 */
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform, unsigned char *src, unsigned char *dest, size_t length)
{
    float32_t (*matrix)[4] = transform->matrix;

    const float32_t *gamma_r = (float32_t*)transform->input_gamma_table_r;
    const float32_t *gamma_g = (float32_t*)transform->input_gamma_table_g;
    const float32_t *gamma_b = (float32_t*)transform->input_gamma_table_b;

    const uint8_t *out_r = &transform->output_table_r->data[0];
    const uint8_t *out_g = &transform->output_table_g->data[0];
    const uint8_t *out_b = &transform->output_table_b->data[0];

    const float32x4_t row0 = vld1q_f32(matrix[0]);
    const float32x4_t row1 = vld1q_f32(matrix[1]);
    const float32x4_t row2 = vld1q_f32(matrix[2]);

    const float32x4_t upper = vld1q_dup_f32(&clampMaxValue);
    const float32x4_t lower = vld1q_dup_f32(&zero);
    const float32x4_t scale = vld1q_dup_f32(&floatScale);

    size_t remaining;

    for (remaining = length; remaining != 0; remaining--) {
        /* Read the whole source pixel before anything is written. */
        const float32x4_t lin_r = vld1q_dup_f32(&gamma_r[src[0]]);
        const float32x4_t lin_g = vld1q_dup_f32(&gamma_g[src[1]]);
        const float32x4_t lin_b = vld1q_dup_f32(&gamma_b[src[2]]);
        const unsigned char alpha = src[3];

        /* Broadcast gamma value times its matrix row, one per channel. */
        const float32x4_t part_r = vmulq_f32(lin_r, row0);
        const float32x4_t part_g = vmulq_f32(lin_g, row1);
        const float32x4_t part_b = vmulq_f32(lin_b, row2);

        /* Sum the contributions, then clamp to [zero, clampMaxValue]. */
        float32x4_t mixed = vaddq_f32(part_r, vaddq_f32(part_g, part_b));
        int32x4_t index;

        mixed = vmaxq_f32(lower, mixed);
        mixed = vminq_f32(upper, mixed);

        /* Scale to table indices and convert to integers. */
        index = vcvtq_s32_f32(vmulq_f32(mixed, scale));

        dest[0] = out_r[vgetq_lane_s32(index, 0)];
        dest[1] = out_g[vgetq_lane_s32(index, 1)];
        dest[2] = out_b[vgetq_lane_s32(index, 2)];
        dest[3] = alpha;

        src += 4;
        dest += 4;
    }
}
/*
 * Packs four float RGBA vectors into one 16-byte vector of four 4-byte
 * pixels (rgba0 in bytes 0..3, rgba1 in 4..7, and so on).
 *
 * Each lane goes float -> int32 -> int16 -> uint8. Both narrowing steps use
 * the saturating vqmovn/vqmovun intrinsics, which is why no explicit
 * min/max clamp to 0..255 is applied here.
 */
static inline uint8x16_t condense_float_rgbas(float32x4_t rgba0, float32x4_t rgba1,
                                              float32x4_t rgba2, float32x4_t rgba3)
{
    /* float -> int32 -> saturated int16, one pixel at a time */
    const int16x4_t px0 = vqmovn_s32(vcvtq_s32_f32(rgba0));
    const int16x4_t px1 = vqmovn_s32(vcvtq_s32_f32(rgba1));
    const int16x4_t px2 = vqmovn_s32(vcvtq_s32_f32(rgba2));
    const int16x4_t px3 = vqmovn_s32(vcvtq_s32_f32(rgba3));

    /* pair the pixels up, then saturate int16 -> unsigned 8-bit */
    const uint8x8_t first_pair = vqmovun_s16(vcombine_s16(px0, px1));
    const uint8x8_t second_pair = vqmovun_s16(vcombine_s16(px2, px3));

    return vcombine_u8(first_pair, second_pair);
}
/*
 * Performs one rotation/translation on four lanes at once.
 *
 * Computes a_4 * x_4 + b_4 * y_4 + pos_4f + point5_4 lane by lane, converts
 * the sums to 32-bit integers and stores the four results at `result`.
 *
 * point5_4: extra additive term applied just before the integer conversion
 *           (presumably 0.5 per lane, for rounding; confirm with callers).
 * result:   destination for the four converted values.
 */
static void neon_coord_4(float32x4_t a_4, float32x4_t b_4, float32x4_t x_4, float32x4_t y_4, float32x4_t pos_4f, float32x4_t point5_4, int * result)
{
    const float32x4_t ax = vmulq_f32(a_4, x_4);
    const float32x4_t by = vmulq_f32(b_4, y_4);

    /* Same left-to-right summation order: (ax + by) + pos, then + point5. */
    const float32x4_t translated = vaddq_f32(vaddq_f32(ax, by), pos_4f);
    const float32x4_t biased = vaddq_f32(translated, point5_4);

    vst1q_s32(result, vcvtq_s32_f32(biased));
}
// Lane-wise approximation of log2(x).
//
// x is decomposed as x = y * 2^n with n an integer and y in [1.0, 2.0), so
// log2(x) = n + log2(y). n is recovered from the float's bit pattern and
// log2(y) is approximated as (y - 1) * pol5(y), with pol5 an order-five
// polynomial. Per the original notes, the coefficients were estimated with
// the Remez algorithm and the polynomial has a maximum relative error of
// 0.00086%.
static float32x4_t vpowq_f32_log2(float32x4_t x) {
  const uint32x4_t x_bits = vreinterpretq_u32_f32(x);

  // n: isolate the exponent field, shift it into the top of the mantissa,
  // place eight in the biased exponent to compensate for that shift, then
  // subtract out the implicit leading one together with the bias.
  const uint32x4_t exponent_mask = vdupq_n_u32(0x7F800000);
  const uint32x4_t eight_biased_exponent = vdupq_n_u32(0x43800000);
  const uint32x4_t implicit_leading_one = vdupq_n_u32(0x43BF8000);
  const uint32x4_t exponent_bits = vandq_u32(x_bits, exponent_mask);
  const uint32x4_t exponent_in_mantissa =
      vshrq_n_u32(exponent_bits, kShiftExponentIntoTopMantissa);
  const float32x4_t n = vsubq_f32(
      vreinterpretq_f32_u32(
          vorrq_u32(exponent_in_mantissa, eight_biased_exponent)),
      vreinterpretq_f32_u32(implicit_leading_one));

  // y: keep the mantissa bits and force the exponent field to that of 1.0f.
  const uint32x4_t mantissa_mask = vdupq_n_u32(0x007FFFFF);
  const uint32x4_t one_bits = vdupq_n_u32(0x3F800000);
  const float32x4_t y = vreinterpretq_f32_u32(
      vorrq_u32(vandq_u32(x_bits, mantissa_mask), one_bits));

  // pol5(y) = C5*y^5 + C4*y^4 + C3*y^3 + C2*y^2 + C1*y + C0, evaluated with
  // Horner's scheme; each step computes coefficient + y * accumulator.
  float32x4_t poly = vdupq_n_f32(-3.4436006e-2f);          // C5
  poly = vmlaq_f32(vdupq_n_f32(3.1821337e-1f), y, poly);   // C4
  poly = vmlaq_f32(vdupq_n_f32(-1.2315303f), y, poly);     // C3
  poly = vmlaq_f32(vdupq_n_f32(2.5988452f), y, poly);      // C2
  poly = vmlaq_f32(vdupq_n_f32(-3.3241990f), y, poly);     // C1
  poly = vmlaq_f32(vdupq_n_f32(3.1157899f), y, poly);      // C0

  // log2(x) = n + (y - 1) * pol5(y)
  const float32x4_t y_minus_one =
      vsubq_f32(y, vreinterpretq_f32_u32(one_bits));
  return vaddq_f32(n, vmulq_f32(y_minus_one, poly));
}

// Lane-wise approximation of 2^x.
//
// x is first limited to ]-127, 129] to avoid over/underflow and then split
// as x = n + y, where n is (x - 0.5) converted to an integer. 2^n is built
// directly in the float exponent field and 2^y is approximated by an
// order-two polynomial. Per the original notes, the coefficients were
// estimated with the Remez algorithm, y is expected in [0.5, 1.5) and the
// maximum relative error is 0.17%.
//
// NOTE(review): the original notes describe n as "x - 0.5 rounded down",
// but vcvtq_s32_f32 converts by truncating toward zero, so for negative
// inputs y presumably lands outside [0.5, 1.5). Confirm that the accuracy
// is acceptable there before relying on the stated error bound.
static float32x4_t vpowq_f32_exp2(float32x4_t x) {
  // Clamp: upper bound first, then lower bound.
  const float32x4_t upper_bound = vdupq_n_f32(129.f);
  const float32x4_t lower_bound = vdupq_n_f32(-126.99999f);
  const float32x4_t clamped =
      vmaxq_f32(vminq_f32(x, upper_bound), lower_bound);

  // n = int(x - 0.5)
  const int32x4_t n = vcvtq_s32_f32(vsubq_f32(clamped, vdupq_n_f32(0.5f)));

  // 2^n: add the exponent bias and shift into the exponent field.
  const int32x4_t biased_exponent = vaddq_s32(n, vdupq_n_s32(127));
  const float32x4_t two_pow_n =
      vreinterpretq_f32_s32(vshlq_n_s32(biased_exponent, kFloatExponentShift));

  // y = x - n
  const float32x4_t y = vsubq_f32(clamped, vcvtq_f32_s32(n));

  // 2^y ~= C2*y^2 + C1*y + C0 (Horner's scheme).
  float32x4_t poly = vdupq_n_f32(3.3718944e-1f);           // C2
  poly = vmlaq_f32(vdupq_n_f32(6.5763628e-1f), y, poly);   // C1
  poly = vmlaq_f32(vdupq_n_f32(1.0017247f), y, poly);      // C0

  return vmulq_f32(poly, two_pow_n);
}

// Lane-wise approximation of a^b, computed as exp2(b * log2(a)) with
// polynomial approximations of log2 and exp2.
static float32x4_t vpowq_f32(float32x4_t a, float32x4_t b) {
  const float32x4_t b_log2_a = vmulq_f32(b, vpowq_f32_log2(a));
  return vpowq_f32_exp2(b_log2_a);
}
// Converts each float lane of `a` to a 32-bit signed integer with
// vcvtq_s32_f32 and wraps the result in a v_int32x4.
inline v_int32x4 v_trunc(const v_float32x4& a)
{
    const int32x4_t truncated = vcvtq_s32_f32(a.val);
    return v_int32x4(truncated);
}
// Returns, per lane, the smallest 32-bit integer not less than the input.
//
// The input is first converted with vcvtq_s32_f32 (rounds toward zero);
// lanes where the original value is still greater than the converted one
// are then bumped up by one.
inline v_int32x4 v_ceil(const v_float32x4& a)
{
    const int32x4_t truncated = vcvtq_s32_f32(a.val);

    // All-ones (i.e. -1 as a signed lane) where truncation dropped a
    // positive fractional part, zero elsewhere.
    const uint32x4_t needs_bump = vcgtq_f32(a.val, vcvtq_f32_s32(truncated));

    // Subtracting -1 adds one. The saturating form is used so that a lane
    // whose conversion already saturated at INT32_MAX stays there instead
    // of wrapping around to INT32_MIN.
    return v_int32x4(vqsubq_s32(truncated, vreinterpretq_s32_u32(needs_bump)));
}
// Returns, per lane, the largest 32-bit integer not greater than the input.
//
// The input is first converted with vcvtq_s32_f32 (rounds toward zero);
// lanes where the converted value ended up greater than the original are
// then lowered by one.
inline v_int32x4 v_floor(const v_float32x4& a)
{
    const int32x4_t truncated = vcvtq_s32_f32(a.val);

    // All-ones (i.e. -1 as a signed lane) where truncation moved a negative
    // value upward, zero elsewhere.
    const uint32x4_t needs_drop = vcgtq_f32(vcvtq_f32_s32(truncated), a.val);

    // Adding -1 subtracts one. The saturating form is used so that a lane
    // whose conversion already saturated at INT32_MIN stays there instead
    // of wrapping around to INT32_MAX.
    return v_int32x4(vqaddq_s32(truncated, vreinterpretq_s32_u32(needs_drop)));
}