Example 1
void qcms_transform_data_rgba_out_lut_neon(qcms_transform *transform,
                                           unsigned char *src,
                                           unsigned char *dest,
                                           size_t length)
{
  size_t i;
  unsigned char alpha;
  float32_t (*mat)[4] = transform->matrix;

  const float32_t *igtbl_r = (float32_t*)transform->input_gamma_table_r;
  const float32_t *igtbl_g = (float32_t*)transform->input_gamma_table_g;
  const float32_t *igtbl_b = (float32_t*)transform->input_gamma_table_b;

  const uint8_t *otdata_r = &transform->output_table_r->data[0];
  const uint8_t *otdata_g = &transform->output_table_g->data[0];
  const uint8_t *otdata_b = &transform->output_table_b->data[0];

  const float32x4_t mat0 = vld1q_f32(mat[0]);
  const float32x4_t mat1 = vld1q_f32(mat[1]);
  const float32x4_t mat2 = vld1q_f32(mat[2]);

  const float32x4_t max   = vld1q_dup_f32(&clampMaxValue);
  const float32x4_t min   = vld1q_dup_f32(&zero);
  const float32x4_t scale = vld1q_dup_f32(&floatScale);

  float32x4_t vec_r, vec_g, vec_b;
  int32x4_t result;

  /* nothing to do for an empty buffer */
  if (!length)
    return;

  for (i = 0; i < length; i++) {
    /* setup for transforming the pixel */
    vec_r = vld1q_dup_f32(&igtbl_r[*src++]);
    vec_g = vld1q_dup_f32(&igtbl_g[*src++]);
    vec_b = vld1q_dup_f32(&igtbl_b[*src++]);
    alpha = *src++;

    /* gamma * matrix */
    vec_r = vmulq_f32(vec_r, mat0);
    vec_g = vmulq_f32(vec_g, mat1);
    vec_b = vmulq_f32(vec_b, mat2);

    /* crunch, crunch, crunch */
    vec_r = vaddq_f32(vec_r, vaddq_f32(vec_g, vec_b));
    vec_r = vmaxq_f32(min, vec_r);
    vec_r = vminq_f32(max, vec_r);
    result = vcvtq_s32_f32(vmulq_f32(vec_r, scale));

    /* use calc'd indices to output RGB values */
    *dest++ = otdata_r[vgetq_lane_s32(result, 0)];
    *dest++ = otdata_g[vgetq_lane_s32(result, 1)];
    *dest++ = otdata_b[vgetq_lane_s32(result, 2)];
    *dest++ = alpha;
  }
}
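
Each loop iteration above boils down to a few scalar operations per channel. The following is a minimal scalar sketch of the same per-pixel transform; the helper name and parameter list are hypothetical, and zero, clampMaxValue, and floatScale stand for the file-scope constants the NEON code loads:

#include <stdint.h>

/* Scalar sketch of one pixel: input gamma LUT -> 3x4 matrix rows ->
 * clamp -> scale into the output gamma LUTs. */
static void transform_pixel_scalar(const float *igtbl_r, const float *igtbl_g,
                                   const float *igtbl_b, const float mat[3][4],
                                   const uint8_t *otdata_r, const uint8_t *otdata_g,
                                   const uint8_t *otdata_b, float zero,
                                   float clampMaxValue, float floatScale,
                                   const unsigned char *src, unsigned char *dest)
{
  /* input gamma lookup: 8-bit channel -> linear float */
  const float in[3] = { igtbl_r[src[0]], igtbl_g[src[1]], igtbl_b[src[2]] };
  const uint8_t *otdata[3] = { otdata_r, otdata_g, otdata_b };
  int L;

  for (L = 0; L < 3; L++) {
    /* lane L of the NEON accumulator: RGB dotted with matrix column L */
    float v = in[0] * mat[0][L] + in[1] * mat[1][L] + in[2] * mat[2][L];
    if (v < zero) v = zero;                   /* vmaxq_f32(min, vec) */
    if (v > clampMaxValue) v = clampMaxValue; /* vminq_f32(max, vec) */
    dest[L] = otdata[L][(int32_t)(v * floatScale)]; /* output gamma LUT */
  }
  dest[3] = src[3]; /* alpha passes through unchanged */
}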
Example 2
/* s32x4 mm mul */
void mw_neon_mm_mul_s32x4(int * A, int Row, int T, int * B, int Col, int * C)
{
	int i, k, j;

	int32x4_t neon_b, neon_c;
	int32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	int32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{

		for (k = 0; k < Col; k+=1)
		{
			neon_c = vmovq_n_s32(0);

			for (j = 0; j < T; j+=4)
			{

				/* column-major layout: element (i, j) of A (Row x T) lives at
				   A[j * Row + i]; the four loads below fetch a 4x4 block of A
				   (rows i..i+3, columns j..j+3) */
				int a_idx = j * Row + i;

				neon_a0 = vld1q_s32(A + a_idx);
				a_idx += Row;
				neon_a1 = vld1q_s32(A + a_idx);
				a_idx += Row;
				neon_a2 = vld1q_s32(A + a_idx);
				a_idx += Row;
				neon_a3 = vld1q_s32(A + a_idx);

				/* element (j, k) of B (T x Col) lives at B[k * T + j] */
				neon_b = vld1q_s32(B + k * T + j);
				neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0));
				neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1));
				neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2));
				neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3));

				neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c);
				neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c);

			}

			/* store the accumulated 4-element chunk of column k of C */
			vst1q_s32(C + k * Row + i, neon_c);
		}
	}
}
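
Both matrix dimensions are assumed to be multiples of 4 (the loops carry no scalar tail). A minimal, hypothetical driver for the routine, using the column-major layout described in the comments, might look like this:

#include <stdio.h>

void mw_neon_mm_mul_s32x4(int *A, int Row, int T, int *B, int Col, int *C);

int main(void)
{
	/* 4x4 column-major matrices: A is the identity, so C should equal B */
	int A[16] = { 1,0,0,0,  0,1,0,0,  0,0,1,0,  0,0,0,1 };
	int B[16], C[16];
	int i;

	for (i = 0; i < 16; i++)
		B[i] = i;

	mw_neon_mm_mul_s32x4(A, 4, 4, B, 4, C);

	for (i = 0; i < 16; i++)
		printf("%d ", C[i]);	/* expect 0 1 2 ... 15 */
	printf("\n");
	return 0;
}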
Example 3
#include <arm_neon.h>

/* Compile-only smoke test: arg0_int32x4_t is never initialized; the call
   only checks that the intrinsic and its constant lane index (0..3 for
   int32x4_t) are accepted by the compiler. */
void test_vgetQ_lanes32 (void)
{
  int32_t out_int32_t;
  int32x4_t arg0_int32x4_t;

  out_int32_t = vgetq_lane_s32 (arg0_int32x4_t, 1);
}
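
A runnable counterpart (hypothetical, not part of any testsuite): the lane index must be an integer constant expression, 0..3 for an int32x4_t, and the selected element is returned as a plain int32_t.

#include <stdio.h>
#include <arm_neon.h>

int main(void)
{
  int32_t data[4] = { 10, 20, 30, 40 };
  int32x4_t v = vld1q_s32(data);   /* load all four lanes */

  /* the lane index must be a compile-time constant */
  printf("lane 1 = %d\n", vgetq_lane_s32(v, 1));   /* prints: lane 1 = 20 */
  return 0;
}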
Example 4
/* s32x4 mv mul */
void mw_neon_mv_mul_s32x4(int * A, int Row, int T, int * B, int * C)
{
	int i = 0;
	int k = 0;

	int32x4_t neon_b, neon_c;
	int32x4_t neon_a0, neon_a1, neon_a2, neon_a3;
	int32x4_t neon_b0, neon_b1, neon_b2, neon_b3;

	for (i = 0; i < Row; i+=4)
	{
		neon_c = vmovq_n_s32(0);

		for (k = 0; k < T; k+=4)
		{
			/* column-major layout: element (i, k) of A (Row x T) lives at
			   A[k * Row + i] */
			int j = k * Row + i;

			neon_a0 = vld1q_s32(A + j);
			j+=Row;
			neon_a1 = vld1q_s32(A + j);
			j+=Row;
			neon_a2 = vld1q_s32(A + j);
			j+=Row;
			neon_a3 = vld1q_s32(A + j);

			neon_b = vld1q_s32(B + k);
			neon_b0 = vdupq_n_s32(vgetq_lane_s32(neon_b, 0));
			neon_b1 = vdupq_n_s32(vgetq_lane_s32(neon_b, 1));
			neon_b2 = vdupq_n_s32(vgetq_lane_s32(neon_b, 2));
			neon_b3 = vdupq_n_s32(vgetq_lane_s32(neon_b, 3));

			neon_c = vaddq_s32(vmulq_s32(neon_a0, neon_b0), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a1, neon_b1), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a2, neon_b2), neon_c);
			neon_c = vaddq_s32(vmulq_s32(neon_a3, neon_b3), neon_c);

		}

		vst1q_s32(C + i, neon_c);
	}
}
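
A plain-C reference implementation is handy for validating the vector version. This sketch (hypothetical name) assumes the same conventions as above: A is column-major with leading dimension Row, B is a T-element vector, and Row and T are multiples of 4 so the NEON loops need no tail handling.

/* C[i] = sum over k of A(i,k) * B[k], with element (i,k) of A stored
 * at A[k * Row + i] (column-major) */
void mw_neon_mv_mul_s32x4_ref(const int *A, int Row, int T, const int *B, int *C)
{
	int i, k;

	for (i = 0; i < Row; i++) {
		int acc = 0;
		for (k = 0; k < T; k++)
			acc += A[k * Row + i] * B[k];
		C[i] = acc;
	}
}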
Example 5
int64_t test_vgetq_lane_s32(int32x4_t v1) {
  // CHECK: test_vgetq_lane_s32
  return vgetq_lane_s32(v1, 2);
  // CHECK: smov {{x[0-9]+}}, {{v[0-9]+}}.s[2]
}
Example 6
int32_t test_vgetq_lane_s32(int32x4_t a) {
  // CHECK-LABEL: test_vgetq_lane_s32:
  // CHECK-NEXT:  mov.s  w0, v0[3]
  // CHECK-NEXT:  ret
  return vgetq_lane_s32(a, 3);
}
Example 7
// CHECK-LABEL: define i32 @test_vgetq_lane_s32(<4 x i32> %a) #0 {
// CHECK:   [[TMP0:%.*]] = bitcast <4 x i32> %a to <16 x i8>
// CHECK:   [[TMP1:%.*]] = bitcast <16 x i8> [[TMP0]] to <4 x i32>
// CHECK:   [[VGETQ_LANE:%.*]] = extractelement <4 x i32> [[TMP1]], i32 3
// CHECK:   ret i32 [[VGETQ_LANE]]
int32_t test_vgetq_lane_s32(int32x4_t a) {
  return vgetq_lane_s32(a, 3);
}
Example 8
  inline ResultType operator()(Iterator1 a, Iterator2 b, size_t size) const
  {
    ResultType result = 0;
#if (defined __GNUC__ || defined __clang__) && defined USE_SSE
#ifdef __ARM_NEON__
    {
      uint32x4_t bits = vmovq_n_u32(0);
      for (size_t i = 0; i < size; i += 16) {
        uint8x16_t A_vec = vld1q_u8 (a + i);
        uint8x16_t B_vec = vld1q_u8 (b + i);
        uint8x16_t AxorB = veorq_u8 (A_vec, B_vec);
        uint8x16_t bitsSet = vcntq_u8 (AxorB);
        uint16x8_t bitSet8 = vpaddlq_u8 (bitsSet);
        uint32x4_t bitSet4 = vpaddlq_u16 (bitSet8);
        bits = vaddq_u32(bits, bitSet4);
      }
      uint64x2_t bitSet2 = vpaddlq_u32 (bits);
      /* on a little-endian target, s32 lanes 0 and 2 are the low words of
         the two u64 partial sums; the totals fit in 32 bits for any
         realistic descriptor length */
      result = vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2), 0);
      result += vgetq_lane_s32 (vreinterpretq_s32_u64(bitSet2), 2);
    }
#else
    {
      // For portability, process the buffers as unsigned long long words and
      // count the differing bits with __builtin_popcountll.
      typedef unsigned long long pop_t;
      const size_t modulo = size % sizeof(pop_t);
      const pop_t* a2 = reinterpret_cast<const pop_t*> (a);
      const pop_t* b2 = reinterpret_cast<const pop_t*> (b);
      const pop_t* a2_end = a2 + (size / sizeof(pop_t));

      for (; a2 != a2_end; ++a2, ++b2) result += __builtin_popcountll((*a2) ^ (*b2));

      if (modulo) {
        // size is not divisible by sizeof(pop_t): copy the trailing bytes
        // into zero-initialized words so the leftover bits can be counted
        pop_t a_final = 0, b_final = 0;
        memcpy(&a_final, a2, modulo);
        memcpy(&b_final, b2, modulo);
        result += __builtin_popcountll(a_final ^ b_final);
      }
    }
#endif //NEON
    return result;
#endif
#ifdef PLATFORM_64_BIT
    if(size%64 == 0)
    {
      const uint64_t* pa = reinterpret_cast<const uint64_t*>(a);
      const uint64_t* pb = reinterpret_cast<const uint64_t*>(b);
      size /= (sizeof(uint64_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt64(*pa ^ *pb);
      }
    }
    else
    {
      const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
      const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
      size /= (sizeof(uint32_t)/sizeof(unsigned char));
      for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
        result += popcnt32(*pa ^ *pb);
      }
    }
#else
    const uint32_t* pa = reinterpret_cast<const uint32_t*>(a);
    const uint32_t* pb = reinterpret_cast<const uint32_t*>(b);
    size /= (sizeof(uint32_t)/sizeof(unsigned char));
    for(size_t i = 0; i < size; ++i, ++pa, ++pb ) {
      result += popcnt32(*pa ^ *pb);
    }
#endif
    return result;
  }