Code Example #1
File: LibSSE.cpp Project: cpalmann/s2p
__m256 modulus_256(
  const __m256& x,
  const __m256& y) {

  //! If x is negative, add a multiple of y so that z becomes non-negative
  __m256 z = x;
  __m256 n = _mm256_round_ps((-z) / y, _MM_FROUND_TO_ZERO) + _mm256_set1_ps(1.f);
  __m256 mask = _mm256_cmp_ps(z, _mm256_set1_ps(0.f), _CMP_LT_OS);
  z = applyMask256_ps(mask, z + n * y, z);

  //! Truncated modulo: z - trunc(z / y) * y
  n = _mm256_round_ps(z / y, _MM_FROUND_TO_ZERO);
  return z - n * y;
}
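For reference, modulus_256 computes a non-negative remainder: negative inputs are first shifted into positive range by adding a multiple of y, then reduced with a truncated modulo (applyMask256_ps is the project's blend helper). A minimal scalar sketch of the same logic, assuming float inputs and y > 0 (illustrative, not project code):

#include <math.h>

float modulus_scalar(float x, float y) {
  if (x < 0.f)
    x += (truncf(-x / y) + 1.f) * y;  /* shift negative x into positive range */
  return x - truncf(x / y) * y;       /* truncated modulo: x - trunc(x/y)*y */
}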
Code Example #2
File: LibSSE.cpp Project: cpalmann/s2p
__m256 ori_to_bin_256(
  const __m256& ori,
  const int nbins) {

  //! For convenience
  const __m256 x2PI  = _mm256_set1_ps(2 * M_PI);
  const __m256 xbins = _mm256_set1_ps(nbins);

  //! Make the angle positive (add 2*PI where it is negative)
  const __m256 mask = _mm256_cmp_ps(ori, _mm256_setzero_ps(), _CMP_LT_OS);

  //! Scale the angle to bin units and round toward zero
  const __m256 val = _mm256_round_ps(applyMask256_ps(mask, ori + x2PI, ori)
    / x2PI * xbins + _mm256_set1_ps(0.5f), _MM_FROUND_TO_ZERO);

  //! Return it modulo nbins
  return val - xbins * _mm256_round_ps(val / xbins, _MM_FROUND_TO_ZERO);
}
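The same routine in scalar form may clarify the steps: the angle is wrapped into [0, 2*PI), scaled to bin units, rounded, and wrapped modulo nbins. A sketch under the same assumptions (illustrative only, not project code):

#include <math.h>

int ori_to_bin_scalar(float ori, int nbins) {
  if (ori < 0.f)
    ori += 2.f * (float)M_PI;                     /* make the angle positive */
  float val = truncf(ori / (2.f * (float)M_PI) * nbins + 0.5f);
  return (int)(val - nbins * truncf(val / (float)nbins));  /* wrap into [0, nbins) */
}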
Code Example #3
File: testimm-3.c Project: 0day-ci/gcc
void
test4bit (void)
{
  d1 = _mm_round_pd (d2, k4);		  /* { dg-error "the last argument must be a 4-bit immediate" } */
  d1 = _mm_round_sd (d2, d3, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ps (a2, k4);		  /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ss (a2, a2, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_blend_ps (a2, a3, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_blend_pd (e2, e3, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_round_pd (e2, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  b1 = _mm256_round_ps (b2, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
}
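These dg-error tests document that the rounding-control operand of _mm_round_* / _mm256_round_* must be a 4-bit compile-time immediate, not a variable like k4. For contrast, a valid call combines a rounding mode with the exception-suppression flag as a constant (a sketch using the standard macros from <immintrin.h>):

__m256 round_nearest(__m256 v)
{
  return _mm256_round_ps(v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}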
Code Example #4
File: LibSSE.cpp Project: cpalmann/s2p
__m256 exp_256(
  const __m256& x) {

  //! Clip the value
  __m256 y = _mm256_max_ps(_mm256_min_ps(x, _mm256_set1_ps(88.3762626647949f)),
                                            _mm256_set1_ps(-88.3762626647949f));

  //! Express exp(x) as exp(g + n * log(2))
  __m256 fx = y * _mm256_set1_ps(1.44269504088896341) + _mm256_set1_ps(0.5f);

  //! Truncate toward zero (the floor is recovered below for negative values)
  const __m256 tmp = _mm256_round_ps(fx, _MM_FROUND_TO_ZERO);

  //! If the truncation rounded up (fx was negative), subtract 1 to get the floor
  const __m256 mask = _mm256_and_ps(_mm256_cmp_ps(tmp, fx, _CMP_GT_OS),
                                    _mm256_set1_ps(1.f));
  fx = tmp - mask;

  y -= fx * _mm256_set1_ps(0.693359375 - 2.12194440e-4);
  const __m256 z = y * y;


  const __m256 t = (((((_mm256_set1_ps(1.9875691500E-4)  * y +
                        _mm256_set1_ps(1.3981999507E-3)) * y +
                        _mm256_set1_ps(8.3334519073E-3)) * y +
                        _mm256_set1_ps(4.1665795894E-2)) * y +
                        _mm256_set1_ps(1.6666665459E-1)) * y +
                        _mm256_set1_ps(5.0000001201E-1)) * z + y +
                        _mm256_set1_ps(1.f);

  //! Build 2^n (split into two SSE halves, since the 256-bit integer
  //! instructions require AVX2 and aren't available here)
  const __m128i emm0 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_castps256_ps128(fx)), _mm_set1_epi32(0x7f));
  const __m128i emm1 = _mm_add_epi32(_mm_cvttps_epi32(_mm256_extractf128_ps(fx, 1)), _mm_set1_epi32(0x7f));

  fx = _mm256_castps128_ps256(_mm_castsi128_ps(_mm_slli_epi32(emm0, 23)));
  fx = _mm256_insertf128_ps(fx, _mm_castsi128_ps(_mm_slli_epi32(emm1, 23)), 1);

  //! Return the result
  return t * fx;
}
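The final step builds 2^n through the IEEE-754 single-precision layout: adding the exponent bias 0x7f (127) to n and shifting the sum into the 23-bit mantissa field produces exactly the bit pattern of 2^n. A scalar sketch of that trick (illustrative):

#include <stdint.h>
#include <string.h>

float pow2_scalar(int n) {                      /* n roughly in [-126, 127] */
  uint32_t bits = (uint32_t)(n + 127) << 23;    /* biased exponent, zero mantissa */
  float result;
  memcpy(&result, &bits, sizeof result);        /* reinterpret the bits as a float */
  return result;
}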
Code Example #5
File: dsp.c Project: sveinungf/INF5063-Codec63-x86
// Rounding half away from zero (equivalent to round() from math.h)
// __m256 contains 8 floats, but to simplify the examples, only 4 will be shown
// Initial values to be used in the examples:
// [-12.49  -0.5   1.5   3.7]
static __m256 c63_mm256_roundhalfawayfromzero_ps(const __m256 initial)
{
	const __m256 sign_mask = _mm256_set1_ps(-0.f);
	const __m256 one_half = _mm256_set1_ps(0.5f);
	const __m256 all_zeros = _mm256_setzero_ps();
	const __m256 pos_one = _mm256_set1_ps(1.f);
	const __m256 neg_one = _mm256_set1_ps(-1.f);

	// Creates a mask based on the sign of the floats, true for negative floats
	// Example: [true   true   false   false]
	__m256 less_than_zero = _mm256_cmp_ps(initial, all_zeros, _CMP_LT_OQ);

	// Returns the integer part of the floats
	// Example: [-12.0   -0.0   1.0   3.0]
	__m256 without_fraction = _mm256_round_ps(initial, (_MM_FROUND_TO_ZERO | _MM_FROUND_NO_EXC));

	// Returns the fraction part of the floats
	// Example: [-0.49   -0.5   0.5   0.7]
	__m256 fraction = _mm256_sub_ps(initial, without_fraction);

	// Absolute values of the fractions
	// Example: [0.49   0.5   0.5   0.7]
	__m256 fraction_abs = _mm256_andnot_ps(sign_mask, fraction);

	// Compares abs(fractions) to 0.5, true if lower
	// Example: [true   false   false   false]
	__m256 less_than_one_half = _mm256_cmp_ps(fraction_abs, one_half, _CMP_LT_OQ);

	// Blends 1.0 and -1.0 depending on the initial sign of the floats
	// Example: [-1.0   -1.0   1.0   1.0]
	__m256 signed_ones = _mm256_blendv_ps(pos_one, neg_one, less_than_zero);

	// Blends the previous result with zeros depending on the fractions that are lower than 0.5
	// Example: [0.0   -1.0   1.0   1.0]
	__m256 to_add = _mm256_blendv_ps(signed_ones, all_zeros, less_than_one_half);

	// Adds the previous result to the floats without fractions
	// Example: [-12.0   -1.0   2.0   4.0]
	return _mm256_add_ps(without_fraction, to_add);
}
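A short usage sketch, assuming the function above is in scope and AVX is enabled (the expected output follows the worked example in the comments):

#include <stdio.h>

int main(void)
{
	__m256 v = _mm256_setr_ps(-12.49f, -0.5f, 1.5f, 3.7f, 0.f, 0.f, 0.f, 0.f);
	float out[8];
	_mm256_storeu_ps(out, c63_mm256_roundhalfawayfromzero_ps(v));
	printf("%.1f %.1f %.1f %.1f\n", out[0], out[1], out[2], out[3]);  /* -12.0 -1.0 2.0 4.0 */
	return 0;
}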
Code Example #6
File: avx_vectorization.hpp Project: wichtounet/etl
 /*!
  * \brief Round up each value of the vector and return the result
  */
 ETL_STATIC_INLINE(avx_simd_float) round_up(avx_simd_float x) {
     return _mm256_round_ps(x.value, (_MM_FROUND_TO_POS_INF | _MM_FROUND_NO_EXC));
 }
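For comparison, rounding toward negative infinity (floor) follows the same pattern with _MM_FROUND_TO_NEG_INF; a sketch in the library's style (round_down is an assumed name here, not necessarily part of etl):

 /*!
  * \brief Round down each value of the vector and return the result
  */
 ETL_STATIC_INLINE(avx_simd_float) round_down(avx_simd_float x) {
     return _mm256_round_ps(x.value, (_MM_FROUND_TO_NEG_INF | _MM_FROUND_NO_EXC));
 }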
Code Example #7
static __m128i cielabv (union hvrgbpix rgb)
{
    __m128 xvxyz[2] = { _mm_set1_ps(0.5f), _mm_set1_ps(0.5f) }; /* start at 0.5 so the later truncation rounds to nearest */

    __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0);
    __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0);
    __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0);
    __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]);
    __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]);
    __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]);
    __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]);
    __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]);
    __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]);

    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v));

    xvxyz[0] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO)));
    xvxyz[1] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO)));
    __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]);
    __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]);
#ifdef __AVX__
    __m256 vlab,
           vxyz = { cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    0,
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    0},
           vxyz2 =  {0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                     0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]};

    vlab = _mm256_sub_ps(vxyz,vxyz2);
    vlab = _mm256_mul_ps(vlab, _mm256_setr_ps(116,500,200,0,116,500,200,0));
    vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0));
    vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64));
    vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO);
    __m256i vlabi = _mm256_cvtps_epi32(vlab);
    return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), ((__m128i*)&vlabi)[1]);
#else
    __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                           0};
    __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                           0};

    vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3)));
    vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0));
    vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0));
    vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64));
    vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO);

    vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3)));
    vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0));
    vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0));
    vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64));
    vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO);

    return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh));
#endif
}
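The shuffle/subtract/multiply sequence implements the standard CIELAB mapping from the cube-rooted XYZ values (looked up in cielab_cbrt): L* = 116*fy - 16, a* = 500*(fx - fy), b* = 200*(fy - fz), each scaled by 64 and truncated. A scalar sketch of that step (illustrative, with fx/fy/fz already cube-rooted and normalized):

void lab_from_cbrt(float fx, float fy, float fz, short lab[3]) {
    lab[0] = (short)(64.f * (116.f * fy - 16.f));  /* L*, scaled by 64 */
    lab[1] = (short)(64.f * 500.f * (fx - fy));    /* a* */
    lab[2] = (short)(64.f * 200.f * (fy - fz));    /* b* */
}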
Code Example #8
File: simd_dct.c Project: LucasSloan/JPEG
void run_dct(int width, int height, float *quant, float *input, int32_t *output)
{
  float acosvals[8][8];

  /* Calculating cosines is expensive, and there
   * are only 64 cosines that need to be calculated
   * so precompute them and cache. */
  for (int i = 0; i < 8; i++)
  {
    for (int j = 0; j < 8; j++)
    {
      if (j == 0)
      {
        acosvals[i][j] = sqrt(1.0 / 8.0) * cos(PI / 8.0 * (i + 0.5) * j);
      }
      else
      {
        acosvals[i][j] = 0.5 * cos(PI / 8.0 * (i + 0.5) * j);
      }
    }
  }

/* Separate the parallel from the for, so each thread gets its
   * own copy of the buffers and variables. */
#pragma omp parallel
  {
    float avload[8] = {0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5};
    avload[0] = sqrt(1.0 / 8.0);
    __m256 row0, row1, row2, row3, row4, row5, row6, row7;
    __m256 loaderlow, loaderhigh;
    __m256 temp;
    __m256 minus128 = _mm256_set1_ps(-128.0);
    __m256 avxcosloader, avxcos;
    float avxcosmover;
    __m256i integer;

/* The DCT breaks the image into 8 by 8 blocks and then
   * transforms them into color frequencies. */
#pragma omp for
    for (int brow = 0; brow < height / 8; brow++)
    {
      for (int bcol = 0; bcol < width / 8; bcol++)
      {
        int head_pointer = bcol * 8 + brow * 8 * width;
        row0 = _mm256_setzero_ps();
        row1 = _mm256_setzero_ps();
        row2 = _mm256_setzero_ps();
        row3 = _mm256_setzero_ps();
        row4 = _mm256_setzero_ps();
        row5 = _mm256_setzero_ps();
        row6 = _mm256_setzero_ps();
        row7 = _mm256_setzero_ps();

        /* This pair of loops uses AVX instructions to add the frequency
       * component from each pixel to all of the buckets at once.  This lets
       * us do the DCT on a block in 64 loop iterations rather than
       * 64 * 64 iterations (all 64 pixels affect all 64 frequencies). */
        for (int x = 0; x < 8; x++)
        {
          for (int y = 0; y < 4; y++)
          {
            loaderlow = _mm256_broadcast_ss(&input[head_pointer + x + (y * width)]);
            loaderlow = _mm256_add_ps(loaderlow, minus128);
            loaderhigh = _mm256_broadcast_ss(&input[head_pointer + x + ((7 - y) * width)]);
            loaderhigh = _mm256_add_ps(loaderhigh, minus128);

            avxcos = _mm256_loadu_ps(&acosvals[x][0]);
            loaderlow = _mm256_mul_ps(loaderlow, avxcos);
            loaderhigh = _mm256_mul_ps(loaderhigh, avxcos);

            avxcosloader = _mm256_loadu_ps(&acosvals[y][0]);

            avxcosmover = avxcosloader[0];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row0 = _mm256_add_ps(row0, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row0 = _mm256_add_ps(row0, temp);

            avxcosmover = avxcosloader[1];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row1 = _mm256_add_ps(row1, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row1 = _mm256_sub_ps(row1, temp);

            avxcosmover = avxcosloader[2];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row2 = _mm256_add_ps(row2, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row2 = _mm256_add_ps(row2, temp);

            avxcosmover = avxcosloader[3];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row3 = _mm256_add_ps(row3, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row3 = _mm256_sub_ps(row3, temp);

            avxcosmover = avxcosloader[4];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row4 = _mm256_add_ps(row4, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row4 = _mm256_add_ps(row4, temp);

            avxcosmover = avxcosloader[5];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row5 = _mm256_add_ps(row5, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row5 = _mm256_sub_ps(row5, temp);

            avxcosmover = avxcosloader[6];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row6 = _mm256_add_ps(row6, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row6 = _mm256_add_ps(row6, temp);

            avxcosmover = avxcosloader[7];
            avxcos = _mm256_set1_ps(avxcosmover);
            temp = _mm256_mul_ps(loaderlow, avxcos);
            row7 = _mm256_add_ps(row7, temp);
            temp = _mm256_mul_ps(loaderhigh, avxcos);
            row7 = _mm256_sub_ps(row7, temp);
          }
        }

        /* Each frequency stored as a float needs to be divided by
       * the quantization value, then rounded to the nearest integer.
       * Also changes the order of the values from pixel order to
       * each 8 by 8 block stored one after another. */
        temp = _mm256_loadu_ps(&quant[0]);
        row0 = _mm256_div_ps(row0, temp);
        row0 = _mm256_round_ps(row0, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row0);
        _mm256_storeu_si256((__m256i *)(output + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[8]);
        row1 = _mm256_div_ps(row1, temp);
        row1 = _mm256_round_ps(row1, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row1);
        _mm256_storeu_si256((__m256i *)(output + 8 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[16]);
        row2 = _mm256_div_ps(row2, temp);
        row2 = _mm256_round_ps(row2, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row2);
        _mm256_storeu_si256((__m256i *)(output + 16 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[24]);
        row3 = _mm256_div_ps(row3, temp);
        row3 = _mm256_round_ps(row3, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row3);
        _mm256_storeu_si256((__m256i *)(output + 24 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[32]);
        row4 = _mm256_div_ps(row4, temp);
        row4 = _mm256_round_ps(row4, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row4);
        _mm256_storeu_si256((__m256i *)(output + 32 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[40]);
        row5 = _mm256_div_ps(row5, temp);
        row5 = _mm256_round_ps(row5, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row5);
        _mm256_storeu_si256((__m256i *)(output + 40 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[48]);
        row6 = _mm256_div_ps(row6, temp);
        row6 = _mm256_round_ps(row6, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row6);
        _mm256_storeu_si256((__m256i *)(output + 48 + (bcol + brow * (width / 8)) * 64), integer);

        temp = _mm256_loadu_ps(&quant[56]);
        row7 = _mm256_div_ps(row7, temp);
        row7 = _mm256_round_ps(row7, _MM_FROUND_TO_NEAREST_INT);
        integer = _mm256_cvttps_epi32(row7);
        _mm256_storeu_si256((__m256i *)(output + 56 + (bcol + brow * (width / 8)) * 64), integer);
      }
    }
  }
}
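A minimal driver sketch for run_dct, assuming width and height are multiples of 8 and the declarations above (illustrative values only):

#include <stdint.h>

int main(void)
{
  enum { W = 8, H = 8 };
  float quant[64], input[W * H];
  int32_t output[W * H];
  for (int i = 0; i < 64; i++) quant[i] = 16.f;      /* flat quantization table */
  for (int i = 0; i < W * H; i++) input[i] = 128.f;  /* uniform mid-gray block */
  run_dct(W, H, quant, input, output);
  return 0;
}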