Example no. 1
__m128 modulus_128(
  const __m128& x,
  const __m128& y) {

  __m128 z = x;
  __m128 n = _mm_round_ps((-z) / y, _MM_FROUND_TO_ZERO) + _mm_set1_ps(1.f);
  __m128 mask = _mm_cmplt_ps(z, _mm_set1_ps(0.f));
  z = applyMask_ps(mask, z + n * y, z);

  n = _mm_round_ps(z / y, _MM_FROUND_TO_ZERO);
  return z - n * y;
}
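For reference, here is a scalar sketch of what the vector routine above computes, assuming `applyMask_ps(mask, a, b)` is a select helper (not shown in the snippet) that returns `a` where the mask is all-ones and `b` elsewhere; `modulus_scalar` is a name used only for illustration.

#include <cmath>

// Scalar sketch of the positive modulus computed by modulus_128 above.
static float modulus_scalar(float x, float y) {
  float z = x;
  if (z < 0.0f) {
    // Shift z into the non-negative range by an integer multiple of y,
    // mirroring the masked branch in the vector code.
    float n = std::trunc(-z / y) + 1.0f;
    z += n * y;
  }
  // Subtract the largest integer multiple of y not exceeding z.
  return z - std::trunc(z / y) * y;
}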
Example no. 2
__m128 exp_128(
  const __m128& x) {

  //! Clip the value
  __m128 y = _mm_max_ps(_mm_min_ps(x, _mm_set1_ps(88.3762626647949f)),
                                      _mm_set1_ps(-88.3762626647949f));

  //! Express exp(x) as exp(g + n * log(2))
  __m128 fx = y * _mm_set1_ps(1.44269504088896341) + _mm_set1_ps(0.5f);

  //! Floor
  const __m128 tmp = _mm_round_ps(fx, _MM_FROUND_TO_ZERO);

  //! If greater, subtract 1
  const __m128 mask = _mm_and_ps(_mm_cmpgt_ps(tmp, fx), _mm_set1_ps(1.f));
  fx = tmp - mask;

  y -= fx * _mm_set1_ps(0.693359375 - 2.12194440e-4);
  const __m128 z = y * y;


  const __m128 t = (((((_mm_set1_ps(1.9875691500E-4)  * y +
                        _mm_set1_ps(1.3981999507E-3)) * y +
                        _mm_set1_ps(8.3334519073E-3)) * y +
                        _mm_set1_ps(4.1665795894E-2)) * y +
                        _mm_set1_ps(1.6666665459E-1)) * y +
                        _mm_set1_ps(5.0000001201E-1)) * z + y + _mm_set1_ps(1.f);

  //! Build 2^n
  const __m128i emm0 = _mm_add_epi32(_mm_cvttps_epi32(fx), _mm_set1_epi32(0x7f));

  //! Return the result
  return t * _mm_castsi128_ps(_mm_slli_epi32(emm0, 23));
}
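The snippet relies on the classic range reduction exp(x) = 2^n * exp(g), with n = round(x * log2(e)) and g = x - n * ln(2), where the polynomial approximates exp(g) on the reduced interval. A scalar sketch of the same scheme, with the coefficients taken from the code above (`exp_scalar` is a name chosen here for illustration):

#include <cmath>
#include <cstdint>
#include <cstring>

// Scalar sketch of the range reduction used by exp_128 above.
static float exp_scalar(float x) {
  // Clip to the range where the result fits in a float.
  x = std::fmax(std::fmin(x, 88.3762626647949f), -88.3762626647949f);
  // n = round(x * log2(e)); g = x - n * ln(2)
  float n = std::floor(x * 1.44269504088896341f + 0.5f);
  float g = x - n * (0.693359375f - 2.12194440e-4f);
  float z = g * g;
  float t = (((((1.9875691500E-4f  * g + 1.3981999507E-3f) * g +
                8.3334519073E-3f)  * g + 4.1665795894E-2f) * g +
                1.6666665459E-1f)  * g + 5.0000001201E-1f) * z + g + 1.f;
  // Build 2^n by writing (n + 127) into the exponent field of a float.
  int32_t bits = (static_cast<int32_t>(n) + 127) << 23;
  float two_n;
  std::memcpy(&two_n, &bits, sizeof(two_n));
  return t * two_n;
}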
Example no. 3
__m128 ori_to_bin_128(
  const __m128& ori,
  const int nbins) {

  //! For convenience
  const __m128 x2PI = _mm_set1_ps(2 * M_PI);
  const __m128 xbins = _mm_set1_ps(nbins);

  //! Get it positive
  const __m128 mask = _mm_cmplt_ps(ori, _mm_setzero_ps());

  //! Get the value
  const __m128 val = _mm_round_ps(applyMask_ps(mask, ori + x2PI, ori)
    / x2PI * xbins + _mm_set1_ps(0.5f), _MM_FROUND_TO_ZERO);

  //! Return the modulo of it
  return val - xbins * _mm_round_ps(val / xbins, _MM_FROUND_TO_ZERO);
}
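A scalar sketch of the same binning: wrap the angle into [0, 2*pi), convert it to bin units, round, then reduce the bin index modulo nbins (`ori_to_bin_scalar` is an illustrative name):

#include <cmath>

// Scalar sketch: wrap the angle, scale to bin units, round, reduce mod nbins.
static float ori_to_bin_scalar(float ori, int nbins) {
  const float two_pi = 2.0f * 3.14159265358979323846f;
  if (ori < 0.0f) ori += two_pi;
  float val = std::trunc(ori / two_pi * nbins + 0.5f);
  return val - nbins * std::trunc(val / nbins);
}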
Example no. 4
void
test4bit (void)
{
  d1 = _mm_round_pd (d2, k4);		  /* { dg-error "the last argument must be a 4-bit immediate" } */
  d1 = _mm_round_sd (d2, d3, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ps (a2, k4);		  /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ss (a2, a2, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_blend_ps (a2, a3, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_blend_pd (e2, e3, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_round_pd (e2, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
  b1 = _mm256_round_ps (b2, k4);	  /* { dg-error "the last argument must be a 4-bit immediate" } */
}
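The dg-error test above checks that GCC rejects a rounding-control argument that is not a compile-time 4-bit immediate (k4 presumably being an ordinary variable in the test harness). A well-formed call passes one of the _MM_FROUND_* constants from <smmintrin.h>, for example:

#include <smmintrin.h>

/* A well-formed call: the rounding control is a literal 4-bit immediate,
   here round-to-nearest with precision exceptions suppressed. */
__m128 round_nearest (__m128 v)
{
  return _mm_round_ps (v, _MM_FROUND_TO_NEAREST_INT | _MM_FROUND_NO_EXC);
}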
Example no. 5
int dist_mic(const float* xyz, const int* pairs, const float* box_matrix,
             float* distance_out, float* displacement_out,
             const int n_frames, const int n_atoms, const int n_pairs) {
  /* Compute the distance/displacement between pairs of atoms in every frame
     of xyz following the minimum image convention in periodic boundary
     conditions.

    The computation follows scheme B.9 in Tuckerman, M. "Statistical
    Mechanics: Theory and Molecular Simulation", 2010.

     Parameters
     ----------
     xyz : array, shape=(n_frames, n_atoms, 3)
         Cartesian coordinates of the atoms in every frame, in contiguous C order.
     pairs : array, shape=(n_pairs, 2)
         The specific pairs of atoms whose distance you want to compute. A 2d
         array of pairs, in C order.
     box_matrix : array, shape=(n_frames, 3, 3)
          The box matrix for each frame, in contiguous C order; the loop
          below advances by nine floats per frame.
     distance_out : array, shape=(n_frames, n_pairs)
         Array where the distances between pairs will be stored, in contiguous
         C order.
     displacement_out : array, shape=(n_frames, n_pairs, 3), optional
         An optional return value: if you'd also like to save the displacement
         vectors between the pairs, you can pass a pointer here. If
         displacement_out is NULL, then this variable will not be saved back
         to memory.

     All of the arrays are assumed to be contiguous. This code will
     segfault if they're not.
  */
#ifndef __SSE4_1__
   /* Save the caller's rounding mode before forcing round-to-nearest,
      so it can be restored at the end of the function. */
   int rounding_mode = _MM_GET_ROUNDING_MODE();
   _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST);
#endif

  int i, j;
  int store_displacement = displacement_out == NULL ? 0 : 1;
  int store_distance = distance_out == NULL ? 0 : 1;

  __m128 r1, r2, s12, r12, s, r12_2;
  __m128 hinv[3];
  __m128 h[3];

  for (i = 0; i < n_frames; i++) {
    // Store the columns of the box matrix in three float4s. This format
    // is fast for matrix * vector product. See, for example, this S.O. question:
    // http://stackoverflow.com/questions/14967969/efficient-4x4-matrix-vector-multiplication-with-sse-horizontal-add-and-dot-prod
    h[0] = _mm_setr_ps(box_matrix[0], box_matrix[3], box_matrix[6], 0.0f);
    h[1] = _mm_setr_ps(box_matrix[1], box_matrix[4], box_matrix[7], 0.0f);
    h[2] = _mm_setr_ps(box_matrix[2], box_matrix[5], box_matrix[8], 0.0f);
    // Calculate the inverse of the box matrix, and also store it in the same
    // format.
    inverse33(box_matrix, hinv+0, hinv+1, hinv+2);

    for (j = 0; j < n_pairs; j++) {
      // Load the two vectors whose distance we want to compute
      r1 = load_float3(xyz + 3*pairs[2*j + 0]);
      r2 = load_float3(xyz + 3*pairs[2*j + 1]);
      r12 =  _mm_sub_ps(r2, r1);

      // s12 = INVERSE(H) * r12
      s12 = _mm_add_ps(_mm_add_ps(
         _mm_mul_ps(hinv[0], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(0,0,0,0))),
         _mm_mul_ps(hinv[1], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(1,1,1,1)))),
         _mm_mul_ps(hinv[2], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(2,2,2,2))));

      // s12 = s12 - NEAREST_INTEGER(s12)
#ifdef __SSE4_1__
      s12 = _mm_sub_ps(s12, _mm_round_ps(s12, _MM_FROUND_TO_NEAREST_INT));
#else
      s12 = _mm_sub_ps(s12, _mm_cvtepi32_ps(_mm_cvtps_epi32(s12)));
#endif

      r12 = _mm_add_ps(_mm_add_ps(
          _mm_mul_ps(h[0], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(0,0,0,0))),
          _mm_mul_ps(h[1], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(1,1,1,1)))),
          _mm_mul_ps(h[2], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(2,2,2,2))));

      if (store_displacement) {
        // store the two lower entries (x,y) in memory
        _mm_storel_pi((__m64*)(displacement_out), r12);
        displacement_out += 2;
        // swap high-low and then store the z entry in the memory
        _mm_store_ss(displacement_out++, _mm_movehl_ps(r12, r12));
      }
      if (store_distance) {
        // out = sqrt(sum(r12**2))
        r12_2 = _mm_mul_ps(r12, r12);
        s = _mm_hadd_ps(r12_2, r12_2);
        s = _mm_hadd_ps(s, s);
        s = _mm_sqrt_ps(s);
        _mm_store_ss(distance_out++, s);
      }
    }
    // advance to the next frame
    xyz += n_atoms*3;
    box_matrix += 9;
  }

#ifndef __SSE4_1__
   _MM_SET_ROUNDING_MODE(rounding_mode);
#endif
  return 1;
}
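The inner loop above applies scheme B.9 with three matrix-vector products per pair. A scalar sketch of the minimum-image step for a single displacement vector, assuming row-major 3x3 matrices H and Hinv (`minimum_image` is an illustrative name):

#include <math.h>

/* Scalar sketch: s = Hinv * r; s -= nearest_integer(s); r = H * s. */
static void minimum_image(const float H[9], const float Hinv[9], float r[3]) {
  float s[3];
  int i;
  for (i = 0; i < 3; i++)
    s[i] = Hinv[3*i+0]*r[0] + Hinv[3*i+1]*r[1] + Hinv[3*i+2]*r[2];
  for (i = 0; i < 3; i++)
    s[i] -= nearbyintf(s[i]);  /* same role as _mm_round_ps(..., _MM_FROUND_TO_NEAREST_INT) */
  for (i = 0; i < 3; i++)
    r[i] = H[3*i+0]*s[0] + H[3*i+1]*s[1] + H[3*i+2]*s[2];
}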
Example no. 6
void Permutohedral::init ( const float* feature, int feature_size, int N )
{
	// Compute the lattice coordinates for each feature [there is going to be a lot of magic here]
	N_ = N;
	d_ = feature_size;
	HashTable hash_table( d_, N_/**(d_+1)*/ );

	const int blocksize = sizeof(__m128) / sizeof(float);
	const __m128 invdplus1   = _mm_set1_ps( 1.0f / (d_+1) );
	const __m128 dplus1      = _mm_set1_ps( d_+1 );
	const __m128 Zero        = _mm_set1_ps( 0 );
	const __m128 One         = _mm_set1_ps( 1 );

	// Allocate the class memory
	if (offset_) delete [] offset_;
	offset_ = new int[ (d_+1)*(N_+16) ];
	memset( offset_, 0, (d_+1)*(N_+16)*sizeof(int) );

	if (barycentric_) delete [] barycentric_;
	barycentric_ = new float[ (d_+1)*(N_+16) ];
	memset( barycentric_, 0, (d_+1)*(N_+16)*sizeof(float) );

	// Allocate the local memory
	__m128 * scale_factor = (__m128*) _mm_malloc( (d_  )*sizeof(__m128) , 16 );
	__m128 * f            = (__m128*) _mm_malloc( (d_  )*sizeof(__m128) , 16 );
	__m128 * elevated     = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 );
	__m128 * rem0         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 );
	__m128 * rank         = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 );
	float * barycentric = new float[(d_+2)*blocksize];
	short * canonical = new short[(d_+1)*(d_+1)];
	short * key = new short[d_+1];

	// Compute the canonical simplex
	for( int i=0; i<=d_; i++ ){
		for( int j=0; j<=d_-i; j++ )
			canonical[i*(d_+1)+j] = i;
		for( int j=d_-i+1; j<=d_; j++ )
			canonical[i*(d_+1)+j] = i - (d_+1);
	}

	// Expected standard deviation of our filter (p.6 in [Adams etal 2010])
	float inv_std_dev = sqrt(2.0 / 3.0)*(d_+1);
	// Compute the diagonal part of E (p.5 in [Adams etal 2010])
	for( int i=0; i<d_; i++ )
		scale_factor[i] = _mm_set1_ps( 1.0f / sqrtf( float((i+2)*(i+1)) ) * inv_std_dev );

	// Setup the SSE rounding
#ifndef __SSE4_1__
	const unsigned int old_rounding = _mm_getcsr();
	_mm_setcsr( (old_rounding&~_MM_ROUND_MASK) | _MM_ROUND_NEAREST );
#endif

	// Compute the simplex each feature lies in
	for( int k=0; k<N_; k+=blocksize ){
		// Load the feature from memory
		float * ff = (float*)f;
		for( int j=0; j<d_; j++ )
			for( int i=0; i<blocksize; i++ )
				ff[ j*blocksize + i ] = k+i < N_ ? feature[ (k+i)*d_+j ] : 0.0;

		// Elevate the feature ( y = Ep, see p.5 in [Adams etal 2010])

		// sm contains the sum of 1..n of our feature vector
		__m128 sm = Zero;
		for( int j=d_; j>0; j-- ){
			__m128 cf = f[j-1]*scale_factor[j-1];
			elevated[j] = sm - _mm_set1_ps(j)*cf;
			sm += cf;
		}
		elevated[0] = sm;

		// Find the closest 0-colored simplex through rounding
		__m128 sum = Zero;
		for( int i=0; i<=d_; i++ ){
			__m128 v = invdplus1 * elevated[i];
#ifdef __SSE4_1__
			v = _mm_round_ps( v, _MM_FROUND_TO_NEAREST_INT );
#else
			v = _mm_cvtepi32_ps( _mm_cvtps_epi32( v ) );
#endif
			rem0[i] = v*dplus1;
			sum += v;
		}

		// Find the simplex we are in and store it in rank (where rank describes what position coordinate i has in the sorted order of the feature values)
		for( int i=0; i<=d_; i++ )
			rank[i] = Zero;
		for( int i=0; i<d_; i++ ){
			__m128 di = elevated[i] - rem0[i];
			for( int j=i+1; j<=d_; j++ ){
				__m128 dj = elevated[j] - rem0[j];
				__m128 c = _mm_and_ps( One, _mm_cmplt_ps( di, dj ) );
				rank[i] += c;
				rank[j] += One-c;
			}
		}

		// If the point doesn't lie on the plane (sum != 0) bring it back
		for( int i=0; i<=d_; i++ ){
			rank[i] += sum;
			__m128 add = _mm_and_ps( dplus1, _mm_cmplt_ps( rank[i], Zero ) );
			__m128 sub = _mm_and_ps( dplus1, _mm_cmpge_ps( rank[i], dplus1 ) );
			rank[i] += add-sub;
			rem0[i] += add-sub;
		}

		// Compute the barycentric coordinates (p.10 in [Adams etal 2010])
		for( int i=0; i<(d_+2)*blocksize; i++ )
			barycentric[ i ] = 0;
		for( int i=0; i<=d_; i++ ){
			__m128 v = (elevated[i] - rem0[i])*invdplus1;

			// Didn't figure out how to SSE this
			float * fv = (float*)&v;
			float * frank = (float*)&rank[i];
			for( int j=0; j<blocksize; j++ ){
				int p = d_-frank[j];
				barycentric[j*(d_+2)+p  ] += fv[j];
				barycentric[j*(d_+2)+p+1] -= fv[j];
			}
		}

		// The rest is not SSE'd
		for( int j=0; j<blocksize; j++ ){
			// Wrap around
			barycentric[j*(d_+2)+0]+= 1 + barycentric[j*(d_+2)+d_+1];

			float * frank = (float*)rank;
			float * frem0 = (float*)rem0;
			// Compute all vertices and their offset
			for( int remainder=0; remainder<=d_; remainder++ ){
				for( int i=0; i<d_; i++ ){
					key[i] = frem0[i*blocksize+j] + canonical[ remainder*(d_+1) + (int)frank[i*blocksize+j] ];
				}
				offset_[ (j+k)*(d_+1)+remainder ] = hash_table.find( key, true );
				barycentric_[ (j+k)*(d_+1)+remainder ] = barycentric[ j*(d_+2)+remainder ];
			}
		}
	}
	_mm_free( scale_factor );
	_mm_free( f );
	_mm_free( elevated );
	_mm_free( rem0 );
	_mm_free( rank );
	delete [] barycentric;
	delete [] canonical;
	delete [] key;

	// Reset the SSE rounding
#ifndef __SSE4_1__
	_mm_setcsr( old_rounding );
#endif

	// This is normally fast enough so no SSE needed here
	// Find the Neighbors of each lattice point

	// Get the number of vertices in the lattice
	M_ = hash_table.size();

	// Create the neighborhood structure
	if(blur_neighbors_) delete[] blur_neighbors_;
	blur_neighbors_ = new Neighbors[ (d_+1)*M_ ];

	short * n1 = new short[d_+1];
	short * n2 = new short[d_+1];

	// For each of d+1 axes,
	for( int j = 0; j <= d_; j++ ){
		for( int i=0; i<M_; i++ ){
			const short * key = hash_table.getKey( i );
			for( int k=0; k<d_; k++ ){
				n1[k] = key[k] - 1;
				n2[k] = key[k] + 1;
			}
			n1[j] = key[j] + d_;
			n2[j] = key[j] - d_;

			blur_neighbors_[j*M_+i].n1 = hash_table.find( n1 );
			blur_neighbors_[j*M_+i].n2 = hash_table.find( n2 );
		}
	}
	delete[] n1;
	delete[] n2;
}
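Both this example and the previous one fall back to _mm_cvtepi32_ps(_mm_cvtps_epi32(v)) when SSE4.1 is unavailable; that round trip follows the current MXCSR rounding mode, which is why each snippet forces round-to-nearest beforehand and restores the old mode afterwards. A minimal self-contained sketch of that pattern (valid only for values representable in a 32-bit integer):

#include <xmmintrin.h>
#include <emmintrin.h>

// Emulate _mm_round_ps(v, _MM_FROUND_TO_NEAREST_INT) on SSE2 by a round
// trip through integers under a forced round-to-nearest MXCSR mode.
static __m128 round_nearest_sse2( __m128 v )
{
	const unsigned int old_csr = _mm_getcsr();
	_mm_setcsr( (old_csr & ~_MM_ROUND_MASK) | _MM_ROUND_NEAREST );
	const __m128 r = _mm_cvtepi32_ps( _mm_cvtps_epi32( v ) );
	_mm_setcsr( old_csr );
	return r;
}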
Example no. 7
static __m128i cielabv (union hvrgbpix rgb)
{
    __m128 xvxyz[2] = {_mm_set1_ps(0.5),_mm_set1_ps(0.5) }; //,0.5,0.5,0.5);

    __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0);
    __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0);
    __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0);
    __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]);
    __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]);
    __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]);
    __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]);
    __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]);
    __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]);

    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h));
    xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v));
    xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v));

    xvxyz[0] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO)));
    xvxyz[1] = _mm_max_ps(_mm_set1_ps(0),
                          _mm_min_ps(_mm_set1_ps(0xffff),
                                     _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO)));
    __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]);
    __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]);
#ifdef __AVX__
    __m256 vlab,
           vxyz = { cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                    0,
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                    cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                    0},
           vxyz2 =  {0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                     0,
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                     cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]};

    vlab = _mm256_sub_ps(vxyz,vxyz2);
    vlab = _mm256_mul_ps(vlab, _mm256_setr_ps(116,500,200,0,116,500,200,0));
    vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0));
    vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64));
    vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO);
    __m256i vlabi = _mm256_cvtps_epi32(vlab);
    return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), ((__m128i*)&vlabi)[1]);
#else
    __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrh,2)],
                           0};
    __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,1)],
                           cielab_cbrt[_mm_extract_epi32(loadaddrv,2)],
                           0};

    vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3)));
    vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0));
    vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0));
    vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64));
    vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO);

    vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)),
                       _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3)));
    vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0));
    vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0));
    vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64));
    vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO);

    return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh));
#endif
}
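Before the cube-root table lookup, the snippet clips each channel with max(0, min(0xffff, round-toward-zero(x))) so that _mm_cvttps_epi32 yields a safe table index. A scalar equivalent of that clip (the helper name is illustrative):

#include <algorithm>
#include <cmath>

// Scalar equivalent of the clip above: truncate toward zero, then clamp
// the value into the 16-bit table-index range [0, 0xffff].
static int clip_table_index(float x)
{
    float t = std::trunc(x);
    return (int) std::max(0.0f, std::min(65535.0f, t));
}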
Example no. 8
void convert_to_rgb_fast()
{
    unsigned i,j,c;
    int row, col, k;
    ushort *img;
    float out_cam[3][4];
    double num, inverse[3][3];
    static const double xyzd50_srgb[3][3] =
    { { 0.436083, 0.385083, 0.143055 },
      { 0.222507, 0.716888, 0.060608 },
      { 0.013930, 0.097097, 0.714022 } };
    static const double rgb_rgb[3][3] =
    { { 1,0,0 }, { 0,1,0 }, { 0,0,1 } };
    static const double adobe_rgb[3][3] =
    { { 0.715146, 0.284856, 0.000000 },
      { 0.000000, 1.000000, 0.000000 },
      { 0.000000, 0.041166, 0.958839 } };
    static const double wide_rgb[3][3] =
    { { 0.593087, 0.404710, 0.002206 },
      { 0.095413, 0.843149, 0.061439 },
      { 0.011621, 0.069091, 0.919288 } };
    static const double prophoto_rgb[3][3] =
    { { 0.529317, 0.330092, 0.140588 },
      { 0.098368, 0.873465, 0.028169 },
      { 0.016879, 0.117663, 0.865457 } };
    static const double (*out_rgb[])[3] =
    { rgb_rgb, adobe_rgb, wide_rgb, prophoto_rgb, xyz_rgb };
    static const char *name[] =
    { "sRGB", "Adobe RGB (1998)", "WideGamut D65", "ProPhoto D65", "XYZ" };
    static const unsigned phead[] =
    { 1024, 0, 0x2100000, 0x6d6e7472, 0x52474220, 0x58595a20, 0, 0, 0,
      0x61637370, 0, 0, 0x6e6f6e65, 0, 0, 0, 0, 0xf6d6, 0x10000, 0xd32d };
    unsigned pbody[] =
    { 10, 0x63707274, 0, 36, /* cprt */
      0x64657363, 0, 40, /* desc */
      0x77747074, 0, 20, /* wtpt */
      0x626b7074, 0, 20, /* bkpt */
      0x72545243, 0, 14, /* rTRC */
      0x67545243, 0, 14, /* gTRC */
      0x62545243, 0, 14, /* bTRC */
      0x7258595a, 0, 20, /* rXYZ */
      0x6758595a, 0, 20, /* gXYZ */
      0x6258595a, 0, 20 }; /* bXYZ */
    static const unsigned pwhite[] = { 0xf351, 0x10000, 0x116cc };
    unsigned pcurve[] = { 0x63757276, 0, 1, 0x1000000 };

    gamma_curve (gamm[0], gamm[1], 0, 0);
    memcpy (out_cam, rgb_cam, sizeof out_cam);
    raw_color |= colors == 1 || document_mode ||
                 output_color < 1 || output_color > 5;
    if (!raw_color) {
        oprof = (unsigned *) calloc (phead[0], 1);
        merror (oprof, "convert_to_rgb()");
        memcpy (oprof, phead, sizeof phead);
        if (output_color == 5) oprof[4] = oprof[5];
        oprof[0] = 132 + 12*pbody[0];
        for (i=0; i < pbody[0]; i++) {
            oprof[oprof[0]/4] = i ? (i > 1 ? 0x58595a20 : 0x64657363) : 0x74657874;
            pbody[i*3+2] = oprof[0];
            oprof[0] += (pbody[i*3+3] + 3) & -4;
        }
        memcpy (oprof+32, pbody, sizeof pbody);
        oprof[pbody[5]/4+2] = strlen(name[output_color-1]) + 1;
        memcpy ((char *)oprof+pbody[8]+8, pwhite, sizeof pwhite);
        pcurve[3] = (short)(256/gamm[5]+0.5) << 16;
        for (i=4; i < 7; i++)
            memcpy ((char *)oprof+pbody[i*3+2], pcurve, sizeof pcurve);
        pseudoinverse ((double (*)[3])out_rgb[output_color-1], inverse, 3);
        for (i=0; i < 3; i++)
            for (j=0; j < 3; j++) {
                for (num = k=0; k < 3; k++)
                    num += xyzd50_srgb[i][k] * inverse[j][k];
                oprof[pbody[j*3+23]/4+i+2] = num * 0x10000 + 0.5;
            }
        for (i=0; i < phead[0]/4; i++)
            oprof[i] = htonl(oprof[i]);
        strcpy ((char *)oprof+pbody[2]+8, "auto-generated by dcraw");
        strcpy ((char *)oprof+pbody[5]+12, name[output_color-1]);
        for (i=0; i < 3; i++)
            for (j=0; j < colors; j++)
                for (out_cam[i][j] = k=0; k < 3; k++)
                    out_cam[i][j] += out_rgb[output_color-1][i][k] * rgb_cam[k][j];
    }
    if (verbose)
        fprintf (stderr, raw_color ? _("Building histograms...\n") :
                 _("Converting to %s colorspace...\n"), name[output_color-1]);

    memset (histogram, 0, sizeof histogram);
    if(!raw_color) {
        __m128 outcam0= {out_cam[0][0],out_cam[1][0],out_cam[2][0],0},
               outcam1= {out_cam[0][1],out_cam[1][1],out_cam[2][1],0},
               outcam2= {out_cam[0][2],out_cam[1][2],out_cam[2][2],0},
               outcam3= {out_cam[0][3],out_cam[1][3],out_cam[2][3],0};
        for (img=image[0]; img < image[width*height]; img+=4) {
            __m128 out0;
            __m128 vimg0 = {img[0],img[0],img[0],0},
                   vimg1 = {img[1],img[1],img[1],0},
                   vimg2 = {img[2],img[2],img[2],0},
                   vimg3 = {img[3],img[3],img[3],0};

//             out[0] = out_cam[0][0] * img[0]
//                     +out_cam[0][1] * img[1]
//                     +out_cam[0][2] * img[2]
//                     +out_cam[0][3] * img[3];
//             out[1] = out_cam[1][0] * img[0]
//                     +out_cam[1][1] * img[1]
//                     +out_cam[1][2] * img[2]
//                     +out_cam[1][3] * img[3];
//             out[2] = out_cam[2][0] * img[0]
//                     +out_cam[2][1] * img[1]
//                     +out_cam[2][2] * img[2]
//                     +out_cam[2][3] * img[3];
            out0 = _mm_add_ps(_mm_add_ps(
                     _mm_mul_ps(vimg0, outcam0),
                     _mm_mul_ps(vimg1, outcam1)
                   ), _mm_add_ps(
                     _mm_mul_ps(vimg2, outcam2),
                     _mm_mul_ps(vimg3, outcam3)
                   ));
            //clip
            out0 = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(out0, _MM_FROUND_TO_ZERO)));
            __m128i o = _mm_cvtps_epi32(out0);
            o = _mm_packus_epi32(o,_mm_setzero_si128());
            memcpy(img, &o, sizeof(short)*3);

            FORCC histogram[c][img[c] >> 3]++;
        }

    } else if (document_mode) {
Example no. 9
__m128 test_mm_round_ps(__m128 x) {
  // CHECK: define {{.*}} @test_mm_round_ps
  // CHECK: @llvm.x86.sse41.round.ps
  return _mm_round_ps(x, 2);
}
Example no. 10
__m128 test_mm_round_ps(__m128 x) {
  // CHECK-LABEL: test_mm_round_ps
  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps
  // CHECK-ASM: roundps $2, %xmm{{.*}}, %xmm{{.*}}
  return _mm_round_ps(x, 2);
}
Example no. 11
__m128 test_mm_round_ps(__m128 x) {
  // CHECK-LABEL: test_mm_round_ps
  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 4)
  return _mm_round_ps(x, 4);
}
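For reference, the raw immediates used in these compiler tests map to named rounding controls in <smmintrin.h>: 2 is _MM_FROUND_TO_POS_INF (round toward +infinity) and 4 is _MM_FROUND_CUR_DIRECTION (round according to the current MXCSR mode), so the calls could equivalently be written as:

#include <smmintrin.h>

__m128 round_up(__m128 x)      { return _mm_round_ps(x, _MM_FROUND_TO_POS_INF); }    // immediate 2
__m128 round_current(__m128 x) { return _mm_round_ps(x, _MM_FROUND_CUR_DIRECTION); } // immediate 4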