//! Lane-wise floating-point remainder of x by y on four packed floats.
//! Negative lanes of x are first shifted up by (trunc(-x / y) + 1) * y, then
//! the truncated quotient times y is removed from every lane.
//! NOTE(review): applyMask_ps is defined elsewhere; presumably it selects its
//! second argument in lanes where the mask is set and the third otherwise.
__m128 modulus_128(
  const __m128& x,
  const __m128& y) {

  //! Lanes holding a negative input
  const __m128 isNegative = _mm_cmplt_ps(x, _mm_set1_ps(0.f));

  //! Number of periods used to shift the negative lanes
  const __m128 shiftCount = _mm_round_ps((-x) / y, _MM_FROUND_TO_ZERO) + _mm_set1_ps(1.f);

  //! Shifted value for the negative lanes, untouched value for the others
  const __m128 shifted = applyMask_ps(isNegative, x + shiftCount * y, x);

  //! Remove the whole number of periods (quotient truncated toward zero)
  const __m128 quotient = _mm_round_ps(shifted / y, _MM_FROUND_TO_ZERO);
  return shifted - quotient * y;
}
//! Approximate exp(x) on four packed floats.
//! The input is clamped to [-88.3762626647949, 88.3762626647949], split as
//! x = g + n * log(2) with n = floor(x * log2(e) + 0.5), exp(g) is evaluated
//! with a polynomial and the result is scaled by 2^n built directly in the
//! exponent bits of a float.
__m128 exp_128(
  const __m128& x) {

  //! Clip the value
  const __m128 upperBound = _mm_set1_ps(88.3762626647949f);
  const __m128 lowerBound = _mm_set1_ps(-88.3762626647949f);
  __m128 g = _mm_max_ps(_mm_min_ps(x, upperBound), lowerBound);

  //! n = floor(g * log2(e) + 0.5): truncate toward zero, then subtract 1 in
  //! the lanes where truncation went above the original value
  const __m128 scaled = g * _mm_set1_ps(1.44269504088896341) + _mm_set1_ps(0.5f);
  const __m128 truncated = _mm_round_ps(scaled, _MM_FROUND_TO_ZERO);
  const __m128 overshoot = _mm_and_ps(_mm_cmpgt_ps(truncated, scaled), _mm_set1_ps(1.f));
  const __m128 n = truncated - overshoot;

  //! Reduced argument g = x - n * log(2)
  g -= n * _mm_set1_ps(0.693359375 - 2.12194440e-4);
  const __m128 gSquared = g * g;

  //! Polynomial in g, evaluated with Horner's scheme
  __m128 poly = _mm_set1_ps(1.9875691500E-4);
  poly = poly * g + _mm_set1_ps(1.3981999507E-3);
  poly = poly * g + _mm_set1_ps(8.3334519073E-3);
  poly = poly * g + _mm_set1_ps(4.1665795894E-2);
  poly = poly * g + _mm_set1_ps(1.6666665459E-1);
  poly = poly * g + _mm_set1_ps(5.0000001201E-1);
  const __m128 expG = poly * gSquared + g + _mm_set1_ps(1.f);

  //! Build 2^n: biased exponent (n + 127) moved into the exponent field
  const __m128i biased = _mm_add_epi32(_mm_cvttps_epi32(n), _mm_set1_epi32(0x7f));
  const __m128 twoPowN = _mm_castsi128_ps(_mm_slli_epi32(biased, 23));

  //! Return the result
  return expG * twoPowN;
}
__m128 ori_to_bin_128( const __m128& ori, const int nbins) { //! For convenience const __m128 x2PI = _mm_set1_ps(2 * M_PI); const __m128 xbins = _mm_set1_ps(nbins); //! Get it positive const __m128 mask = _mm_cmplt_ps(ori, _mm_setzero_ps()); //! Get the value const __m128 val = _mm_round_ps(applyMask_ps(mask, ori + x2PI, ori) / x2PI * xbins + _mm_set1_ps(0.5f), _MM_FROUND_TO_ZERO); //! Return the modulo of it return val - xbins * _mm_round_ps(val / xbins, _MM_FROUND_TO_ZERO); }
/* Negative compile test for the SSE4.1/AVX round and blend intrinsics.
   Every statement passes k4 as the last (control) argument and carries, on
   the same line, an annotation expecting the diagnostic
   "the last argument must be a 4-bit immediate".  Keep each annotation on
   the line of the statement it belongs to.
   NOTE(review): k4 and the vector operands (d1..d3, a1..a3, b1, b2, e1..e3)
   are declared outside this block; presumably k4 is deliberately not a valid
   4-bit compile-time constant -- confirm against the file header.  */
void test4bit (void)
{
  d1 = _mm_round_pd (d2, k4); /* { dg-error "the last argument must be a 4-bit immediate" } */
  d1 = _mm_round_sd (d2, d3, k4); /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ps (a2, k4); /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_round_ss (a2, a2, k4); /* { dg-error "the last argument must be a 4-bit immediate" } */
  a1 = _mm_blend_ps (a2, a3, k4); /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_blend_pd (e2, e3, k4); /* { dg-error "the last argument must be a 4-bit immediate" } */
  e1 = _mm256_round_pd (e2, k4); /* { dg-error "the last argument must be a 4-bit immediate" } */
  b1 = _mm256_round_ps (b2, k4); /* { dg-error "the last argument must be a 4-bit immediate" } */
}
int dist_mic(const float* xyz, const int* pairs, const float* box_matrix, float* distance_out, float* displacement_out, const int n_frames, const int n_atoms, const int n_pairs) { /* Compute the distance/displacement between pairs of atoms in every frame of xyz following the minimum image convention in periodic boundary conditions. The computation follows scheme B.9 in Tukerman, M. "Statistical Mechanics: Theory and Molecular Simulation", 2010. Parameters ---------- xyz : array, shape=(n_frames, n_atoms, 3) Cartesian coordinates of the atoms in every frame, in contiguous C order. pairs : array, shape=(n_pairs, 2) The specific pairs of atoms whose distance you want to compute. A 2d array of pairs, in C order. box_matrix : array, shape=(3,3) The box matrix for a single frame. All of the frames are assumed to use this box vector. distance_out : array, shape=(n_frames, n_pairs) Array where the distances between pairs will be stored, in contiguous C order. displacement_out : array, shaoe=(n_frames, n_pairs, 3), optional An optional return value: if you'd also like to save the displacement vectors between the pairs, you can pass a pointer here. If displacement_out is NULL, then this variable will not be saved back to memory. All of the arrays are assumed to be contiguous. This code will segfault if they're not. */ #ifndef __SSE4_1__ _MM_SET_ROUNDING_MODE(_MM_ROUND_NEAREST); int rounding_mode = _MM_GET_ROUNDING_MODE(); #endif int i, j; int store_displacement = displacement_out == NULL ? 0 : 1; int store_distance = distance_out == NULL ? 0 : 1; __m128 r1, r2, s12, r12, s, r12_2; __m128 hinv[3]; __m128 h[3]; for (i = 0; i < n_frames; i++) { // Store the columns of the box matrix in three float4s. This format // is fast for matrix * vector product. See, for example, this S.O. 
question: // http://stackoverflow.com/questions/14967969/efficient-4x4-matrix-vector-multiplication-with-sse-horizontal-add-and-dot-prod h[0] = _mm_setr_ps(box_matrix[0], box_matrix[3], box_matrix[6], 0.0f); h[1] = _mm_setr_ps(box_matrix[1], box_matrix[4], box_matrix[7], 0.0f); h[2] = _mm_setr_ps(box_matrix[2], box_matrix[5], box_matrix[8], 0.0f); // Calculate the inverse of the box matrix, and also store it in the same // format. inverse33(box_matrix, hinv+0, hinv+1, hinv+2); for (j = 0; j < n_pairs; j++) { // Load the two vectors whos distance we want to compute r1 = load_float3(xyz + 3*pairs[2*j + 0]); r2 = load_float3(xyz + 3*pairs[2*j + 1]); r12 = _mm_sub_ps(r2, r1); // s12 = INVERSE(H) * r12 s12 = _mm_add_ps(_mm_add_ps( _mm_mul_ps(hinv[0], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(hinv[1], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(1,1,1,1)))), _mm_mul_ps(hinv[2], _mm_shuffle_ps(r12, r12, _MM_SHUFFLE(2,2,2,2)))); // s12 = s12 - NEAREST_INTEGER(s12) #ifdef __SSE4_1__ s12 = _mm_sub_ps(s12, _mm_round_ps(s12, _MM_FROUND_TO_NEAREST_INT)); #else s12 = _mm_sub_ps(s12, _mm_cvtepi32_ps(_mm_cvtps_epi32(s12))); #endif r12 = _mm_add_ps(_mm_add_ps( _mm_mul_ps(h[0], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(0,0,0,0))), _mm_mul_ps(h[1], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(1,1,1,1)))), _mm_mul_ps(h[2], _mm_shuffle_ps(s12, s12, _MM_SHUFFLE(2,2,2,2)))); if (store_displacement) { // store the two lower entries (x,y) in memory _mm_storel_pi((__m64*)(displacement_out), r12); displacement_out += 2; // swap high-low and then store the z entry in the memory _mm_store_ss(displacement_out++, _mm_movehl_ps(r12, r12)); } if (store_distance) { // out = sqrt(sum(r12**2)) r12_2 = _mm_mul_ps(r12, r12); s = _mm_hadd_ps(r12_2, r12_2); s = _mm_hadd_ps(s, s); s = _mm_sqrt_ps(s); _mm_store_ss(distance_out++, s); } } // advance to the next frame xyz += n_atoms*3; box_matrix += 9; } #ifndef __SSE4_1__ _MM_SET_ROUNDING_MODE(rounding_mode); #endif return 1; }
void Permutohedral::init ( const float* feature, int feature_size, int N ) { // Compute the lattice coordinates for each feature [there is going to be a lot of magic here N_ = N; d_ = feature_size; HashTable hash_table( d_, N_/**(d_+1)*/ ); const int blocksize = sizeof(__m128) / sizeof(float); const __m128 invdplus1 = _mm_set1_ps( 1.0f / (d_+1) ); const __m128 dplus1 = _mm_set1_ps( d_+1 ); const __m128 Zero = _mm_set1_ps( 0 ); const __m128 One = _mm_set1_ps( 1 ); // Allocate the class memory if (offset_) delete [] offset_; offset_ = new int[ (d_+1)*(N_+16) ]; memset( offset_, 0, (d_+1)*(N_+16)*sizeof(int) ); if (barycentric_) delete [] barycentric_; barycentric_ = new float[ (d_+1)*(N_+16) ]; memset( barycentric_, 0, (d_+1)*(N_+16)*sizeof(float) ); // Allocate the local memory __m128 * scale_factor = (__m128*) _mm_malloc( (d_ )*sizeof(__m128) , 16 ); __m128 * f = (__m128*) _mm_malloc( (d_ )*sizeof(__m128) , 16 ); __m128 * elevated = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 ); __m128 * rem0 = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128) , 16 ); __m128 * rank = (__m128*) _mm_malloc( (d_+1)*sizeof(__m128), 16 ); float * barycentric = new float[(d_+2)*blocksize]; short * canonical = new short[(d_+1)*(d_+1)]; short * key = new short[d_+1]; // Compute the canonical simplex for( int i=0; i<=d_; i++ ){ for( int j=0; j<=d_-i; j++ ) canonical[i*(d_+1)+j] = i; for( int j=d_-i+1; j<=d_; j++ ) canonical[i*(d_+1)+j] = i - (d_+1); } // Expected standard deviation of our filter (p.6 in [Adams etal 2010]) float inv_std_dev = sqrt(2.0 / 3.0)*(d_+1); // Compute the diagonal part of E (p.5 in [Adams etal 2010]) for( int i=0; i<d_; i++ ) scale_factor[i] = _mm_set1_ps( 1.0 / sqrt( float((i+2)*(i+1) ) * inv_std_dev) ); // Setup the SSE rounding #ifndef __SSE4_1__ const unsigned int old_rounding = _mm_getcsr(); _mm_setcsr( (old_rounding&~_MM_ROUND_MASK) | _MM_ROUND_NEAREST ); #endif // Compute the simplex each feature lies in for( int k=0; k<N_; k+=blocksize ){ // Load the feature 
from memory float * ff = (float*)f; for( int j=0; j<d_; j++ ) for( int i=0; i<blocksize; i++ ) ff[ j*blocksize + i ] = k+i < N_ ? feature[ (k+i)*d_+j ] : 0.0; // Elevate the feature ( y = Ep, see p.5 in [Adams etal 2010]) // sm contains the sum of 1..n of our faeture vector __m128 sm = Zero; for( int j=d_; j>0; j-- ){ __m128 cf = f[j-1]*scale_factor[j-1]; elevated[j] = sm - _mm_set1_ps(j)*cf; sm += cf; } elevated[0] = sm; // Find the closest 0-colored simplex through rounding __m128 sum = Zero; for( int i=0; i<=d_; i++ ){ __m128 v = invdplus1 * elevated[i]; #ifdef __SSE4_1__ v = _mm_round_ps( v, _MM_FROUND_TO_NEAREST_INT ); #else v = _mm_cvtepi32_ps( _mm_cvtps_epi32( v ) ); #endif rem0[i] = v*dplus1; sum += v; } // Find the simplex we are in and store it in rank (where rank describes what position coorinate i has in the sorted order of the features values) for( int i=0; i<=d_; i++ ) rank[i] = Zero; for( int i=0; i<d_; i++ ){ __m128 di = elevated[i] - rem0[i]; for( int j=i+1; j<=d_; j++ ){ __m128 dj = elevated[j] - rem0[j]; __m128 c = _mm_and_ps( One, _mm_cmplt_ps( di, dj ) ); rank[i] += c; rank[j] += One-c; } } // If the point doesn't lie on the plane (sum != 0) bring it back for( int i=0; i<=d_; i++ ){ rank[i] += sum; __m128 add = _mm_and_ps( dplus1, _mm_cmplt_ps( rank[i], Zero ) ); __m128 sub = _mm_and_ps( dplus1, _mm_cmpge_ps( rank[i], dplus1 ) ); rank[i] += add-sub; rem0[i] += add-sub; } // Compute the barycentric coordinates (p.10 in [Adams etal 2010]) for( int i=0; i<(d_+2)*blocksize; i++ ) barycentric[ i ] = 0; for( int i=0; i<=d_; i++ ){ __m128 v = (elevated[i] - rem0[i])*invdplus1; // Didn't figure out how to SSE this float * fv = (float*)&v; float * frank = (float*)&rank[i]; for( int j=0; j<blocksize; j++ ){ int p = d_-frank[j]; barycentric[j*(d_+2)+p ] += fv[j]; barycentric[j*(d_+2)+p+1] -= fv[j]; } } // The rest is not SSE'd for( int j=0; j<blocksize; j++ ){ // Wrap around barycentric[j*(d_+2)+0]+= 1 + barycentric[j*(d_+2)+d_+1]; float * frank = 
(float*)rank; float * frem0 = (float*)rem0; // Compute all vertices and their offset for( int remainder=0; remainder<=d_; remainder++ ){ for( int i=0; i<d_; i++ ){ key[i] = frem0[i*blocksize+j] + canonical[ remainder*(d_+1) + (int)frank[i*blocksize+j] ]; } offset_[ (j+k)*(d_+1)+remainder ] = hash_table.find( key, true ); barycentric_[ (j+k)*(d_+1)+remainder ] = barycentric[ j*(d_+2)+remainder ]; } } } _mm_free( scale_factor ); _mm_free( f ); _mm_free( elevated ); _mm_free( rem0 ); _mm_free( rank ); delete [] barycentric; delete [] canonical; delete [] key; // Reset the SSE rounding #ifndef __SSE4_1__ _mm_setcsr( old_rounding ); #endif // This is normally fast enough so no SSE needed here // Find the Neighbors of each lattice point // Get the number of vertices in the lattice M_ = hash_table.size(); // Create the neighborhood structure if(blur_neighbors_) delete[] blur_neighbors_; blur_neighbors_ = new Neighbors[ (d_+1)*M_ ]; short * n1 = new short[d_+1]; short * n2 = new short[d_+1]; // For each of d+1 axes, for( int j = 0; j <= d_; j++ ){ for( int i=0; i<M_; i++ ){ const short * key = hash_table.getKey( i ); for( int k=0; k<d_; k++ ){ n1[k] = key[k] - 1; n2[k] = key[k] + 1; } n1[j] = key[j] + d_; n2[j] = key[j] - d_; blur_neighbors_[j*M_+i].n1 = hash_table.find( n1 ); blur_neighbors_[j*M_+i].n2 = hash_table.find( n2 ); } } delete[] n1; delete[] n2; }
static __m128i cielabv (union hvrgbpix rgb) { __m128 xvxyz[2] = {_mm_set1_ps(0.5),_mm_set1_ps(0.5) }; //,0.5,0.5,0.5); __m128 vcam0 = _mm_setr_ps(cielab_xyz_cam[0][0],cielab_xyz_cam[1][0],cielab_xyz_cam[2][0],0); __m128 vcam1 = _mm_setr_ps(cielab_xyz_cam[0][1],cielab_xyz_cam[1][1],cielab_xyz_cam[2][1],0); __m128 vcam2 = _mm_setr_ps(cielab_xyz_cam[0][2],cielab_xyz_cam[1][2],cielab_xyz_cam[2][2],0); __m128 vrgb0h = _mm_set1_ps(rgb.h.c[0]); __m128 vrgb1h = _mm_set1_ps(rgb.h.c[1]); __m128 vrgb2h = _mm_set1_ps(rgb.h.c[2]); __m128 vrgb0v = _mm_set1_ps(rgb.v.c[0]); __m128 vrgb1v = _mm_set1_ps(rgb.v.c[1]); __m128 vrgb2v = _mm_set1_ps(rgb.v.c[2]); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam0,vrgb0h)); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam1,vrgb1h)); xvxyz[0] = _mm_add_ps(xvxyz[0], _mm_mul_ps(vcam2,vrgb2h)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam0,vrgb0v)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam1,vrgb1v)); xvxyz[1] = _mm_add_ps(xvxyz[1], _mm_mul_ps(vcam2,vrgb2v)); xvxyz[0] = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(xvxyz[0], _MM_FROUND_TO_ZERO))); xvxyz[1] = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(xvxyz[1], _MM_FROUND_TO_ZERO))); __m128i loadaddrh = _mm_cvttps_epi32(xvxyz[0]); __m128i loadaddrv = _mm_cvttps_epi32(xvxyz[1]); #ifdef __AVX__ __m256 vlab, vxyz = { cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], 0, cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,0)], cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], 0}, vxyz2 = {0, cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,2)], cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], 0, cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,2)], cielab_cbrt[_mm_extract_epi32(loadaddrv,0)]}; vlab = _mm256_sub_ps(vxyz,vxyz2); vlab = _mm256_mul_ps(vlab, 
_mm256_setr_ps(116,500,200,0,116,500,200,0)); vlab = _mm256_sub_ps(vlab, _mm256_setr_ps(16,0,0,0,16,0,0,0)); vlab = _mm256_mul_ps(vlab,_mm256_set1_ps(64)); vlab = _mm256_round_ps(vlab, _MM_FROUND_TO_ZERO); __m256i vlabi = _mm256_cvtps_epi32(vlab); return _mm_packs_epi32(_mm256_castsi256_si128(vlabi), ((__m128i*)&vlabi)[1]); #else __m128 vlabh, vxyzh = {cielab_cbrt[_mm_extract_epi32(loadaddrh,0)], cielab_cbrt[_mm_extract_epi32(loadaddrh,1)], cielab_cbrt[_mm_extract_epi32(loadaddrh,2)], 0}; __m128 vlabv, vxyzv = {cielab_cbrt[_mm_extract_epi32(loadaddrv,0)], cielab_cbrt[_mm_extract_epi32(loadaddrv,1)], cielab_cbrt[_mm_extract_epi32(loadaddrv,2)], 0}; vlabh = _mm_sub_ps(_mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,1,0,1)), _mm_shuffle_ps(vxyzh,vxyzh,_MM_SHUFFLE(0,2,1,3))); vlabh = _mm_mul_ps(vlabh,_mm_setr_ps(116,500,200,0)); vlabh = _mm_sub_ps(vlabh,_mm_setr_ps(16,0,0,0)); vlabh = _mm_mul_ps(vlabh,_mm_set_ps1(64)); vlabh = _mm_round_ps(vlabh, _MM_FROUND_TO_ZERO); vlabv = _mm_sub_ps(_mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,1,0,1)), _mm_shuffle_ps(vxyzv,vxyzv,_MM_SHUFFLE(0,2,1,3))); vlabv = _mm_mul_ps(vlabv,_mm_setr_ps(116,500,200,0)); vlabv = _mm_sub_ps(vlabv,_mm_setr_ps(16,0,0,0)); vlabv = _mm_mul_ps(vlabv,_mm_set_ps1(64)); vlabv = _mm_round_ps(vlabv, _MM_FROUND_TO_ZERO); return _mm_set_epi64(_mm_cvtps_pi16(vlabv),_mm_cvtps_pi16(vlabh)); #endif }
void convert_to_rgb_fast() { unsigned i,j,c; int row, col, k; ushort *img; float out_cam[3][4]; double num, inverse[3][3]; static const double xyzd50_srgb[3][3] = { { 0.436083, 0.385083, 0.143055 }, { 0.222507, 0.716888, 0.060608 }, { 0.013930, 0.097097, 0.714022 } }; static const double rgb_rgb[3][3] = { { 1,0,0 }, { 0,1,0 }, { 0,0,1 } }; static const double adobe_rgb[3][3] = { { 0.715146, 0.284856, 0.000000 }, { 0.000000, 1.000000, 0.000000 }, { 0.000000, 0.041166, 0.958839 } }; static const double wide_rgb[3][3] = { { 0.593087, 0.404710, 0.002206 }, { 0.095413, 0.843149, 0.061439 }, { 0.011621, 0.069091, 0.919288 } }; static const double prophoto_rgb[3][3] = { { 0.529317, 0.330092, 0.140588 }, { 0.098368, 0.873465, 0.028169 }, { 0.016879, 0.117663, 0.865457 } }; static const double (*out_rgb[])[3] = { rgb_rgb, adobe_rgb, wide_rgb, prophoto_rgb, xyz_rgb }; static const char *name[] = { "sRGB", "Adobe RGB (1998)", "WideGamut D65", "ProPhoto D65", "XYZ" }; static const unsigned phead[] = { 1024, 0, 0x2100000, 0x6d6e7472, 0x52474220, 0x58595a20, 0, 0, 0, 0x61637370, 0, 0, 0x6e6f6e65, 0, 0, 0, 0, 0xf6d6, 0x10000, 0xd32d }; unsigned pbody[] = { 10, 0x63707274, 0, 36, /* cprt */ 0x64657363, 0, 40, /* desc */ 0x77747074, 0, 20, /* wtpt */ 0x626b7074, 0, 20, /* bkpt */ 0x72545243, 0, 14, /* rTRC */ 0x67545243, 0, 14, /* gTRC */ 0x62545243, 0, 14, /* bTRC */ 0x7258595a, 0, 20, /* rXYZ */ 0x6758595a, 0, 20, /* gXYZ */ 0x6258595a, 0, 20 }; /* bXYZ */ static const unsigned pwhite[] = { 0xf351, 0x10000, 0x116cc }; unsigned pcurve[] = { 0x63757276, 0, 1, 0x1000000 }; gamma_curve (gamm[0], gamm[1], 0, 0); memcpy (out_cam, rgb_cam, sizeof out_cam); raw_color |= colors == 1 || document_mode || output_color < 1 || output_color > 5; if (!raw_color) { oprof = (unsigned *) calloc (phead[0], 1); merror (oprof, "convert_to_rgb()"); memcpy (oprof, phead, sizeof phead); if (output_color == 5) oprof[4] = oprof[5]; oprof[0] = 132 + 12*pbody[0]; for (i=0; i < pbody[0]; i++) { 
oprof[oprof[0]/4] = i ? (i > 1 ? 0x58595a20 : 0x64657363) : 0x74657874; pbody[i*3+2] = oprof[0]; oprof[0] += (pbody[i*3+3] + 3) & -4; } memcpy (oprof+32, pbody, sizeof pbody); oprof[pbody[5]/4+2] = strlen(name[output_color-1]) + 1; memcpy ((char *)oprof+pbody[8]+8, pwhite, sizeof pwhite); pcurve[3] = (short)(256/gamm[5]+0.5) << 16; for (i=4; i < 7; i++) memcpy ((char *)oprof+pbody[i*3+2], pcurve, sizeof pcurve); pseudoinverse ((double (*)[3])out_rgb[output_color-1], inverse, 3); for (i=0; i < 3; i++) for (j=0; j < 3; j++) { for (num = k=0; k < 3; k++) num += xyzd50_srgb[i][k] * inverse[j][k]; oprof[pbody[j*3+23]/4+i+2] = num * 0x10000 + 0.5; } for (i=0; i < phead[0]/4; i++) oprof[i] = htonl(oprof[i]); strcpy ((char *)oprof+pbody[2]+8, "auto-generated by dcraw"); strcpy ((char *)oprof+pbody[5]+12, name[output_color-1]); for (i=0; i < 3; i++) for (j=0; j < colors; j++) for (out_cam[i][j] = k=0; k < 3; k++) out_cam[i][j] += out_rgb[output_color-1][i][k] * rgb_cam[k][j]; } if (verbose) fprintf (stderr, raw_color ? 
_("Building histograms...\n") : _("Converting to %s colorspace...\n"), name[output_color-1]); memset (histogram, 0, sizeof histogram); if(!raw_color) { __m128 outcam0= {out_cam[0][0],out_cam[1][0],out_cam[2][0],0}, outcam1= {out_cam[0][1],out_cam[1][1],out_cam[2][1],0}, outcam2= {out_cam[0][2],out_cam[1][2],out_cam[2][2],0}, outcam3= {out_cam[0][3],out_cam[1][3],out_cam[2][3],0}; for (img=image[0]; img < image[width*height]; img+=4) { __m128 out0; __m128 vimg0 = {img[0],img[0],img[0],0}, vimg1 = {img[1],img[1],img[1],0}, vimg2 = {img[2],img[2],img[2],0}, vimg3 = {img[3],img[3],img[3],0}; // out[0] = out_cam[0][0] * img[0] // +out_cam[0][1] * img[1] // +out_cam[0][2] * img[2] // +out_cam[0][3] * img[3]; // out[1] = out_cam[1][0] * img[0] // +out_cam[1][1] * img[1] // +out_cam[1][2] * img[2] // +out_cam[1][3] * img[3]; // out[2] = out_cam[2][0] * img[0] // +out_cam[2][1] * img[1] // +out_cam[2][2] * img[2] // +out_cam[2][3] * img[3]; out0 = _mm_add_ps(_mm_add_ps( _mm_mul_ps(vimg0, outcam0), _mm_mul_ps(vimg1, outcam1) ), _mm_add_ps( _mm_mul_ps(vimg2, outcam2), _mm_mul_ps(vimg3, outcam3) )); //clip out0 = _mm_max_ps(_mm_set1_ps(0), _mm_min_ps(_mm_set1_ps(0xffff), _mm_round_ps(out0, _MM_FROUND_TO_ZERO))); __m128i o = _mm_cvtps_epi32(out0); o = _mm_packus_epi32(o,_mm_setzero_si128()); memcpy(img, &o, sizeof(short)*3); FORCC histogram[c][img[c] >> 3]++; } } else if (document_mode) {
// Code-generation test wrapper: rounds the four packed floats of x with the
// round-toward-positive-infinity control (immediate value 2).
// The directive comments inside the body are part of the test and are kept
// verbatim.
__m128 test_mm_round_ps(__m128 x) {
  // CHECK: define {{.*}} @test_mm_round_ps
  // CHECK: @llvm.x86.sse41.round.ps
  return _mm_round_ps(x, _MM_FROUND_TO_POS_INF);
}
// Code-generation test wrapper: rounds the four packed floats of x with the
// round-toward-positive-infinity control (immediate value 2, matching the
// "$2" operand expected in the assembly directive below).
// The directive comments inside the body are part of the test and are kept
// verbatim.
__m128 test_mm_round_ps(__m128 x) {
  // CHECK-LABEL: test_mm_round_ps
  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps
  // CHECK-ASM: roundps $2, %xmm{{.*}}, %xmm{{.*}}
  return _mm_round_ps(x, _MM_FROUND_TO_POS_INF);
}
// Code-generation test wrapper: rounds the four packed floats of x using the
// current MXCSR rounding direction (immediate value 4, matching the "i32 4"
// operand expected in the directive below).
// The directive comments inside the body are part of the test and are kept
// verbatim.
__m128 test_mm_round_ps(__m128 x) {
  // CHECK-LABEL: test_mm_round_ps
  // CHECK: call <4 x float> @llvm.x86.sse41.round.ps(<4 x float> %{{.*}}, i32 4)
  return _mm_round_ps(x, _MM_FROUND_CUR_DIRECTION);
}