Exemplo n.º 1
0
// Refines the estimate returned by EstimatePrincipleComponent() by running
// POWER_ITERATION_COUNT rounds of "multiply by the matrix, then divide by the
// largest of the x/y/z results".
//
// matrix: six-element symmetric 3x3 matrix; elements are read as
//         [0 1 2 / 1 3 4 / 2 4 5].
// returns: the xyz part of the iterated vector.
Vec3 ComputePrincipleComponent( Sym3x3 const& matrix )
{
	// unpack the symmetric storage into three rows (w lane is zero)
	Vec4 const rowX( matrix[0], matrix[1], matrix[2], 0.0f );
	Vec4 const rowY( matrix[1], matrix[3], matrix[4], 0.0f );
	Vec4 const rowZ( matrix[2], matrix[4], matrix[5], 0.0f );

	// seed the iteration with the cheap estimate
	Vec3 seed = EstimatePrincipleComponent( matrix );
	Vec4 axis( seed.X(), seed.Y(), seed.Z(), 0.0f );

	for( int pass = 0; pass < POWER_ITERATION_COUNT; ++pass )
	{
		// matrix * axis, accumulated one row at a time
		Vec4 product = rowX*axis.SplatX();
		product = MultiplyAdd( rowY, axis.SplatY(), product );
		product = MultiplyAdd( rowZ, axis.SplatZ(), product );

		// largest of the x, y and z results, in every channel
		Vec4 const yzPeak = Max( product.SplatY(), product.SplatZ() );
		Vec4 const peak = Max( product.SplatX(), yzPeak );

		// rescale by the peak and use the result as the next input
		axis = product*Reciprocal( peak );
	}

	return axis.GetVec3();
}
Exemplo n.º 2
0
int main(int /*argc*/, char** /*argv*/) {

    std::string opts = ReadFileIntoString("MultiplyAdd.input");
    OptionParser parser(opts);
    double a = parser.Get<double>("A");
    double b = parser.Get<double>("B");
    double c = parser.Get<double>("C");
    double result = MultiplyAdd(a, b, c);
    std::cout << "a is " << a << ", b is " << b << ", c is " << c << std::endl;
    std::cout << "The result of MultiplyAdd on a, b, & c is: "
	      << result << std::endl;
 
    return EXIT_SUCCESS;
}
Exemplo n.º 3
0
// Solves the least-squares fit of two endpoints to the weighted colours for
// the current per-colour alpha/beta values, snaps the endpoints to the
// 31/63/31 grid and measures the resulting error.
//
// start, end: outputs; receive the clamped and grid-snapped endpoints.
// returns:    e5.SplatX() + e5.SplatY() + e5.SplatZ(), where e5 is the
//             per-channel error multiplied by m_metricSqr.
//
// NOTE(review): NegativeMultiplySubtract( a, b, c ) is taken to be c - a*b
// and Select( off, on, mask ) to pick `on` where the mask is set - confirm
// against the SIMD header.
Vec4 ClusterFit::SolveLeastSquares( Vec4& start, Vec4& end ) const
{
	// accumulate all the quantities we need
	int const count = m_colours->GetCount();
	Vec4 alpha2_sum = VEC4_CONST( 0.0f );
	Vec4 beta2_sum = VEC4_CONST( 0.0f );
	Vec4 alphabeta_sum = VEC4_CONST( 0.0f );
	Vec4 alphax_sum = VEC4_CONST( 0.0f );
	Vec4 betax_sum = VEC4_CONST( 0.0f );
	for( int i = 0; i < count; ++i )
	{
		Vec4 alpha = m_alpha[i];
		Vec4 beta = m_beta[i];
		Vec4 x = m_weighted[i];

		alpha2_sum = MultiplyAdd( alpha, alpha, alpha2_sum );
		beta2_sum = MultiplyAdd( beta, beta, beta2_sum );
		alphabeta_sum = MultiplyAdd( alpha, beta, alphabeta_sum );
		alphax_sum = MultiplyAdd( alpha, x, alphax_sum );
		betax_sum = MultiplyAdd( beta, x, betax_sum );
	}

	// masks marking the degenerate cases where one of the squared sums is zero
	Vec4 const zero = VEC4_CONST( 0.0f );
	Vec4 beta2_sum_zero = CompareEqual( beta2_sum, zero );
	Vec4 alpha2_sum_zero = CompareEqual( alpha2_sum, zero );

	// single-endpoint solutions, used when the other squared sum is zero
	Vec4 a1 = alphax_sum*Reciprocal( alpha2_sum );
	Vec4 b1 = betax_sum*Reciprocal( beta2_sum );

	// general two-endpoint solution
	Vec4 factor = Reciprocal( NegativeMultiplySubtract(
		alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum
	) );
	Vec4 a2 = NegativeMultiplySubtract(
		betax_sum, alphabeta_sum, alphax_sum*beta2_sum
	)*factor;
	Vec4 b2 = NegativeMultiplySubtract(
		alphax_sum, alphabeta_sum, betax_sum*alpha2_sum
	)*factor;

	// select the results; the reciprocals of zero computed above are
	// discarded here by the masks
	Vec4 a = Select( Select( a2, a1, beta2_sum_zero ), zero, alpha2_sum_zero );
	Vec4 b = Select( Select( b2, b1, alpha2_sum_zero ), zero, beta2_sum_zero );

	// clamp the output to [0, 1]
	Vec4 const one = VEC4_CONST( 1.0f );
	Vec4 const half = VEC4_CONST( 0.5f );
	a = Min( one, Max( zero, a ) );
	b = Min( one, Max( zero, b ) );

	// clamp to the grid (adding half before truncating rounds to nearest)
	Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
	Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );
	a = Truncate( MultiplyAdd( grid, a, half ) )*gridrcp;
	b = Truncate( MultiplyAdd( grid, b, half ) )*gridrcp;

	// compute the error
	Vec4 const two = VEC4_CONST( 2.0f );
	Vec4 e1 = MultiplyAdd( b*b, beta2_sum, m_xxsum );
	Vec4 e2 = MultiplyAdd( a, alphax_sum, b*betax_sum );
	Vec4 e3 = MultiplyAdd( a*a, alpha2_sum, e1 );
	Vec4 e4 = MultiplyAdd( a*b*alphabeta_sum - e2, two, e3 );

	// apply the metric to the error term
	Vec4 e5 = e4*m_metricSqr;
	Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();

	// save the start and end
	start = a;
	end = b;
	return error;
}
Exemplo n.º 4
0
void FastClusterFit::Compress4( void* block )
{
	Vec4 const one = VEC4_CONST(1.0f);
	Vec4 const zero = VEC4_CONST(0.0f);
	Vec4 const half = VEC4_CONST(0.5f);
	Vec4 const two = VEC4_CONST(2.0);
	Vec4 const onethird = VEC4_CONST( 1.0f/3.0f );
	Vec4 const twothirds = VEC4_CONST( 2.0f/3.0f );

	// declare variables
	Vec4 beststart = VEC4_CONST( 0.0f );
	Vec4 bestend = VEC4_CONST( 0.0f );
	Vec4 besterror = VEC4_CONST( FLT_MAX );

	Vec4 x0 = zero;
	int b0 = 0, b1 = 0, b2 = 0;
	int i = 0;

	// check all possible clusters for this total order
	for( int c0 = 0; c0 <= 16; c0++)
	{	
		Vec4 x1 = zero;
		
		for( int c1 = 0; c1 <= 16-c0; c1++)
		{	
			Vec4 x2 = zero;
			
			for( int c2 = 0; c2 <= 16-c0-c1; c2++)
			{
				Vec4 const constants = Vec4((const float *)&s_fourElement[i]);
				Vec4 const alpha2_sum = constants.SplatX();
				Vec4 const beta2_sum = constants.SplatY();
				Vec4 const alphabeta_sum = constants.SplatZ();
				Vec4 const factor = constants.SplatW();
				i++;
				
				Vec4 const alphax_sum = x0 + MultiplyAdd(x1, twothirds, x2 * onethird);
				Vec4 const betax_sum = m_xsum - alphax_sum;
				
				Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
				Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;
				
				// clamp the output to [0, 1]
				a = Min( one, Max( zero, a ) );
				b = Min( one, Max( zero, b ) );
				
				// clamp to the grid
				Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
				Vec4 const gridrcp( 0.03227752766457f, 0.01583151765563f, 0.03227752766457f, 0.0f );
				a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp;
				b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp;
				
				// compute the error
				Vec4 e1 = MultiplyAdd( a, alphax_sum, b*betax_sum );
				Vec4 e2 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
				Vec4 e3 = MultiplyAdd( a*b*alphabeta_sum - e1, two, e2 );
				
				// apply the metric to the error term
				Vec4 e4 = e3 * m_metricSqr;
				Vec4 error = e4.SplatX() + e4.SplatY() + e4.SplatZ();
				
				// keep the solution if it wins
				if( CompareAnyLessThan( error, besterror ) )
				{
					besterror = error;
					beststart = a;
					bestend = b;
					b0 = c0;
					b1 = c1;
					b2 = c2;
				}
				
				x2 += m_unweighted[c0+c1+c2];
			}
			
			x1 += m_unweighted[c0+c1];
		}
		
		x0 += m_unweighted[c0];
	}

	// save the block if necessary
	if( CompareAnyLessThan( besterror, m_besterror ) )
	{
		// compute indices from cluster sizes.
		/*uint bestindices = 0;
		{
			int i = b0;
			for(; i < b0+b1; i++) {
				bestindices = 2 << (2 * m_order[i]);
			}
			for(; i < b0+b1+b2; i++) {
				bestindices = 3 << (2 * m_order[i]);
			}
			for(; i < 16; i++) {
				bestindices = 1 << (2 * m_order[i]);
			}
		}*/
		u8 bestindices[16];
		{
			int i = 0;
			for(; i < b0; i++) {
				bestindices[i] = 0;
			}
			for(; i < b0+b1; i++) {
				bestindices[i] = 2;
			}
			for(; i < b0+b1+b2; i++) {
				bestindices[i] = 3;
			}
			for(; i < 16; i++) {
				bestindices[i] = 1;
			}
		}
		
		// remap the indices
		u8 ordered[16];
		for( int i = 0; i < 16; ++i )
			ordered[m_order[i]] = bestindices[i];
		
		// save the block
		WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), ordered, block );
		
		// save the error
		m_besterror = besterror;
	}
}
void WeightedClusterFit::Compress4( void* block )
{
    int const count = m_colours->GetCount();
    Vec4 const one = VEC4_CONST(1.0f);
    Vec4 const zero = VEC4_CONST(0.0f);
    Vec4 const half = VEC4_CONST(0.5f);
    Vec4 const two = VEC4_CONST(2.0);
    Vec4 const onethird( 1.0f/3.0f, 1.0f/3.0f, 1.0f/3.0f, 1.0f/9.0f );
    Vec4 const twothirds( 2.0f/3.0f, 2.0f/3.0f, 2.0f/3.0f, 4.0f/9.0f );
    Vec4 const twonineths = VEC4_CONST( 2.0f/9.0f );
    Vec4 const grid( 31.0f, 63.0f, 31.0f, 0.0f );
    Vec4 const gridrcp( 1.0f/31.0f, 1.0f/63.0f, 1.0f/31.0f, 0.0f );

    // declare variables
    Vec4 beststart = VEC4_CONST( 0.0f );
    Vec4 bestend = VEC4_CONST( 0.0f );
    Vec4 besterror = VEC4_CONST( FLT_MAX );

    Vec4 x0 = zero;
    int b0 = 0, b1 = 0, b2 = 0;

    // check all possible clusters for this total order
    for( int c0 = 0; c0 < count; c0++)
    {
        Vec4 x1 = zero;

        for( int c1 = 0; c1 < count-c0; c1++)
        {
            Vec4 x2 = zero;

            for( int c2 = 0; c2 < count-c0-c1; c2++)
            {
                Vec4 const x3 = m_xsum - x2 - x1 - x0;

                //Vec3 const alphax_sum = x0 + x1 * (2.0f / 3.0f) + x2 * (1.0f / 3.0f);
                //float const alpha2_sum = w0 + w1 * (4.0f/9.0f) + w2 * (1.0f/9.0f);
                Vec4 const alphax_sum = MultiplyAdd(x2, onethird, MultiplyAdd(x1, twothirds, x0)); // alphax_sum, alpha2_sum
                Vec4 const alpha2_sum = alphax_sum.SplatW();

                //Vec3 const betax_sum = x3 + x2 * (2.0f / 3.0f) + x1 * (1.0f / 3.0f);
                //float const beta2_sum = w3 + w2 * (4.0f/9.0f) + w1 * (1.0f/9.0f);
                Vec4 const betax_sum = MultiplyAdd(x2, twothirds, MultiplyAdd(x1, onethird, x3)); // betax_sum, beta2_sum
                Vec4 const beta2_sum = betax_sum.SplatW();

                //float const alphabeta_sum = (w1 + w2) * (2.0f/9.0f);
                Vec4 const alphabeta_sum = twonineths*( x1 + x2 ).SplatW(); // alphabeta_sum

                // float const factor = 1.0f / (alpha2_sum * beta2_sum - alphabeta_sum * alphabeta_sum);
                Vec4 const factor = Reciprocal( NegativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum*beta2_sum) );

                Vec4 a = NegativeMultiplySubtract(betax_sum, alphabeta_sum, alphax_sum*beta2_sum) * factor;
                Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum, betax_sum*alpha2_sum) * factor;

                // clamp to the grid
                a = Min( one, Max( zero, a ) );
                b = Min( one, Max( zero, b ) );
                a = Truncate( MultiplyAdd( grid, a, half ) ) * gridrcp;
                b = Truncate( MultiplyAdd( grid, b, half ) ) * gridrcp;

                // compute the error (we skip the constant xxsum)
                Vec4 e1 = MultiplyAdd( a*a, alpha2_sum, b*b*beta2_sum );
                Vec4 e2 = NegativeMultiplySubtract( a, alphax_sum, a*b*alphabeta_sum );
                Vec4 e3 = NegativeMultiplySubtract( b, betax_sum, e2 );
                Vec4 e4 = MultiplyAdd( two, e3, e1 );

                // apply the metric to the error term
                Vec4 e5 = e4 * m_metricSqr;
                Vec4 error = e5.SplatX() + e5.SplatY() + e5.SplatZ();

                // keep the solution if it wins
                if( CompareAnyLessThan( error, besterror ) )
                {
                    besterror = error;
                    beststart = a;
                    bestend = b;
                    b0 = c0;
                    b1 = c1;
                    b2 = c2;
                }

                x2 += m_weighted[c0+c1+c2];
            }

            x1 += m_weighted[c0+c1];
        }

        x0 += m_weighted[c0];
    }

    // save the block if necessary
    if( CompareAnyLessThan( besterror, m_besterror ) )
    {
        // compute indices from cluster sizes.
        u8 bestindices[16];
        {
            int i = 0;
            for(; i < b0; i++) {
                bestindices[i] = 0;
            }
            for(; i < b0+b1; i++) {
                bestindices[i] = 2;
            }
            for(; i < b0+b1+b2; i++) {
                bestindices[i] = 3;
            }
            for(; i < count; i++) {
                bestindices[i] = 1;
            }
        }

        // remap the indices
        u8 ordered[16];
        for( int i = 0; i < count; ++i )
            ordered[m_order[i]] = bestindices[i];

        m_colours->RemapIndices( ordered, bestindices );

        // save the block
        WriteColourBlock4( beststart.GetVec3(), bestend.GetVec3(), bestindices, block );

        // save the error
        m_besterror = besterror;
    }
}
Exemplo n.º 6
0
// Four-cluster search that takes the alpha^2 / beta^2 / alpha*beta sums and
// the solve factor from the tables part2inits, part2delta and part2factors
// instead of deriving them from the point weights.
//
// block: destination passed through to WriteBitoneBlock4; it is written only
//        when a candidate beats m_besterror.
//
// The table cursors (alphabeta_inits, alphabeta_factors) advance by
// post-increment inside the i/j/k loops and are re-seated at the start of
// every ordering iteration, so the loop nest must keep its exact visiting
// order. The `#if 0` sections hold a disabled cross-check that recomputes the
// same terms directly and asserts they match the tables.
void BitoneClusterFit::ClusterFit4Constant(void* block)
{
  // declare variables
  int const count = m_bitones->GetCount();
  Vec4 const two = VEC4_CONST(2.0f);
  // NOTE(review): `one` is not referenced anywhere in this function
  Vec4 const one = VEC4_CONST(1.0f);

  // xyz: interpolation weight, w: its square
  Vec4 const onethird_onethird2  (1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 9.0f);
  Vec4 const twothirds_twothirds2(2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 4.0f / 9.0f);
  // only referenced inside the disabled `#if 0` cross-check below
  Vec4 const twonineths                                     = VEC4_CONST(2.0f / 9.0f);

  assume((count > 0) && (count <= 16));

  // check all possible clusters and iterate on the total order
  Vec4 beststart = VEC4_CONST(0.0f);
  Vec4 bestend = VEC4_CONST(0.0f);
  // seeded with the stored error, so only strictly better candidates are kept
  Scr4 besterror = m_besterror;
  u8 bestindices[16];
  int bestiteration = 0;
  int besti = 0, bestj = 0, bestk = 0;

  // prepare an ordering using the principle axis
  ConstructOrdering(m_principle, 0);

  // loop over iterations (we avoid the case that all points in first or last cluster)
  for (int iterationIndex = 0;;) {
    // cache some values
    Vec4 const xsum_wsum = m_xsum_wsum;

    // constants if weights == 1
    // NOTE(review): the three tables are defined outside this block; their
    // layout is assumed to match the visiting order of the loops below - confirm.
    Vec4 alphabeta_dltas  = *((Vec4 *)part2delta[0]);
    Vec4 *alphabeta_inits = (Vec4 *)part2inits[0];
    float *alphabeta_factors = (float *)part2factors;

#if 0
  Vec4 lasta = Vec4(0.0f);
  Vec4 lastb = xsum_wsum;
  Vec4 lastc = Vec4(0.0f);
#endif

    // first cluster [0,i) is at the start
    Vec4 part0 = VEC4_CONST(0.0f);
    for (int i = 0; i < count; ++i) {

    // second cluster [i,j) is one third along
    Vec4 part1 = VEC4_CONST(0.0f);
    for (int j = i;;) {

    // third cluster [j,k) is two thirds along
    // (for j == 0 the search starts at k == 1 with point 0 already in part2)
    Vec4 part2 = (j == 0) ? m_points_weights[0] : VEC4_CONST(0.0f);
    // one table row per (i, j) pair; stepped by alphabeta_dltas for each k
    Vec4 alphabeta_val = *alphabeta_inits++;
    int kmin = (j == 0) ? 1 : j;
    for (int k = kmin;;) {
	  // TODO: the inner alphabeta_sum seems always to be the same sequence
	  // one factor per (i, j, k) triple, pre-multiplied into the sums
	  Vec4 alphabeta_factor = alphabeta_val * Vec4(*alphabeta_factors++);

	  // compute least squares terms directly
	  Vec4 const alphax_sum =   MultiplyAdd(part2, onethird_onethird2, MultiplyAdd(part1, twothirds_twothirds2, part0));
	  Vec4 const  betax_sum = /*MultiplyAdd(part1, onethird_onethird2, MultiplyAdd(part2, twothirds_twothirds2, part3))*/ xsum_wsum - alphax_sum;

	  // table row layout: x = alpha^2 sum, y = beta^2 sum, z = alpha*beta sum
	  Vec4 const    alpha2_sum = alphabeta_val.SplatX();
	  Vec4 const     beta2_sum = alphabeta_val.SplatY();
	  Vec4 const alphabeta_sum = alphabeta_val.SplatZ();

	  // endpoints; the factor is already folded into alphabeta_factor
	  Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_factor.SplatZ(), alphax_sum * alphabeta_factor.SplatY());
	  Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_factor.SplatZ(),  betax_sum * alphabeta_factor.SplatX());

#if 0
	  // last cluster [k,count) is at the end
	  Vec4 part3 = xsum_wsum - part2 - part1 - part0;

	  // compute least squares terms directly
	  Vec4 const _alphax_sum = MultiplyAdd(part2, onethird_onethird2, MultiplyAdd(part1, twothirds_twothirds2, part0));
	  Vec4 const  _betax_sum = MultiplyAdd(part1, onethird_onethird2, MultiplyAdd(part2, twothirds_twothirds2, part3));
//	  Vec4 const  _betac_sum = xsum_wsum - _alphax_sum;

	  Vec4 const _alpha2_sum = _alphax_sum.SplatW();
	  Vec4 const  _beta2_sum =  _betax_sum.SplatW();

	  Vec4 const _alphabeta_sum = twonineths * (part1 + part2).SplatW();

	  // compute the least-squares optimal points
	  Vec4 _factor = Reciprocal(NegativeMultiplySubtract(_alphabeta_sum, _alphabeta_sum, _alpha2_sum * _beta2_sum));
	  Vec4 _a = NegativeMultiplySubtract( _betax_sum, _alphabeta_sum, _alphax_sum *  _beta2_sum) * _factor;
	  Vec4 _b = NegativeMultiplySubtract(_alphax_sum, _alphabeta_sum,  _betax_sum * _alpha2_sum) * _factor;

#undef	limit
#define	limit 2e-5
	  assert(fabs(_alpha2_sum.W() - alpha2_sum.X()) < limit);
	  assert(fabs(_beta2_sum.W() - beta2_sum.X()) < limit);
	  assert(fabs(_alphabeta_sum.W() - alphabeta_sum.X()) < limit);

	  if (alphabeta_factors[-1] != FLT_MAX) {
	    assert(fabs(_factor.W() - alphabeta_factors[-1]) < limit);

	    assert(fabs(_alpha2_sum.W()    * _factor.W() - alphabeta_factor.X()) < limit);
	    assert(fabs(_beta2_sum.W()     * _factor.W() - alphabeta_factor.Y()) < limit);
	    assert(fabs(_alphabeta_sum.W() * _factor.W() - alphabeta_factor.Z()) < limit);

	    assert(fabs(a.X() - _a.X()) < limit);
	    assert(fabs(a.Y() - _a.Y()) < limit);
	    assert(fabs(a.Z() - _a.Z()) < limit);

	    assert(fabs(b.X() - _b.X()) < limit);
	    assert(fabs(b.Y() - _b.Y()) < limit);
	    assert(fabs(b.Z() - _b.Z()) < limit);
	  }

#if 0
	  fprintf(stderr, "{%.9ff},", _factor.W());
	  if (k == kmin)
	    fprintf(stderr, "{%.9f, %.9f, %.9f},\n", alpha2_sum.W(), beta2_sum.W(), alphabeta_sum.W());
	  fprintf(stderr, "{%.9f/*%.9f*/,%.9f/*%.9f*/,%.9f,%.9f},\n",
	    alpha2_sum.W(), lasta.W() - alpha2_sum.W(),
	    beta2_sum.W(), lastb.W() - beta2_sum.W(),
	    alphabeta_sum.W(), lastc.W() - alphabeta_sum.W(),
	    factor.W());

	  lasta = alpha2_sum;
	  lastb = beta2_sum;
	  lastc = alphabeta_sum;
#endif
#endif

	  // snap floating-point-values to the integer-lattice
	  a = Truncate(a * 255.0f) * (1.0f / 255.0f);
	  b = Truncate(b * 255.0f) * (1.0f / 255.0f);

	  // compute the error (we skip the constant xxsum)
	  Vec4 e1 = MultiplyAdd(a * a, alpha2_sum, b * b * beta2_sum);
	  Vec4 e2 = NegativeMultiplySubtract(a, alphax_sum, a * b * alphabeta_sum);
	  Vec4 e3 = NegativeMultiplySubtract(b, betax_sum, e2);
	  Vec4 e4 = MultiplyAdd(two, e3, e1);

	  // convert the error vector to Scr4 for the comparison below
	  // NOTE(review): no metric is applied here; how the Vec4 -> Scr4
	  // conversion combines the lanes is defined elsewhere - confirm.
	  Scr4 eS = e4;

	  // keep the solution if it wins
	  if (besterror > eS) {
	    besterror = eS;

	    beststart = a;
	    bestend = b;
	    bestiteration = iterationIndex;

	    besti = i;
	    bestj = j;
	    bestk = k;
	  }

      // step the table row to the next k
      alphabeta_val += alphabeta_dltas;

      // advance
      if (k == count) break;
      part2 += m_points_weights[k]; ++k; }

      // advance
      if (j == count) break;
      part1 += m_points_weights[j]; ++j; }

      // advance
      part0 += m_points_weights[i];
    }

    // stop if we didn't improve in this iteration
    if (bestiteration != iterationIndex)
      break;

    // advance if possible
    ++iterationIndex;
    if (iterationIndex == m_iterationCount)
      break;

    // stop if a new iteration is an ordering that has already been tried
    Vec3 axis = (bestend - beststart).GetVec3();
    if (!ConstructOrdering(axis, iterationIndex))
      break;
  }

  // save the block if necessary
  if (besterror < m_besterror) {
    // save the error
    m_besterror = besterror;

    // remap the indices
    // (m_order is read as 16 bytes per iteration; pick the winning ordering)
    u8 const* order = (u8*)m_order + 16 * bestiteration;

    // clusters map to the codes 0, 2, 3, 1 in order
    u8 unordered[16];
    for (int m =     0; m < besti; ++m)
      unordered[order[m]] = 0;
    for (int m = besti; m < bestj; ++m)
      unordered[order[m]] = 2;
    for (int m = bestj; m < bestk; ++m)
      unordered[order[m]] = 3;
    for (int m = bestk; m < count; ++m)
      unordered[order[m]] = 1;

    // bestindices is filled by RemapIndices; it is not initialised before this call
    m_bitones->RemapIndices(unordered, bestindices);

    // save the block
    WriteBitoneBlock4(beststart.GetVec3(), bestend.GetVec3(), bestindices, block);
  }
}
Exemplo n.º 7
0
void BitoneClusterFit::ClusterFit4(void* block)
{
  // declare variables
  int const count = m_bitones->GetCount();
  Vec4 const two = VEC4_CONST(2.0f);
  Vec4 const one = VEC4_CONST(1.0f);

  Vec4 const onethird_onethird2  (1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 3.0f, 1.0f / 9.0f);
  Vec4 const twothirds_twothirds2(2.0f / 3.0f, 2.0f / 3.0f, 2.0f / 3.0f, 4.0f / 9.0f);
  Vec4 const twonineths                                     = VEC4_CONST(2.0f / 9.0f);

  assume((count > 0) && (count <= 16));

  // prepare an ordering using the principle axis
  ConstructOrdering(m_principle, 0);

  // check all possible clusters and iterate on the total order
  Vec4 beststart = VEC4_CONST(0.0f);
  Vec4 bestend = VEC4_CONST(0.0f);
  Scr4 besterror = m_besterror;
  u8 bestindices[16];
  int bestiteration = 0;
  int besti = 0, bestj = 0, bestk = 0;

  // loop over iterations (we avoid the case that all points in first or last cluster)
  for (int iterationIndex = 0;;) {
    // first cluster [0,i) is at the start
    Vec4 part0 = VEC4_CONST(0.0f);
    for (int i = 0; i < count; ++i) {
      // second cluster [i,j) is one third along
      Vec4 part1 = VEC4_CONST(0.0f);
      for (int j = i;;) {
	// third cluster [j,k) is two thirds along
	Vec4 part2 = (j == 0) ? m_points_weights[0] : VEC4_CONST(0.0f);
	int kmin = (j == 0) ? 1 : j;
	for (int k = kmin;;) {
	  // last cluster [k,count) is at the end
	  Vec4 part3 = m_xsum_wsum - part2 - part1 - part0;

	  // compute least squares terms directly
	  Vec4 const alphax_sum = MultiplyAdd(part2, onethird_onethird2, MultiplyAdd(part1, twothirds_twothirds2, part0));
	  Vec4 const  betax_sum = MultiplyAdd(part1, onethird_onethird2, MultiplyAdd(part2, twothirds_twothirds2, part3));

	  Vec4 const alpha2_sum = alphax_sum.SplatW();
	  Vec4 const  beta2_sum =  betax_sum.SplatW();

	  Vec4 const alphabeta_sum = twonineths * (part1 + part2).SplatW();

	  // compute the least-squares optimal points
	  Vec4 factor = Reciprocal(NegativeMultiplySubtract(alphabeta_sum, alphabeta_sum, alpha2_sum * beta2_sum));
	  Vec4 a = NegativeMultiplySubtract( betax_sum, alphabeta_sum, alphax_sum *  beta2_sum) * factor;
	  Vec4 b = NegativeMultiplySubtract(alphax_sum, alphabeta_sum,  betax_sum * alpha2_sum) * factor;

	  // snap floating-point-values to the integer-lattice
	  a = Truncate(a * 255.0f) * (1.0f / 255.0f);
	  b = Truncate(b * 255.0f) * (1.0f / 255.0f);

	  // compute the error (we skip the constant xxsum)
	  Vec4 e1 = MultiplyAdd(a * a, alpha2_sum, b * b * beta2_sum);
	  Vec4 e2 = NegativeMultiplySubtract(a, alphax_sum, a * b * alphabeta_sum);
	  Vec4 e3 = NegativeMultiplySubtract(b, betax_sum, e2);
	  Vec4 e4 = MultiplyAdd(two, e3, e1);

	  // apply the metric to the error term
	  Scr4 eS = e4;

	  // keep the solution if it wins
	  if (besterror > eS) {
	    besterror = eS;
	    beststart = a;
	    bestend = b;
	    besti = i;
	    bestj = j;
	    bestk = k;
	    bestiteration = iterationIndex;
	  }

	  // advance
	  if (k == count) break;
	  part2 += m_points_weights[k]; ++k;
	}

	// advance
	if (j == count) break;
	part1 += m_points_weights[j]; ++j;
      }

      // advance
      part0 += m_points_weights[i];
    }

    // stop if we didn't improve in this iteration
    if (bestiteration != iterationIndex)
      break;

    // advance if possible
    ++iterationIndex;
    if (iterationIndex == m_iterationCount)
      break;

    // stop if a new iteration is an ordering that has already been tried
    Vec3 axis = (bestend - beststart).GetVec3();
    if (!ConstructOrdering(axis, iterationIndex))
      break;
  }

  // save the block if necessary
  if (besterror < m_besterror) {
    // save the error
    m_besterror = besterror;

    // remap the indices
    u8 const* order = (u8*)m_order + 16 * bestiteration;

    u8 unordered[16];
    for (int m =     0; m < besti; ++m)
      unordered[order[m]] = 0;
    for (int m = besti; m < bestj; ++m)
      unordered[order[m]] = 2;
    for (int m = bestj; m < bestk; ++m)
      unordered[order[m]] = 3;
    for (int m = bestk; m < count; ++m)
      unordered[order[m]] = 1;

    m_bitones->RemapIndices(unordered, bestindices);

    // save the block
    WriteBitoneBlock4(beststart.GetVec3(), bestend.GetVec3(), bestindices, block);
  }
}