/**
 * Walks over every factor of the given data frame and hands each non-nominal
 * factor to _discretizeColumn(). Nominal factors are skipped.
 *
 * @param df the data frame to process; its address is stored in _df.
 * @param progress optional progress sink (may be null). It receives the
 *   fraction of factors visited so far and 1.0 once the loop has finished.
 */
void DataFrameDiscretizer::discretize(DataFrame& df, TgsProgress* progress)
  {
    _df = &df;

    for (unsigned int factor = 0; factor < df.getNumFactors(); ++factor)
    {
      if (progress)
      {
        const double visited = static_cast<double>(factor);
        const double total = static_cast<double>(df.getNumFactors());
        progress->setProgress(visited / total);
      }

      // Only non-nominal factors are processed.
      if (_df->isNominal(factor))
      {
        continue;
      }

      if (_df->getNullTreatment(factor) == DataFrame::NullAsMissingValue)
      {
        // Impute nulls by randomly sampling the data; without this step the
        // nulls would end up in a category of their own.
        _replaceNulls(factor);
      }
      _discretizeColumn(factor);
    }

    if (progress)
    {
      progress->setProgress(1.0);
    }
  }
  /**
   * Backward stepwise search for the best scoring subset of factors.
   *
   * Starts with every factor of df, scores that set with the feature subset
   * evaluator (_fse) and then repeatedly calls _removeWorst() until at most one
   * column is left. The highest scoring column set seen along the way is
   * returned; on a tie the later (smaller) set wins.
   *
   * Only the Backward direction is implemented. This is asserted; in builds
   * where the assert is compiled out, any other direction yields an empty
   * result.
   *
   * @param df the data frame whose factors are searched.
   * @param p optional progress sink (may be null). It is split into two equally
   *   weighted children, one passed to _fse->setDataFrame() and one stored in
   *   _progress, and is set to 1.0 before returning.
   * @return the labels of the factors in the best scoring subset.
   */
  std::vector<std::string> GreedyStepwiseSearch::findSubset(const DataFrame& df, 
    TgsProgress* p)
  {
    TgsProgress* dataFrameProgress = NULL;
    TgsProgress* evaluateProgress = NULL;
    if (p)
    {
      dataFrameProgress = p->createTgsChild("Data Frame Processing", .5);
      evaluateProgress = p->createTgsChild("Evaluate Subsets", .5);
    }

    _fse->setDataFrame(df, dataFrameProgress);

    assert(_direction == Backward);

    _progress = evaluateProgress;
    _iteration = 0;
    // n + (n - 1) + ... + 1
    _totalIterations = (df.getNumFactors() * (df.getNumFactors() + 1)) / 2;

    vector<int> bestSolution;
    if (_direction == Backward)
    {
      // Start from the full set of factor indices.
      vector<int> columns;
      columns.reserve(df.getNumFactors());
      for (unsigned int i = 0; i < df.getNumFactors(); i++)
      {
        columns.push_back(i);
      }

      double bestScore = _fse->evaluateSubset(columns);
      bestSolution = columns;

      // The loop body always runs at least once, even for a frame with fewer
      // than two factors.
      // NOTE(review): confirm _removeWorst() tolerates that input.
      do
      {
        // presumably drops one entry from columns and returns the score of the
        // remaining set -- verify against _removeWorst().
        const double score = _removeWorst(columns);

        // >= so that, on equal scores, the smaller subset is preferred.
        if (score >= bestScore)
        {
          bestScore = score;
          bestSolution = columns;
        }
      } while (columns.size() > 1);
    }

    // Translate the winning column indices back into factor labels.
    std::vector<std::string> result;
    result.reserve(bestSolution.size());
    for (unsigned int i = 0; i < bestSolution.size(); i++)
    {
      result.push_back(df.getFactorLabelFromIndex(bestSolution[i]));
    }

    if (p)
    {
      p->setProgress(1.0);
    }

    return result;
  }
  /**
   * Computes the principal components of the data frame and stores them in
   * _components.
   *
   * The data is copied into a (factors x data vectors) matrix and the per-factor
   * means are subtracted. The covariance matrix is formed as
   * (1 / N) * deviates * deviates^T, and its eigen decomposition is obtained via
   * Jacobi::jacobi(). After _sortEigens() has been applied, _components[v] holds
   * column v of the eigenvector matrix.
   *
   * @param df the data frame to analyse; every factor must be numeric and every
   *   value non-null.
   * @throws Tgs::Exception if a factor is nominal, if a value is null, or if the
   *   matrix computation fails with something other than a std::exception
   *   (std::exceptions are propagated unchanged).
   */
  void PrincipalComponentsAnalysis::compute(DataFrame& df)
  {
    const unsigned int numFactors = df.getNumFactors();
    const unsigned int numVectors = df.getNumDataVectors();

    if (numFactors > 2)
    {
      // see PrincipalComponentsAnalysisTest
      cout << "You realize this hasn't been tested, right?" << endl;
    }
    Matrix dataMat(numFactors, numVectors);
    Matrix deviates(numFactors, numVectors);
    SymmetricMatrix covar(numFactors);
    DiagonalMatrix eigenValues(numFactors);
    Matrix eigenVectors;
    ColumnVector means(numFactors);
    means = 0.0;
    // Row of ones; means * h expands the mean column across all data vectors.
    RowVector h(numVectors);
    h = 1.0;

    for (unsigned int j = 0; j < numFactors; j++)
    {
      if (df.isNominal(j))
      {
        throw Tgs::Exception("Only numeric values are supported.");
      }
    }

    // Copy the data (transposed: one column per data vector) and accumulate the
    // per-factor means in the same pass.
    const double sampleCount = static_cast<double>(numVectors);
    for (unsigned int i = 0; i < numVectors; i++)
    {
      for (unsigned int j = 0; j < numFactors; j++)
      {
        const double v = df.getDataElement(i, j);
        if (df.isNull(v))
        {
          throw Tgs::Exception("Only non-null values are supported.");
        }
        dataMat.element(j, i) = v;
        means.element(j) += v / sampleCount;
      }
    }

    try
    {
      deviates = dataMat - (means * h);
      // Normalise in double precision; a float cast would round the sample
      // count once it exceeds 2^24.
      covar << (1.0 / sampleCount) * (deviates * deviates.t());
      Jacobi::jacobi(covar, eigenValues, eigenVectors);
    }
    catch (const std::exception&)
    {
      // Let standard exceptions through untouched so the catch-all below does
      // not replace them.
      throw;
    }
    catch (...)
    {
      throw Tgs::Exception("Unknown error while calculating PCA");
    }

    _sortEigens(eigenVectors, eigenValues);

    // Component v is column v of the eigenvector matrix.
    _components.resize(numFactors);
    for (unsigned int v = 0; v < numFactors; v++)
    {
      _components[v].resize(numFactors);
      for (unsigned int d = 0; d < numFactors; d++)
      {
        _components[v][d] = eigenVectors.element(d, v);
      }
    }
  }