/**
 * Runs discretization over every non-nominal factor of a data frame.
 *
 * The frame's address is stored in _df so the per-column helpers (_replaceNulls,
 * _discretizeColumn) can reach it. Nominal factors are skipped.
 *
 * @param df       data frame to process
 * @param progress optional progress reporter; may be NULL. It receives the fraction of
 *                 factors visited so far and is set to 1.0 once the loop has finished.
 */
void DataFrameDiscretizer::discretize(DataFrame& df, TgsProgress* progress)
{
  _df = &df;

  // The factor count is re-read every iteration, exactly as the loop bound requires.
  for (unsigned int column = 0; column < df.getNumFactors(); ++column)
  {
    if (progress)
    {
      const double total = (double)df.getNumFactors();
      progress->setProgress((double)column / total);
    }

    // Nominal factors need no discretization.
    if (_df->isNominal(column))
    {
      continue;
    }

    // Replace nulls with a random sampling of the data (imputation) when the column asks
    // for nulls to be treated as missing values; otherwise nulls get put into their own
    // category.
    const bool imputeNulls = (_df->getNullTreatment(column) == DataFrame::NullAsMissingValue);
    if (imputeNulls)
    {
      _replaceNulls(column);
    }

    _discretizeColumn(column);
  }

  if (progress)
  {
    progress->setProgress(1.0);
  }
}
std::vector<std::string> GreedyStepwiseSearch::findSubset(const DataFrame& df, TgsProgress* p) { TgsProgress* dataFrameProgress = NULL; TgsProgress* evaluateProgress = NULL; if (p) { dataFrameProgress = p->createTgsChild("Data Frame Processing", .5); evaluateProgress = p->createTgsChild("Evaluate Subsets", .5); } std::vector<std::string> result; _fse->setDataFrame(df, dataFrameProgress); assert(_direction == Backward); _progress = evaluateProgress; _iteration = 0; _totalIterations = (df.getNumFactors() * (df.getNumFactors() + 1)) / 2; double bestScore = -1e300; vector<int> bestSolution; if (_direction == Backward) { vector<int> columns; for (unsigned int i = 0; i < df.getNumFactors(); i++) { columns.push_back(i); } double score = _fse->evaluateSubset(columns); bestScore = score; bestSolution = columns; // printf("score: %.3f size: %d\n", score, columns.size()); do { score = _removeWorst(columns); // printf("score: %.3f size: %d\n", score, columns.size()); for (unsigned int i = 0; i < columns.size(); i++) { string s = df.getFactorLabelFromIndex(columns[i]); char* s2 = (char*)s.c_str(); s2[4] = 0; // printf("%s\t", s2); } // printf("\n"); if (score >= bestScore) { bestScore = score; bestSolution = columns; } } while (columns.size() > 1); } result.clear(); for (unsigned int i = 0; i < bestSolution.size(); i++) { result.push_back(df.getFactorLabelFromIndex(bestSolution[i])); } if (p) { p->setProgress(1.0); } return result; }
/**
 * Computes the principal components of a purely numeric, null-free data frame.
 *
 * The data is loaded into a factors x data-vectors matrix, the per-factor mean is
 * subtracted, the covariance matrix (scaled by 1 / number of data vectors) is built and
 * decomposed with Jacobi::jacobi. After _sortEigens() has been applied, column v of the
 * eigenvector matrix is stored as _components[v].
 *
 * A warning is printed to stdout when the frame has more than two factors.
 *
 * @param df data frame to analyze
 * @throws Tgs::Exception if any factor is nominal, if any value is null, or if a
 *         non-std::exception error escapes the matrix computations. std::exception
 *         derived errors from the matrix computations are rethrown unchanged.
 */
void PrincipalComponentsAnalysis::compute(DataFrame& df)
{
  const unsigned int numFactors = df.getNumFactors();
  const unsigned int numVectors = df.getNumDataVectors();

  if (numFactors > 2)
  {
    // see PrincipalComponentsAnalysisTest
    cout << "You realize this hasn't been tested, right?" << endl;
  }

  Matrix dataMat(numFactors, numVectors);
  Matrix deviates(numFactors, numVectors);
  SymmetricMatrix covar(numFactors);
  DiagonalMatrix eigenValues(numFactors);
  Matrix eigenVectors;

  // Per-factor means, accumulated while the data is copied.
  ColumnVector means(numFactors);
  means = 0.0;

  // Row of ones; (means * h) expands the mean column across every data vector.
  RowVector h(numVectors);
  h = 1.0;

  // Reject nominal factors before touching any data.
  for (unsigned int factor = 0; factor < numFactors; ++factor)
  {
    if (df.isNominal(factor))
    {
      throw Tgs::Exception("Only numeric values are supported.");
    }
  }

  // Copy the data (transposed: one row per factor) and accumulate the means.
  for (unsigned int row = 0; row < numVectors; ++row)
  {
    for (unsigned int factor = 0; factor < numFactors; ++factor)
    {
      const double value = df.getDataElement(row, factor);
      if (df.isNull(value))
      {
        throw Tgs::Exception("Only non-null values are supported.");
      }
      dataMat.element(factor, row) = value;
      means.element(factor) += value / static_cast<double>(numVectors);
    }
  }

  try
  {
    deviates = dataMat - (means * h);
    const double scale = 1.0 / static_cast<float>(numVectors);
    covar << scale * (deviates * deviates.t());
    Jacobi::jacobi(covar, eigenValues, eigenVectors);
  }
  catch (const std::exception&)
  {
    // Standard exceptions pass through untouched.
    throw;
  }
  catch (...)
  {
    throw Tgs::Exception("Unknown error while calculating PCA");
  }

  _sortEigens(eigenVectors, eigenValues);

  // Store each eigenvector (a column of eigenVectors) as one component.
  _components.resize(numFactors);
  for (unsigned int component = 0; component < numFactors; ++component)
  {
    _components[component].resize(numFactors);
    for (unsigned int dim = 0; dim < numFactors; ++dim)
    {
      _components[component][dim] = eigenVectors.element(dim, component);
    }
  }
}