double SymmetricUncertaintyCalculator::_calculateEntropy(const DataFrame& df, int factorIndex) { typedef HashMap<int, int> ClassCounts; ClassCounts cc; for(unsigned int i = 0; i < df.getNumDataVectors(); i++) { double v = df.getDataElement(i, factorIndex); // null values are not supported Use the DataFrameDiscretizer to "fix" nulls if (DataFrame::isNull(v) == true) { throw Tgs::Exception("Null values are not supported by SymmetricUncertaintyCalculator"); } cc[(int)(v + .5)]++; } double sum = 0.0; double totalSize = df.getNumDataVectors(); for (ClassCounts::const_iterator classIt = cc.begin(); classIt != cc.end(); classIt++) { double count = classIt->second; sum += count / totalSize * log2(count / totalSize); } return -sum; }
double SymmetricUncertaintyCalculator::_calculateConditionalEntropy(const DataFrame& dfY, int factorIndexY, const DataFrame& dfX, int factorIndexX) { CondClassCounts ccc; ClassCounts cc; for(unsigned int i = 0; i < dfX.getNumDataVectors(); i++) { double vx = dfX.getDataElement(i, factorIndexX); if (DataFrame::isNull(vx) == true) { throw Tgs::Exception("Null values are not supported by SymmetricUncertaintyCalculator"); } int ex = (int)(vx + 0.5); // x enumeration double vy = dfY.getDataElement(i, factorIndexY); if (DataFrame::isNull(vy) == true) { throw Tgs::Exception("Null values are not supported by SymmetricUncertaintyCalculator"); } int ey = (int)(vy + 0.5); // y enumeration ccc[ex][ey]++; cc[ex]++; } double sumX = 0.0; double totalSize = dfX.getNumDataVectors(); for (CondClassCounts::const_iterator condIt = ccc.begin(); condIt != ccc.end(); condIt++) { const ClassCounts& classCounts = condIt->second; double px = (double)cc[condIt->first] / totalSize; // p(x) double sumY = 0.0; for (ClassCounts::const_iterator classIt = classCounts.begin(); classIt != classCounts.end(); classIt++) { double count = classIt->second; double pyx = count / (double)cc[condIt->first]; // p(y | x) sumY += pyx * log2(pyx); } sumX += px * sumY; } return -sumX; }
/**
 * Computes the principal components of the given data frame and stores them in _components.
 *
 * The data is arranged with one row per factor and one column per data vector, centered on
 * the per-factor mean, and the covariance matrix (normalized by the number of data vectors)
 * is decomposed with Jacobi::jacobi. After _sortEigens has reordered the result, component v
 * is column v of the eigenvector matrix.
 *
 * A warning is printed to stdout when there are more than two factors.
 *
 * @param df the data frame to analyze; every factor must be numeric and free of nulls
 * @throws Tgs::Exception if any factor is nominal, if any element is null, or if the matrix
 *   computation fails with something that is not a std::exception
 */
void PrincipalComponentsAnalysis::compute(DataFrame& df)
{
  const unsigned int numFactors = df.getNumFactors();
  const unsigned int numVectors = df.getNumDataVectors();

  if (numFactors > 2)
  {
    // see PrincipalComponentsAnalysisTest
    std::cout << "You realize this hasn't been tested, right?" << std::endl;
  }

  Matrix dataMat(numFactors, numVectors);
  Matrix deviates(numFactors, numVectors);
  SymmetricMatrix covar(numFactors);
  DiagonalMatrix eigenValues(numFactors);
  Matrix eigenVectors;
  ColumnVector means(numFactors);
  means = 0.0;
  // Row of ones; (means * h) repeats the mean column once per data vector.
  RowVector h(numVectors);
  h = 1.0;

  for (unsigned int j = 0; j < numFactors; j++)
  {
    if (df.isNominal(j))
    {
      throw Tgs::Exception("Only numeric values are supported.");
    }
  }

  // Copy the data in transposed form (factor x vector) and accumulate the per-factor means.
  for (unsigned int i = 0; i < numVectors; i++)
  {
    for (unsigned int j = 0; j < numFactors; j++)
    {
      double v = df.getDataElement(i, j);
      if (df.isNull(v))
      {
        throw Tgs::Exception("Only non-null values are supported.");
      }
      dataMat.element(j, i) = v;
      means.element(j) += v / (double)numVectors;
    }
  }

  try
  {
    deviates = dataMat - (means * h);
    // Normalize in double precision, matching the mean computation above. A float cannot
    // represent every count above 2^24, which would silently skew the covariance scale.
    covar << (1.0 / (double)numVectors) * (deviates * deviates.t());
    Jacobi::jacobi(covar, eigenValues, eigenVectors);
  }
  catch (const std::exception&)
  {
    // Standard exceptions pass through untouched.
    throw;
  }
  catch (...)
  {
    // Anything else is reported as a Tgs::Exception.
    throw Tgs::Exception("Unknown error while calculating PCA");
  }

  _sortEigens(eigenVectors, eigenValues);

  _components.resize(numFactors);
  for (unsigned int v = 0; v < numFactors; v++)
  {
    _components[v].resize(numFactors);
    for (unsigned int d = 0; d < numFactors; d++)
    {
      _components[v][d] = eigenVectors.element(d, v);
    }
  }
}