void CfsSubsetEvaluator::setDataFrame(const DataFrame& dataFrame, TgsProgress* progress) { TgsProgress* discretizeProgress = NULL; TgsProgress* classCorrelationProgress = NULL; TgsProgress* correlationMatrixProgress = NULL; if (progress) { discretizeProgress = progress->createTgsChild("Discretizing Data Frame", .5); classCorrelationProgress = progress->createTgsChild("Calculating Class Correlation", .1); correlationMatrixProgress = progress->createTgsChild("Calculating Correlation Matrix", .4); } _df = dataFrame; DataFrameDiscretizer dfd; dfd.discretize(_df, discretizeProgress); _calculateClassCorrelations(classCorrelationProgress); _calculateCorrelationMatrix(correlationMatrixProgress); // cout << "class corr: " << _classCorr << endl; // cout << "corr matrix: " << endl; // for (unsigned int i = 0; i < _corrMatrix.size(); i++) // { // cout << _corrMatrix[i] << endl; // } _df.clear(); }
double FeatureScoreFitnessFunction::calculateFitness(const Genome& genome) { const CalculatorGenome& cg = dynamic_cast<const CalculatorGenome&>(genome); _workingCopy->setFactorType(0, DataFrame::Numerical); for (unsigned int i = 0; i < _workingCopy->getNumDataVectors(); i++) { _workingCopy->setDataElement(i, 0, cg.calculateValue(i)); } DataFrameDiscretizer dfd; try { dfd.discretize(*_workingCopy); } catch (const Tgs::Exception&) { // probably because of no valid values. return 0.0; } double score = _fsc->calculateScore(*_workingCopy, -1, *_workingCopy, 0); int nodeCount = cg.countNodes(); if (nodeCount > 10) { score *= pow(.95, nodeCount - 10); } return score; }
void ConsistencySubsetEvaluator::setDataFrame(const DataFrame& dataFrame, TgsProgress* progress) { _dataFrame = dataFrame; DataFrameDiscretizer dfd; dfd.discretize(_dataFrame, progress); _enumCnt = 0; for (unsigned int i = 0; i < _dataFrame.getNumDataVectors(); i++) { string className = _dataFrame.getTrainingLabel(i); if (_enumMap.find(className) == _enumMap.end()) { _enumMap[className] = _enumCnt++; } } _putDataIntoBins(); }
double CfsFitnessFunction::calculateFitness(const Genome& genome) { const CalculatorGenome& cg = dynamic_cast<const CalculatorGenome&>(genome); _workingCopy->setFactorType(_workingFactor, DataFrame::Numerical); double start = Time::getTime(); //cout << cg.toString() << endl; // one tile per 3 minute const double MIN_EVAL_PER_SECOND = 250000 / 180; for (unsigned int i = 0; i < _workingCopy->getNumDataVectors(); i++) { double elapsed = Time::getTime() - start; if (elapsed >= 0.5) { double speed = i / elapsed; cout << "speed " << speed << "\r"; if (speed < MIN_EVAL_PER_SECOND || elapsed >= 10) { //cout << endl << "Too slow: " << cg.toString() << endl; return -1e9; } cout.flush(); } double v; v = cg.calculateValue(_workingUids[i]); _workingCopy->setDataElement(i, _workingFactor, v); } DataFrameDiscretizer dfd; try { dfd.discretize(*_workingCopy); } catch (const DataFrameDiscretizer::AllNullsException&) { // because of no valid values. return -1e9; } CfsSubsetEvaluator cse; cse.setDataFrame(*_workingCopy); std::vector<int> v; v.resize(_workingCopy->getNumFactors()); for (unsigned int i = 0; i < _workingCopy->getNumFactors(); i++) { v[i] = i; } double score = cse.evaluateSubset(v) - _baseScore; int nodeCount = cg.countNodes(); // penalize for large trees const int MAX_DESIRABLE_SIZE = 10; if (nodeCount > MAX_DESIRABLE_SIZE) { double m = 1 - pow(nodeCount - MAX_DESIRABLE_SIZE, 1.4) * .05; //cout << nodeCount << ", " << m << ", " << score << ", " << score * m << endl; if (m <= 0) { score = -numeric_limits<double>::infinity(); } else if (score > 0.0) { score *= m; } else { score /= m; } } return score; }