// Evaluates candidate split thresholds for a classification node.
// In:  dataset, responses = (response value, sample index) pairs sorted ascending
//      by response, use_gini = non-zero -> Gini impurity, zero -> Shannon entropy.
// Out: score_and_threshold = (best impurity score, best threshold).
// Returns true iff at least one valid split (both children non-empty) was found.
bool SplitEvaluatorMLClass<Sample, TAppContext>::CalculateEntropyAndThreshold(DataSet<Sample, LabelMLClass>& dataset, std::vector<std::pair<double, int> > responses, std::pair<double, double>& score_and_threshold, int use_gini)
{
	// Initialize the counters: per-class weight masses and totals of the left (L)
	// and right (R) children, plus the running best score (lower is better).
	double DGini, LGini, RGini, LTotal = 0.0, RTotal = 0.0, bestThreshold = 0.0, bestDGini = 1e16;
	vector<double> LCount(m_appcontext->num_classes, 0.0), RCount(m_appcontext->num_classes, 0.0);
	bool found = false;

	// Draw random thresholds uniformly within the response range and sort them
	// (randDouble() is a project helper; presumably returns values in [0,1) — TODO confirm).
	double min_response = responses[0].first;
	double max_response = responses[responses.size()-1].first;
	double d = (max_response - min_response);
	vector<double> random_thresholds(m_appcontext->num_node_thresholds, 0.0);
	for (int i = 0; i < random_thresholds.size(); i++)
	{
		random_thresholds[i] = (randDouble() * d) + min_response;
	}
	sort(random_thresholds.begin(), random_thresholds.end());

	// First, put everything in the right node
	for (int r = 0; r < responses.size(); r++)
	{
		int labelIdx = dataset[responses[r].second]->m_label.class_label;
		double sample_w = dataset[responses[r].second]->m_label.class_weight;
		RCount[labelIdx] += sample_w;
		RTotal += sample_w;
	}

	// Now, iterate all responses (in ascending order) and evaluate the impurity
	// at each cutoff point (threshold). Samples are moved left incrementally, so
	// at any evaluation the left child holds all samples with smaller responses.
	int th_idx = 0;
	bool stop_search = false;
	for (int r = 0; r < responses.size(); r++)
	{
		// If the current sample is below the current threshold, move its weight
		// from the right child to the left child.
		if (responses[r].first <= random_thresholds[th_idx])
		{
			double cur_sample_weight = dataset[responses[r].second]->m_label.class_weight;
			RTotal -= cur_sample_weight;
			if (RTotal < 0.0)
				RTotal = 0.0; // clamp: guards against negative mass from rounding errors
			LTotal += cur_sample_weight;
			int labelIdx = dataset[responses[r].second]->m_label.class_label;
			RCount[labelIdx] -= cur_sample_weight;
			if (RCount[labelIdx] < 0.0)
				RCount[labelIdx] = 0.0; // clamp for the same rounding reason
			LCount[labelIdx] += cur_sample_weight;
		}
		else
		{
			// First sample with a response above the current threshold: the
			// current partition is a valid split for this threshold — score it.
			LGini = 0.0, RGini = 0.0;
			if (use_gini)
			{
				// Gini impurity: sum_c p_c * (1 - p_c) per child.
				for (int c = 0; c < LCount.size(); c++)
				{
					double pL = LCount[c]/LTotal, pR = RCount[c]/RTotal;
					if (LCount[c] >= 1e-10) // epsilon guard against rounding errors (also skips p=0 terms)
						LGini += pL * (1.0 - pL);
					if (RCount[c] >= 1e-10)
						RGini += pR * (1.0 - pR);
				}
			}
			else
			{
				// Shannon entropy: -sum_c p_c * log(p_c) per child.
				for (int c = 0; c < LCount.size(); c++)
				{
					double pL = LCount[c]/LTotal, pR = RCount[c]/RTotal;
					if (LCount[c] >= 1e-10) // epsilon guard: avoids log(0) and rounding noise
						LGini -= pL * log(pL);
					if (RCount[c] >= 1e-10)
						RGini -= pR * log(pR);
				}
			}
			// Weighted average of the children's impurities; lower is better.
			DGini = (LTotal*LGini + RTotal*RGini)/(LTotal + RTotal);
			if (DGini < bestDGini && LTotal > 0.0 && RTotal > 0.0)
			{
				bestDGini = DGini;
				bestThreshold = random_thresholds[th_idx];
				found = true;
			}
			// Next, advance to the next random threshold that is larger than the
			// current response — there might be several thresholds within the gap
			// between the previous response and this one.
			while (responses[r].first > random_thresholds[th_idx])
			{
				if (th_idx < (random_thresholds.size()-1))
				{
					th_idx++;
					// Decrement r so the for-loop's r++ re-examines the CURRENT
					// sample against the newly selected threshold.
					r--;
				}
				else
				{
					stop_search = true;
					break; // all thresholds tested
				}
			}
			// now, we can go on with the next response ...
		}
		if (stop_search)
			break;
	}
	score_and_threshold.first = bestDGini;
	score_and_threshold.second = bestThreshold;
	return found;
}
// Evaluates candidate split thresholds for a multivariate regression node.
// In:  dataset, responses = (response value, sample index) pairs sorted ascending
//      by response.
// Out: score_and_threshold = (best score, best threshold), where the score is the
//      children's weighted multivariate-normal plug-in entropy estimate
//      log(det(Cov)) — lower is better.
// Returns true iff at least one valid split (both children non-empty) was found.
//
// NOTE(review): this rewrite removes the former LSamples/RSamples bookkeeping,
// which erased from the front of a vector per moved sample (accidental O(n^2))
// and was also the target of an unreachable debug dump. Because responses are
// processed in sorted order, at evaluation time the left child is exactly
// responses[0..r-1] and the right child responses[r..n-1]; the covariance loops
// below iterate those ranges in the same order, so sums are bit-identical.
bool SplitEvaluatorMLRegr<Sample>::CalculateMVNPluginAndThreshold(DataSet<Sample, LabelMLRegr>& dataset, std::vector<std::pair<double, int> > responses, std::pair<double,double>& score_and_threshold)
{
	// Initialize the variables and counters
	double InfoGain, LEntropy, REntropy, bestThreshold = 0.0, BestInfoGain = 1e16;
	double LTotal = 0.0, RTotal = 0.0, LSqNormTotal = 0.0, RSqNormTotal = 0.0;
	VectorXd RMean = VectorXd::Zero(m_appcontext->num_target_variables);
	VectorXd LMean = VectorXd::Zero(m_appcontext->num_target_variables);
	VectorXd RSum = VectorXd::Zero(m_appcontext->num_target_variables);
	VectorXd LSum = VectorXd::Zero(m_appcontext->num_target_variables);
	MatrixXd LCov = MatrixXd::Zero(m_appcontext->num_target_variables, m_appcontext->num_target_variables);
	MatrixXd RCov = MatrixXd::Zero(m_appcontext->num_target_variables, m_appcontext->num_target_variables);
	bool found = false;

	// Draw random thresholds uniformly within the response range and sort them.
	double min_response = responses[0].first;
	double max_response = responses[responses.size()-1].first;
	double d = (max_response - min_response);
	vector<double> random_thresholds(m_appcontext->num_node_thresholds, 0.0);
	for (int i = 0; i < (int)random_thresholds.size(); i++)
		random_thresholds[i] = (randDouble() * d) + min_response;
	sort(random_thresholds.begin(), random_thresholds.end());

	// First, put everything (weighted target sum and total weight) in the right node.
	for (int r = 0; r < (int)responses.size(); r++)
	{
		double csw = dataset[responses[r].second]->m_weight;
		Eigen::VectorXd cst = dataset[responses[r].second]->m_label.regr_target;
		RSum += csw * cst;
		RTotal += csw;
	}
	RMean = RSum / RTotal;

	// Iterate all responses (ascending) and score the partition at each cutoff.
	// INVARIANT: when sample r is inspected, samples 0..r-1 are in the left child
	// and samples r..n-1 are in the right child; LSum/LTotal and RSum/RTotal
	// mirror this partition.
	int th_idx = 0;
	bool stop_search = false;
	for (int r = 0; r < (int)responses.size(); r++)
	{
		// If the current sample is below the current threshold, move its
		// weighted contribution from the right child to the left child.
		if (responses[r].first <= random_thresholds[th_idx])
		{
			double csw = dataset[responses[r].second]->m_weight;
			Eigen::VectorXd cst = dataset[responses[r].second]->m_label.regr_target;
			RSum -= csw * cst;
			RTotal -= csw;
			if (RTotal < 0.0)
				RTotal = 0.0; // clamp: guards against negative mass from rounding errors
			LSum += csw * cst;
			LTotal += csw;
		}
		else
		{
			// First sample above the current threshold: score this valid split,
			// but only if both children carry weight (avoids division by zero).
			if (LTotal > 0.0 && RTotal > 0.0)
			{
				// RIGHT child = responses[r..n-1]: weighted mean, then weighted
				// covariance with the reliability-weight bias correction
				// 1 / (1 - sum_s (w_s/W)^2).
				RMean = RSum / RTotal;
				RCov = MatrixXd::Zero(m_appcontext->num_target_variables, m_appcontext->num_target_variables);
				RSqNormTotal = 0.0;
				for (int s = r; s < (int)responses.size(); s++)
				{
					Eigen::VectorXd cst = dataset[responses[s].second]->m_label.regr_target;
					RCov += dataset[responses[s].second]->m_weight * ((cst - RMean) * (cst - RMean).transpose());
					RSqNormTotal += pow(dataset[responses[s].second]->m_weight/RTotal, 2.0);
				}
				RCov /= RTotal;
				if (RSqNormTotal < 1.0)
					RCov /= (1.0 - RSqNormTotal);
				double RCovDet = RCov.determinant();
				if (RCovDet <= 0.0)
					RCovDet = 1e-10; // floor: keeps log() finite for degenerate covariances
				REntropy = log(RCovDet);
				if (REntropy <= 0.0)
					REntropy = 0.0; // clamp negative entropy estimates to zero

				// LEFT child = responses[0..r-1]: same computation.
				LMean = LSum / LTotal;
				LCov = MatrixXd::Zero(m_appcontext->num_target_variables, m_appcontext->num_target_variables);
				LSqNormTotal = 0.0;
				for (int s = 0; s < r; s++)
				{
					Eigen::VectorXd cst = dataset[responses[s].second]->m_label.regr_target;
					LCov += dataset[responses[s].second]->m_weight * ((cst - LMean) * (cst - LMean).transpose());
					LSqNormTotal += pow(dataset[responses[s].second]->m_weight/LTotal, 2.0);
				}
				LCov /= LTotal;
				if (LSqNormTotal < 1.0)
					LCov /= (1.0 - LSqNormTotal);
				double LCovDet = LCov.determinant();
				if (LCovDet <= 0.0)
					LCovDet = 1e-10;
				LEntropy = log(LCovDet);
				if (LEntropy <= 0.0)
					LEntropy = 0.0;

				// Combine left and right entropy measures, weighted by child mass.
				InfoGain = (LTotal*LEntropy + RTotal*REntropy) / (LTotal + RTotal);
				if (this->m_appcontext->debug_on)
					cout << "Eval: " << InfoGain << ", LTotal=" << LTotal << ", RTotal=" << RTotal << "(" << LEntropy << ", " << REntropy << ")" << endl;
				if (InfoGain < BestInfoGain)
				{
					BestInfoGain = InfoGain;
					bestThreshold = random_thresholds[th_idx];
					found = true;
				}
			}
			// Next, advance to the next random threshold that is larger than the
			// current response — there might be several thresholds within the gap
			// between the previous response and this one.
			while (responses[r].first > random_thresholds[th_idx])
			{
				if (th_idx < (int)(random_thresholds.size()-1))
				{
					th_idx++;
					// Decrement r so the for-loop's r++ re-examines the CURRENT
					// sample against the newly selected threshold.
					r--;
				}
				else
				{
					stop_search = true;
					break; // all thresholds tested
				}
			}
			// now, we can go on with the next response ...
		}
		if (stop_search)
			break;
	}
	score_and_threshold.first = BestInfoGain;
	score_and_threshold.second = bestThreshold;
	return found;
}
// Evaluates candidate split thresholds for a classification node using the
// application-specific loss (ComputeLoss with m_appcontext->global_loss_classification).
// In:  dataset, responses = (response value, sample index) pairs sorted ascending
//      by response.
// Out: score_and_threshold = (best combined loss, best threshold).
// Returns true iff at least one valid split (both children non-empty) was found.
//
// NOTE(review): this rewrite drops the former RSamples/LSamples vectors — they
// were maintained via an O(n) front-erase per moved sample (accidental O(n^2))
// yet never read — and the unused locals 'margin' and 'TotalWeight'. It also
// hoists the "both children non-empty" guard (previously only on the best-split
// update) around the whole loss evaluation, so empty children no longer produce
// NaNs from division by a zero total weight; outputs are unchanged.
bool SplitEvaluatorMLClass<Sample, TAppContext>::CalculateSpecificLossAndThreshold(DataSet<Sample, LabelMLClass>& dataset, std::vector<std::pair<double, int> > responses, std::pair<double, double>& score_and_threshold)
{
	// 1) Draw random thresholds uniformly within the response range and sort them.
	double min_response = responses[0].first;
	double max_response = responses[responses.size()-1].first;
	double d = (max_response - min_response);
	vector<double> random_thresholds(m_appcontext->num_node_thresholds, 0.0);
	for (int i = 0; i < (int)random_thresholds.size(); i++)
		random_thresholds[i] = (randDouble() * d) + min_response;
	sort(random_thresholds.begin(), random_thresholds.end());

	// Per-class weight masses and totals of the left (L) and right (R) children.
	vector<double> RClassWeights(m_appcontext->num_classes, 0.0);
	vector<double> LClassWeights(m_appcontext->num_classes, 0.0);
	double RTotalWeight = 0.0;
	double LTotalWeight = 0.0;
	double RLoss = 0.0, LLoss = 0.0;
	double BestLoss = 1e16, CombinedLoss = 0.0, BestThreshold = 0.0;
	bool found = false;

	// First, put everything in the right node.
	for (int r = 0; r < (int)responses.size(); r++)
	{
		int labelIdx = dataset[responses[r].second]->m_label.class_label;
		double sample_w = dataset[responses[r].second]->m_label.class_weight;
		RClassWeights[labelIdx] += sample_w;
		RTotalWeight += sample_w;
	}

	// Iterate all responses (ascending) and evaluate the loss at each cutoff.
	int th_idx = 0;
	bool stop_search = false;
	for (int r = 0; r < (int)responses.size(); r++)
	{
		// If the current sample is below the current threshold, move its weight
		// from the right child to the left child.
		if (responses[r].first <= random_thresholds[th_idx])
		{
			int labelIdx = dataset[responses[r].second]->m_label.class_label;
			double cur_sample_weight = dataset[responses[r].second]->m_label.class_weight;
			RClassWeights[labelIdx] -= cur_sample_weight;
			if (RClassWeights[labelIdx] < 0.0)
				RClassWeights[labelIdx] = 0.0; // clamp: guards against rounding errors
			LClassWeights[labelIdx] += cur_sample_weight;
			RTotalWeight -= cur_sample_weight;
			if (RTotalWeight < 0.0)
				RTotalWeight = 0.0; // clamp for the same rounding reason
			LTotalWeight += cur_sample_weight;
		}
		else
		{
			// First sample above the current threshold: score this valid split,
			// but only if both children carry weight (the normalizations below
			// divide by the child totals).
			if (LTotalWeight > 0.0 && RTotalWeight > 0.0)
			{
				RLoss = 0.0, LLoss = 0.0;
				// RIGHT child: class posteriors, then weighted loss.
				vector<double> pR(RClassWeights.size());
				for (int ci = 0; ci < (int)RClassWeights.size(); ci++)
					pR[ci] = RClassWeights[ci] / RTotalWeight;
				for (int ci = 0; ci < (int)RClassWeights.size(); ci++)
					RLoss += RClassWeights[ci] * ComputeLoss(pR, ci, m_appcontext->global_loss_classification);
				// LEFT child: same computation.
				vector<double> pL(LClassWeights.size());
				for (int ci = 0; ci < (int)LClassWeights.size(); ci++)
					pL[ci] = LClassWeights[ci] / LTotalWeight;
				for (int ci = 0; ci < (int)LClassWeights.size(); ci++)
					LLoss += LClassWeights[ci] * ComputeLoss(pL, ci, m_appcontext->global_loss_classification);
				// Total loss of the split; lower is better.
				CombinedLoss = LLoss + RLoss;
				if (CombinedLoss < BestLoss)
				{
					BestLoss = CombinedLoss;
					BestThreshold = random_thresholds[th_idx];
					found = true;
				}
			}
			// Next, advance to the next random threshold that is larger than the
			// current response — there might be several thresholds within the gap
			// between the previous response and this one.
			while (responses[r].first > random_thresholds[th_idx])
			{
				if (th_idx < (int)(random_thresholds.size()-1))
				{
					th_idx++;
					// Decrement r so the for-loop's r++ re-examines the CURRENT
					// sample against the newly selected threshold.
					r--;
				}
				else
				{
					stop_search = true;
					break; // all thresholds tested
				}
			}
			// now, we can go on with the next response ...
		}
		if (stop_search)
			break;
	}
	score_and_threshold.first = BestLoss;
	score_and_threshold.second = BestThreshold;
	return found;
}