// Find adjacent intervals with smallest $\chi^2$ IntervalList::iterator find_min_chi_chi() { // Compute $\chi^2$ value for each adjacent interval, // keeping track of minimum value IntervalList::iterator lit = g_intervals.begin(), lend = g_intervals.end(); IntervalList::iterator next = lit, min_lit = lend; float min_chisquared = 1e6; bool first = true; for ( ; lit != lend; ++lit) { next = lit; ++next; if (next == lend) break; float chisquared = compute_chisquared(*lit, *next); // Debug print_interval_set(*lit); cout << "\\chi^2 = " << chisquared << endl; if (first || chisquared < min_chisquared) { min_lit = lit; min_chisquared = chisquared; } first = false; } cout << "min_chisquared = " << min_chisquared << endl; return min_lit; }
// Write a summary of the current intervals for one feature to `os`.
//
// Prints the 1-based feature number, then the [low, high] pair that
// get_range() reports for every interval in g_intervals, and finally one
// split point per pair of neighbouring intervals: the midpoint between the
// upper bound of the left interval and the lower bound of the right one.
void print_interval_summary(ostream& os, int dimIndex)
{
    vector<DataType> split_points;

    os << "\nFeature " << (dimIndex + 1) << ":" << endl;
    os << "Ranges: ";

    const IntervalList::iterator stop = g_intervals.end();
    IntervalList::iterator cur = g_intervals.begin();
    while (cur != stop) {
        IntervalList::iterator following = cur;
        ++following;

        pair<DataType, DataType> bounds = get_range(*cur, dimIndex);
        if (following != stop) {
            pair<DataType, DataType> following_bounds = get_range(*following, dimIndex);
            // The midpoint is narrowed to float before being stored as DataType.
            float midpoint = (bounds.second + following_bounds.first) * 0.5f;
            split_points.push_back(midpoint);
        }
        os << "[" << bounds.first << ", " << bounds.second << "] ";

        cur = following;
    }
    os << endl;

    os << "Split points: ";
    // Every value, including the last one, is followed by ", ".
    for (vector<DataType>::const_iterator sp = split_points.begin();
         sp != split_points.end(); ++sp) {
        os << *sp << ", ";
    }
    os << endl;
}
// Debugging -- print all intervals void print_all_intervals() { IntervalList::iterator lit = g_intervals.begin(), lend = g_intervals.end(); cout << "[intervals]" << endl; for ( ; lit != lend; ++lit) { IntervalSet& indices = *lit; print_interval_set(indices); } }
// $\chi^2$ just for one dimension void chi_chi_dim_analysis(int dimIndex) { // Sort and initialize one interval per unique attribute value sort(g_data.begin(), g_data.end(), tuple_less_than<Tuple>(dimIndex)); TupleVec::iterator tit = g_data.begin(), tend = g_data.end(); cout << "[sort]" << endl; int index = 0; for ( ; tit != tend; ++tit, ++index) { cout << index << ":"; copy(tit->first.begin(), tit->first.end(), ostream_iterator<float>(cout, ", ")); cout << tit->second << endl; } initialize_intervals(dimIndex); // Count instances of all classes count_classes(); while ((int)g_intervals.size() > g_max_intervals) { // Find adjacent intervals with smallest $\chi^2$ IntervalList::iterator min_lit = find_min_chi_chi(); assert(min_lit != g_intervals.end()); IntervalList::iterator min_lit_next = min_lit; ++min_lit_next; cout << "[before merge] "; print_all_intervals(); // Merge IntervalSet& interval_1 = *min_lit; IntervalSet& interval_2 = *min_lit_next; interval_1.insert(interval_2.begin(), interval_2.end()); g_intervals.erase(min_lit_next); cout << "[after merge] "; print_all_intervals(); } // Debugging print_interval_summary(cout, dimIndex); // Logged output print_interval_summary(olog, dimIndex); }