Ejemplo n.º 1
0
bool SplitEvaluatorMLClass<Sample, TAppContext>::CalculateEntropyAndThreshold(DataSet<Sample, LabelMLClass>& dataset, std::vector<std::pair<double, int> > responses, std::pair<double, double>& score_and_threshold, int use_gini)
{
    // Searches a set of randomly drawn thresholds for the one that minimises the
    // weighted class impurity of the resulting left/right partition.
    //
    // dataset             - samples; for each one m_label.class_label (used as an
    //                       index into the per-class counters) and
    //                       m_label.class_weight are read.
    // responses           - (response value, sample index) pairs. Expected to be
    //                       sorted by ascending response: the first and last
    //                       entries are taken as the min/max of the threshold range.
    // score_and_threshold - out: .first is the best (lowest) impurity found,
    //                       .second the threshold that produced it. When no valid
    //                       split exists they are set to 1e16 and 0.0.
    // use_gini            - non-zero: impurity is sum_c p_c*(1-p_c);
    //                       zero: impurity is -sum_c p_c*log(p_c).
    //
    // Returns true if at least one tested threshold leaves positive total weight
    // on both sides of the split.

    const double kMinClassWeight = 1e-10; // classes lighter than this are treated as absent (rounding noise)

    double bestDGini = 1e16;
    double bestThreshold = 0.0;
    bool found = false;

    // Nothing to split, or no thresholds requested: report "no split" instead of
    // indexing into empty containers.
    if (responses.empty() || m_appcontext->num_node_thresholds < 1)
    {
        score_and_threshold.first = bestDGini;
        score_and_threshold.second = bestThreshold;
        return false;
    }

    // Draw the candidate thresholds inside [min_response, max_response] and sort them.
    // NOTE(review): this assumes randDouble() returns values in [0, 1] - confirm.
    const double min_response = responses.front().first;
    const double max_response = responses.back().first;
    const double d = max_response - min_response;
    std::vector<double> random_thresholds(m_appcontext->num_node_thresholds, 0.0);
    for (std::size_t i = 0; i < random_thresholds.size(); ++i)
    {
        random_thresholds[i] = (randDouble() * d) + min_response;
    }
    std::sort(random_thresholds.begin(), random_thresholds.end());

    // Start with every sample in the right node.
    std::vector<double> LCount(m_appcontext->num_classes, 0.0);
    std::vector<double> RCount(m_appcontext->num_classes, 0.0);
    double LTotal = 0.0, RTotal = 0.0;
    const std::size_t num_responses = responses.size();
    for (std::size_t r = 0; r < num_responses; ++r)
    {
        const int labelIdx = dataset[responses[r].second]->m_label.class_label;
        const double sample_w = dataset[responses[r].second]->m_label.class_weight;

        RCount[labelIdx] += sample_w;
        RTotal += sample_w;
    }

    // Sweep responses and thresholds together. `r` is the first sample still in
    // the right node, `th_idx` the threshold currently being tested. Each step
    // either moves one sample to the left or evaluates and advances one threshold,
    // so several thresholds falling into the same gap are each tested in turn.
    std::size_t r = 0;
    std::size_t th_idx = 0;
    while (r < num_responses)
    {
        if (responses[r].first <= random_thresholds[th_idx])
        {
            // Sample is at or below the threshold: move it right -> left.
            const int labelIdx = dataset[responses[r].second]->m_label.class_label;
            const double cur_sample_weight = dataset[responses[r].second]->m_label.class_weight;

            // Clamp at zero so accumulated rounding error never yields negative weights.
            RTotal -= cur_sample_weight;
            if (RTotal < 0.0)
                RTotal = 0.0;
            LTotal += cur_sample_weight;

            RCount[labelIdx] -= cur_sample_weight;
            if (RCount[labelIdx] < 0.0)
                RCount[labelIdx] = 0.0;
            LCount[labelIdx] += cur_sample_weight;

            ++r;
            continue;
        }

        // responses[r] is the first sample above the current threshold, so
        // [0, r) | [r, n) is the split this threshold induces. Only splits with
        // weight on both sides can be accepted, so only those are scored.
        if (LTotal > 0.0 && RTotal > 0.0)
        {
            double LGini = 0.0, RGini = 0.0;
            for (std::size_t c = 0; c < LCount.size(); ++c)
            {
                if (LCount[c] >= kMinClassWeight)
                {
                    const double pL = LCount[c] / LTotal;
                    LGini += use_gini ? pL * (1.0 - pL) : -(pL * std::log(pL));
                }
                if (RCount[c] >= kMinClassWeight)
                {
                    const double pR = RCount[c] / RTotal;
                    RGini += use_gini ? pR * (1.0 - pR) : -(pR * std::log(pR));
                }
            }

            // Weight-averaged impurity of the two children (lower is better).
            const double DGini = (LTotal * LGini + RTotal * RGini) / (LTotal + RTotal);
            if (DGini < bestDGini)
            {
                bestDGini = DGini;
                bestThreshold = random_thresholds[th_idx];
                found = true;
            }
        }

        // Advance to the next threshold and re-check the same sample against it.
        if (th_idx + 1 >= random_thresholds.size())
            break; // all thresholds tested
        ++th_idx;
    }

    score_and_threshold.first = bestDGini;
    score_and_threshold.second = bestThreshold;
    return found;
}
Ejemplo n.º 2
0
bool SplitEvaluatorMLRegr<Sample>::CalculateMVNPluginAndThreshold(DataSet<Sample, LabelMLRegr>& dataset, std::vector<std::pair<double, int> > responses, std::pair<double,double>& score_and_threshold)
{
	// Searches a set of randomly drawn thresholds for the one that minimises a
	// covariance-based score of the regression targets in the two child nodes.
	//
	// For each side the score is log(det(C)), clamped to be >= 0, where C is the
	// weighted covariance of the targets on that side: divided by the side's total
	// weight and, when the sum of squared normalised weights is < 1, additionally
	// by (1 - that sum). The two side scores are averaged by total weight.
	//
	// dataset             - samples; m_weight and m_label.regr_target are read.
	// responses           - (response value, sample index) pairs. Expected to be
	//                       sorted by ascending response: the first and last
	//                       entries are taken as the min/max of the threshold range.
	// score_and_threshold - out: .first is the best (lowest) combined score,
	//                       .second the threshold that produced it. When no valid
	//                       split exists they are set to 1e16 and 0.0.
	//
	// Returns true if at least one tested threshold leaves positive total weight
	// on both sides of the split.

	double bestThreshold = 0.0, BestInfoGain = 1e16;
	bool found = false;

	// Nothing to split, or no thresholds requested: report "no split" instead of
	// indexing into empty containers.
	if (responses.empty() || m_appcontext->num_node_thresholds < 1)
	{
		score_and_threshold.first = BestInfoGain;
		score_and_threshold.second = bestThreshold;
		return false;
	}

	// Draw the candidate thresholds inside [min_response, max_response] and sort them.
	// NOTE(review): this assumes randDouble() returns values in [0, 1] - confirm.
	const double min_response = responses.front().first;
	const double max_response = responses.back().first;
	const double d = max_response - min_response;
	std::vector<double> random_thresholds(m_appcontext->num_node_thresholds, 0.0);
	for (std::size_t i = 0; i < random_thresholds.size(); ++i)
		random_thresholds[i] = (randDouble() * d) + min_response;
	std::sort(random_thresholds.begin(), random_thresholds.end());

	// Start with every sample in the right node.
	double LTotal = 0.0, RTotal = 0.0;
	Eigen::VectorXd LSum = Eigen::VectorXd::Zero(m_appcontext->num_target_variables);
	Eigen::VectorXd RSum = Eigen::VectorXd::Zero(m_appcontext->num_target_variables);
	const std::size_t num_responses = responses.size();
	for (std::size_t i = 0; i < num_responses; ++i)
	{
		const double csw = dataset[responses[i].second]->m_weight;
		const Eigen::VectorXd& cst = dataset[responses[i].second]->m_label.regr_target;
		RSum += csw * cst;
		RTotal += csw;
	}

	// Sweep responses and thresholds together. Samples move to the left strictly
	// in response order, so the left node is always responses[0, r) and the right
	// node responses[r, n); no separate index lists are needed. Each step either
	// moves one sample to the left or evaluates and advances one threshold.
	std::size_t r = 0;
	std::size_t th_idx = 0;
	while (r < num_responses)
	{
		if (responses[r].first <= random_thresholds[th_idx])
		{
			// Sample is at or below the threshold: move it right -> left.
			const double csw = dataset[responses[r].second]->m_weight;
			const Eigen::VectorXd& cst = dataset[responses[r].second]->m_label.regr_target;
			RSum -= csw * cst;
			RTotal -= csw;
			if (RTotal < 0.0) // guard against accumulated rounding error
				RTotal = 0.0;
			LSum += csw * cst;
			LTotal += csw;

			++r;
			continue;
		}

		// responses[r] is the first sample above the current threshold. Only
		// splits with weight on both sides are scored.
		if (LTotal > 0.0 && RTotal > 0.0)
		{
			double side_entropy[2] = { 0.0, 0.0 }; // [0] = left, [1] = right
			for (int side = 0; side < 2; ++side)
			{
				const bool is_left = (side == 0);
				const std::size_t first = is_left ? std::size_t(0) : r;
				const std::size_t last = is_left ? r : num_responses;
				const double total = is_left ? LTotal : RTotal;

				// Weighted mean of the targets on this side.
				const Eigen::VectorXd mean = (is_left ? LSum : RSum) / total;

				// Weighted covariance around that mean.
				Eigen::MatrixXd cov = Eigen::MatrixXd::Zero(m_appcontext->num_target_variables, m_appcontext->num_target_variables);
				double sq_norm_total = 0.0; // sum of squared normalised weights
				for (std::size_t s = first; s < last; ++s)
				{
					const double w = dataset[responses[s].second]->m_weight;
					const Eigen::VectorXd diff = dataset[responses[s].second]->m_label.regr_target - mean;
					cov += w * (diff * diff.transpose());
					sq_norm_total += (w / total) * (w / total);
				}
				cov /= total;
				if (sq_norm_total < 1.0)
					cov /= (1.0 - sq_norm_total);

				// log-determinant, with non-positive determinants replaced by a
				// small constant and the result clamped at zero.
				double cov_det = cov.determinant();
				if (cov_det <= 0.0)
					cov_det = 1e-10;
				double entropy = std::log(cov_det);
				if (entropy <= 0.0)
					entropy = 0.0;
				side_entropy[side] = entropy;
			}
			const double LEntropy = side_entropy[0];
			const double REntropy = side_entropy[1];

			// Combine left and right measures, weighted by node weight (lower is better).
			const double InfoGain = (LTotal*LEntropy + RTotal*REntropy) / (LTotal + RTotal);

			if (this->m_appcontext->debug_on)
				std::cout << "Eval: " << InfoGain << ", LTotal=" << LTotal << ", RTotal=" << RTotal << "(" << LEntropy << ", " << REntropy << ")" << std::endl;

			if (InfoGain < BestInfoGain)
			{
				BestInfoGain = InfoGain;
				bestThreshold = random_thresholds[th_idx];
				found = true;
			}
		}

		// Advance to the next threshold and re-check the same sample against it;
		// several thresholds may fall into the gap below this response.
		if (th_idx + 1 >= random_thresholds.size())
			break; // all thresholds tested
		++th_idx;
	}

	score_and_threshold.first = BestInfoGain;
	score_and_threshold.second = bestThreshold;
	return found;
}
Ejemplo n.º 3
0
bool SplitEvaluatorMLClass<Sample, TAppContext>::CalculateSpecificLossAndThreshold(DataSet<Sample, LabelMLClass>& dataset, std::vector<std::pair<double, int> > responses, std::pair<double, double>& score_and_threshold)
{
    // Searches a set of randomly drawn thresholds for the one that minimises the
    // summed loss of the resulting left and right child nodes.
    //
    // For each side the loss is sum_c weight_c * ComputeLoss(p, c, loss_type),
    // where p is the side's class-weight distribution and loss_type is
    // m_appcontext->global_loss_classification.
    //
    // dataset             - samples; for each one m_label.class_label (used as an
    //                       index into the per-class weights) and
    //                       m_label.class_weight are read.
    // responses           - (response value, sample index) pairs. Expected to be
    //                       sorted by ascending response: the first and last
    //                       entries are taken as the min/max of the threshold range.
    // score_and_threshold - out: .first is the best (lowest) combined loss,
    //                       .second the threshold that produced it. When no valid
    //                       split exists they are set to 1e16 and 0.0.
    //
    // Returns true if at least one tested threshold leaves positive total weight
    // on both sides of the split.

    double BestLoss = 1e16, BestThreshold = 0.0;
    bool found = false;

    // Nothing to split, or no thresholds requested: report "no split" instead of
    // indexing into empty containers.
    if (responses.empty() || m_appcontext->num_node_thresholds < 1)
    {
        score_and_threshold.first = BestLoss;
        score_and_threshold.second = BestThreshold;
        return false;
    }

    // Draw the candidate thresholds inside [min_response, max_response] and sort them.
    // NOTE(review): this assumes randDouble() returns values in [0, 1] - confirm.
    const double min_response = responses.front().first;
    const double max_response = responses.back().first;
    const double d = max_response - min_response;
    std::vector<double> random_thresholds(m_appcontext->num_node_thresholds, 0.0);
    for (std::size_t i = 0; i < random_thresholds.size(); ++i)
        random_thresholds[i] = (randDouble() * d) + min_response;
    std::sort(random_thresholds.begin(), random_thresholds.end());

    // Start with every sample in the right node.
    std::vector<double> RClassWeights(m_appcontext->num_classes, 0.0);
    std::vector<double> LClassWeights(m_appcontext->num_classes, 0.0);
    double RTotalWeight = 0.0;
    double LTotalWeight = 0.0;
    const std::size_t num_responses = responses.size();
    for (std::size_t i = 0; i < num_responses; ++i)
    {
        const int labelIdx = dataset[responses[i].second]->m_label.class_label;
        const double sample_w = dataset[responses[i].second]->m_label.class_weight;

        RClassWeights[labelIdx] += sample_w;
        RTotalWeight += sample_w;
    }

    // Class distributions of the two sides; refilled completely before every evaluation.
    std::vector<double> pR(RClassWeights.size());
    std::vector<double> pL(LClassWeights.size());

    // Sweep responses and thresholds together. `r` is the first sample still in
    // the right node, `th_idx` the threshold currently being tested. Each step
    // either moves one sample to the left or evaluates and advances one threshold.
    std::size_t r = 0;
    std::size_t th_idx = 0;
    while (r < num_responses)
    {
        if (responses[r].first <= random_thresholds[th_idx])
        {
            // Sample is at or below the threshold: move it right -> left.
            const int labelIdx = dataset[responses[r].second]->m_label.class_label;
            const double cur_sample_weight = dataset[responses[r].second]->m_label.class_weight;

            // Clamp at zero so accumulated rounding error never yields negative weights.
            RClassWeights[labelIdx] -= cur_sample_weight;
            if (RClassWeights[labelIdx] < 0.0)
                RClassWeights[labelIdx] = 0.0;
            LClassWeights[labelIdx] += cur_sample_weight;

            RTotalWeight -= cur_sample_weight;
            if (RTotalWeight < 0.0)
                RTotalWeight = 0.0;
            LTotalWeight += cur_sample_weight;

            ++r;
            continue;
        }

        // responses[r] is the first sample above the current threshold. A split
        // is only acceptable with weight on both sides, so the loss is evaluated
        // only then; this also avoids dividing by a zero total weight.
        if (LTotalWeight > 0.0 && RTotalWeight > 0.0)
        {
            // RIGHT
            double RLoss = 0.0;
            for (std::size_t ci = 0; ci < RClassWeights.size(); ++ci)
                pR[ci] = RClassWeights[ci] / RTotalWeight;
            for (std::size_t ci = 0; ci < RClassWeights.size(); ++ci)
                RLoss += RClassWeights[ci] * ComputeLoss(pR, ci, m_appcontext->global_loss_classification);

            // LEFT
            double LLoss = 0.0;
            for (std::size_t ci = 0; ci < LClassWeights.size(); ++ci)
                pL[ci] = LClassWeights[ci] / LTotalWeight;
            for (std::size_t ci = 0; ci < LClassWeights.size(); ++ci)
                LLoss += LClassWeights[ci] * ComputeLoss(pL, ci, m_appcontext->global_loss_classification);

            // Total loss (lower is better)
            const double CombinedLoss = LLoss + RLoss;
            if (CombinedLoss < BestLoss)
            {
                BestLoss = CombinedLoss;
                BestThreshold = random_thresholds[th_idx];
                found = true;
            }
        }

        // Advance to the next threshold and re-check the same sample against it;
        // several thresholds may fall into the gap below this response.
        if (th_idx + 1 >= random_thresholds.size())
            break; // all thresholds tested
        ++th_idx;
    }

    score_and_threshold.first = BestLoss;
    score_and_threshold.second = BestThreshold;
    return found;
}