/**
 * Evaluate the clustering objective for the current assignment and log it.
 *
 * The objective is the sum of three terms:
 *   - loss:   for every point i of every document, the squared value of
 *             df(data[i], global mean assigned to i, FIX_DIM);
 *   - global: lambdas[0] * get_num_global_means(k);
 *   - local:  lambdas[1] * get_num_local_means(z, tables).
 * The three terms and their total are written to stderr on every call.
 *
 * @param data          points; indexed by the [first, second) ranges stored
 *                      in tables->doc_lookup
 * @param global_means  dense global mean vectors, one per global cluster
 * @param k             per-document list of global cluster indices
 * @param z             per-point local cluster index
 * @param lambdas       lambdas[0] is the global penalty weight, lambdas[1]
 *                      the local penalty weight
 * @param tables        lookup tables; nDocs, nWords and doc_lookup are read
 * @param df            distance function applied to (point, mean, FIX_DIM)
 * @param FIX_DIM       dimension argument forwarded to df
 * @return loss + global penalty + local penalty
 */
double compute_cost (vector<Instance*> data, vector< vector<double> >& global_means, vector<vector<int> > k, vector<int> z, vector<double> lambdas, Lookups* tables, dist_func df, int FIX_DIM) {
    const double lambda_global = lambdas[0];
    const double lambda_local = lambdas[1];
    const int num_global_means = global_means.size();
    const int D = tables->nDocs;

    // Penalty terms: the cluster counts come from the dedicated helpers.
    const double global_penalty = lambda_global * get_num_global_means(k);
    const double local_penalty = lambda_local * get_num_local_means(z, tables);

    const int N = tables->nWords;
    // Read-only view; the table is only indexed here, so no copy is needed.
    const vector< pair<int, int> >& doc_lookup = *(tables->doc_lookup);

    // Materialise every global mean as an Instance once, so df can be
    // called on it for all points without re-converting.
    vector<Instance*> temp_global_means (num_global_means, NULL);
    for (int p = 0; p < num_global_means; p ++)
        temp_global_means[p] = vec2ins (global_means[p]);

    // Resolve (k, z) into one global cluster index per point.
    vector<int> global_asgn (N, 0);
    compute_assignment (global_asgn, k, z, tables);

    double loss = 0;
    for (int d = 0; d < D; d ++) {
        for (int i = doc_lookup[d].first; i < doc_lookup[d].second; i++) {
            const double dist = df(data[i], temp_global_means[global_asgn[i]], FIX_DIM);
            loss += dist * dist;
        }
    }

    // The temporary instances are owned by this function.
    for (int p = 0; p < num_global_means; p ++)
        delete temp_global_means[p];

    const double total = loss + global_penalty + local_penalty;
    cerr << "loss: " << loss 
        << ", global: " << global_penalty 
        << ", local: " << local_penalty 
        << ", total: " << total << endl;
    return total;
}
Beispiel #2
0
/*
 * Pair up the entries of `a` with the entries of `b`.
 *
 * A square cost matrix of side n = a->nr + b->nr is built (cell for row i,
 * column j lives at cost[i + n * j]):
 *
 *  - rows < a->nr, columns < b->nr: 0 when the `a` entry is already matched
 *    to that column, diffsize() of the two diffs when neither side is
 *    matched yet, COST_MAX otherwise;
 *  - rows < a->nr, columns >= b->nr: the `a` entry's own diffsize scaled by
 *    creation_factor percent when it is unmatched, COST_MAX otherwise;
 *  - rows >= a->nr, columns < b->nr: the same, for the `b` entry;
 *  - rows >= a->nr, columns >= b->nr: 0.
 *
 * The matrix is handed to compute_assignment(); every row of `a` that ends
 * up assigned to a real column of `b` has the pairing recorded in both
 * entries' `matching` fields.
 */
static void get_correspondences(struct string_list *a, struct string_list *b,
				int creation_factor)
{
	int n = a->nr + b->nr;
	int *cost, *a2b, *b2a;
	int row, col, fill;

	ALLOC_ARRAY(cost, st_mult(n, n));
	ALLOC_ARRAY(a2b, n);
	ALLOC_ARRAY(b2a, n);

	for (row = 0; row < a->nr; row++) {
		struct patch_util *lhs = a->items[row].util;

		/* Cost of pairing this `a` entry with each `b` entry. */
		for (col = 0; col < b->nr; col++) {
			struct patch_util *rhs = b->items[col].util;

			if (lhs->matching == col)
				fill = 0;
			else if (lhs->matching >= 0 || rhs->matching >= 0)
				fill = COST_MAX;
			else
				fill = diffsize(lhs->diff, rhs->diff);
			cost[row + n * col] = fill;
		}

		/* Cost of leaving this `a` entry without a partner. */
		if (lhs->matching < 0)
			fill = lhs->diffsize * creation_factor / 100;
		else
			fill = COST_MAX;
		for (col = b->nr; col < n; col++)
			cost[row + n * col] = fill;
	}

	/* Cost of leaving each `b` entry without a partner. */
	for (col = 0; col < b->nr; col++) {
		struct patch_util *rhs = b->items[col].util;

		if (rhs->matching < 0)
			fill = rhs->diffsize * creation_factor / 100;
		else
			fill = COST_MAX;
		for (row = a->nr; row < n; row++)
			cost[row + n * col] = fill;
	}

	/* Padding rows paired with padding columns cost nothing. */
	for (row = a->nr; row < n; row++) {
		for (col = b->nr; col < n; col++)
			cost[row + n * col] = 0;
	}

	compute_assignment(n, n, cost, a2b, b2a);

	/* Record only assignments that land on a real `b` entry. */
	for (row = 0; row < a->nr; row++) {
		int partner = a2b[row];
		struct patch_util *lhs, *rhs;

		if (partner < 0 || partner >= b->nr)
			continue;

		lhs = a->items[row].util;
		rhs = b->items[partner].util;
		lhs->matching = partner;
		rhs->matching = row;
	}

	free(cost);
	free(a2b);
	free(b2a);
}
/**
 * Iterative hard clustering with document-local and global clusters.
 *
 * Starting from a single global cluster, each iteration
 *   (4) reassigns every point to the cheapest global mean, where means not
 *       yet used by the point's document cost an extra lambdas[1], and opens
 *       a new global + local cluster seeded at the point when the cheapest
 *       option costs more than lambdas[0] + lambdas[1];
 *   (5) re-links every local cluster to the global mean with the smallest
 *       summed squared distance, or promotes the local mean to a new global
 *       mean when that is cheaper by more than lambdas[0];
 *   (6) recomputes the global means from the resulting assignment;
 *   (7) stops when compute_cost() returns exactly the previous cost.
 *
 * Side effects: compute_cost() logs to stderr; the file-level `objmin` is
 * lowered to the best cost seen and a "<elapsed> <objmin>" line is appended
 * to `objmin_trace` every iteration.
 *
 * @param data     points, indexed by the ranges in tables->doc_lookup
 * @param means    output; overwritten with the final global means on
 *                 convergence, left untouched on failure
 * @param tables   lookup tables (nWords, nDocs, doc_lookup are read here)
 * @param lambdas  lambdas[0] = global penalty, lambdas[1] = local penalty
 * @param df       distance function applied to (point, mean, FIX_DIM)
 * @param FIX_DIM  dimension of the dense mean vectors
 * @return the converged cost, or INF if the cost increased in an iteration
 */
double HDP_MEDOIDS (vector<Instance*>& data, vector< vector<double> >& means, Lookups* tables, vector<double> lambdas, dist_func df, int FIX_DIM) {
    // STEP ZERO: validate input and initialization
    int N = tables->nWords;
    int D = tables->nDocs;
    // Read-only view; the table is only indexed here, so no copy is needed.
    const vector< pair<int, int> >& doc_lookup = *(tables->doc_lookup);
    double lambda_global = lambdas[0];
    double lambda_local = lambdas[1];

    vector< vector<double> > global_means (1, vector<double>(FIX_DIM, 0.0));
    vector< vector<int> > k (D, vector<int>(1,0));  // global association: k[j][c] is the global mean of local cluster c in document j
    vector<int> z (N, 0); // local assignment: z[i] is the local cluster of point i
    vector<int> global_asgn (N, 0); // global assignment

    // STEP ONE: a. set initial *global* medoid as global mean
    compute_assignment (global_asgn, k, z, tables);
    compute_means (data, global_asgn, FIX_DIM, global_means);

    double last_cost = compute_cost (data, global_means, k, z, lambdas, tables, df, FIX_DIM);
    double new_cost = last_cost;
    while (true) {
        // 4. for each point x_ij,
        // Instance form of every global mean, built once per sweep and kept
        // in sync with global_means as new means are appended below.
        vector<Instance*> global_instances;
        global_instances.reserve(global_means.size());
        for (size_t p = 0; p < global_means.size(); p ++)
            global_instances.push_back(vec2ins(global_means[p]));

        for (int j = 0; j < D; j ++) {
            for (int i = doc_lookup[j].first; i < doc_lookup[j].second; i++) {
                int num_global_means = global_means.size();
                vector<double> d_ij (num_global_means, 0.0);
                for (int p = 0; p < num_global_means; p ++) {
                    double euc_dist = df(data[i], global_instances[p], FIX_DIM);
                    d_ij[p] = euc_dist * euc_dist;
                }
                // Global means not yet used by document j cost an extra
                // lambda_local, since choosing one opens a new local cluster.
                set<int> temp;
                for (int p = 0; p < num_global_means; p ++) temp.insert(p);
                int num_local_means = k[j].size();
                for (int q = 0; q < num_local_means; q ++) temp.erase(k[j][q]);
                set<int>::iterator it; 
                for (it=temp.begin(); it!=temp.end();++it) d_ij[*it] += lambda_local;
                int min_p = -1; double min_dij = INF;
                for (int p = 0; p < num_global_means; p ++) 
                    if (d_ij[p] < min_dij) {
                        min_p = p;
                        min_dij = d_ij[p];
                    }
                if (min_dij > lambda_global + lambda_local) {
                    // Cheaper to open a new global and local cluster seeded
                    // at this point than to join any existing mean.
                    z[i] = num_local_means; 
                    k[j].push_back(num_global_means);
                    vector<double> new_g(FIX_DIM, 0.0);
                    // Feature indices are shifted down by one when written
                    // into the dense vector.
                    for (size_t f = 0; f < data[i]->fea.size(); f++)
                        new_g[data[i]->fea[f].first-1] = data[i]->fea[f].second;
                    global_means.push_back(new_g);
                    global_instances.push_back(vec2ins(new_g));
                } else {
                    // Reuse the document's local cluster already linked to
                    // min_p, or open a new local cluster linked to it.
                    bool c_exist = false;
                    for (int c = 0; c < num_local_means; c ++) 
                        if (k[j][c] == min_p) {
                            z[i] = c;
                            c_exist = true;
                            break;
                        }
                    if (!c_exist) {
                        z[i] = num_local_means;
                        k[j].push_back(min_p);
                    }
                }
            }
        }
        for (size_t p = 0; p < global_instances.size(); p ++)
            delete global_instances[p];

        // Mid-iteration cost: the value is overwritten in step 7; the call
        // is kept because compute_cost logs the breakdown to stderr.
        new_cost = compute_cost (data, global_means, k, z, lambdas, tables, df, FIX_DIM);
        // 5. for all local clusters,
        for (int j = 0; j < D; j ++) {
            int begin_i = doc_lookup[j].first;
            int end_i = doc_lookup[j].second;
            int num_local_means = k[j].size();

            // compute means of local clusters
            vector< vector<double> > local_means (num_local_means, vector<double>(FIX_DIM, 0.0));
            vector<int> local_asgn (z.begin()+begin_i, z.begin()+end_i);
            vector<Instance*> local_data (data.begin()+begin_i,data.begin()+end_i);
            compute_means (local_data, local_asgn, FIX_DIM, local_means);
            assert ((size_t) num_local_means == local_means.size());

            // pre-compute instances for global means 
            int num_global_means = global_means.size();
            vector<Instance*> temp_global_means (num_global_means, NULL);
            for (int p = 0; p < num_global_means; p ++) 
                temp_global_means[p] = vec2ins (global_means[p]);

            // pre-compute instances for local means 
            vector<Instance*> temp_local_means (num_local_means, NULL);
            for (int c = 0; c < num_local_means; c ++) 
                temp_local_means[c] = vec2ins (local_means[c]);

            for (int c = 0; c < num_local_means; c++) {
                // compute distance of local clusters to each global cluster
                // (global_means may have grown while handling an earlier c)
                num_global_means = global_means.size();
                vector<double> d_jcp (num_global_means, 0.0);
                double sum_d_ijc = 0.0; 
                for (int i = begin_i; i < end_i; i ++) {
                    if (z[i] != c) continue;
                    double local_dist = df (data[i], temp_local_means[c], FIX_DIM);
                    sum_d_ijc += local_dist * local_dist;
                    for (int p = 0; p < num_global_means; p ++) {
                        double dist = df (data[i], temp_global_means[p], FIX_DIM);
                        d_jcp[p] += dist * dist;
                    }
                }
                int min_p = -1; double min_d_jcp = INF;
                for (int p = 0; p < num_global_means; p ++) 
                    if (d_jcp[p] < min_d_jcp) {
                        min_p = p;
                        min_d_jcp = d_jcp[p];
                    }
                assert (min_p >= 0);
                if (min_d_jcp > lambda_global + sum_d_ijc) {
                    // Promote the local mean to a new global mean; the
                    // instance list is extended so both stay index-aligned.
                    global_means.push_back(local_means[c]); //  push mu_jc
                    temp_global_means.push_back(vec2ins (local_means[c]));
                    k[j][c] = num_global_means;
                } else {
                    k[j][c] = min_p;
                }
            }
            for (int c = 0; c < num_local_means; c ++) 
                delete temp_local_means[c];
            num_global_means = global_means.size();
            for (int p = 0; p < num_global_means; p ++) 
                delete temp_global_means[p];
        }
        // 6. for each global clusters,
        compute_assignment (global_asgn, k, z, tables);
        compute_means (data, global_asgn, FIX_DIM, global_means);

        // 7. convergence?
        new_cost = compute_cost (data, global_means, k, z, lambdas, tables, df, FIX_DIM);
        if ( new_cost < objmin ) objmin = new_cost;
        objmin_trace << omp_get_wtime()-start_time << " " << objmin << endl;
        if (new_cost == last_cost)
            break;
        if (new_cost < last_cost) {
            last_cost = new_cost;
        } else {
            // The cost went up: report it and bail out without touching
            // the caller's `means`.
            cerr << "failure" << endl;
            return INF;
        }
    }
    means = global_means;
    return last_cost;
}// entry main function