// Append a jet-jet distance entry pairing the newly formed jet with every
// other jet in the cluster sequence that has not been merged yet.
// Results are accumulated into the member list _distances.
void Qjets::ComputeNewDistanceMeasures(fastjet::ClusterSequence & cs, int new_jet){
  const unsigned int n_jets = cs.jets().size();
  for(unsigned int i = 0; i < n_jets; i++){
    // Skip already-merged jets and the new jet itself.
    if(!JetUnmerged(i) || i == (unsigned int) new_jet)
      continue;
    jet_distance jd;
    jd.j1 = new_jet;
    jd.j2 = i;
    jd.dij = d_ij(cs.jets()[jd.j1], cs.jets()[jd.j2]);
    _distances.push_back(jd);
  }
}
void Qjets::ComputeAllDistances(const vector<fastjet::PseudoJet>& inp){ for(unsigned int i = 0 ; i < inp.size()-1; i++){ // jet-jet distances for(unsigned int j = i+1 ; j < inp.size(); j++){ jet_distance jd; jd.j1 = i; jd.j2 = j; if(jd.j1 != jd.j2){ jd.dij = d_ij(inp[i],inp[j]); _distances.push_back(jd); } } } }
// Run `noIterations` rounds of EM-style training for the HMM word-alignment
// model. Each iteration: (1) zeroes the fractional-count accumulators,
// (2) makes a forward ("alpha") / backward ("beta") pass over every sentence
// pair and accumulates expected counts, (3) re-normalizes the translation
// probabilities cal_ef[].prob and the jump probabilities p_jj_jl.
// NOTE(review): the alpha/beta recursions take a MAX over predecessor states
// rather than a sum, i.e. a Viterbi-style approximation of the forward-backward
// algorithm — presumably intentional; confirm against the model derivation.
// Indices follow l=0..(esent.size()-1), m likewise for fsent; G1/G2 are the
// strides used to pack (jj, j, l) triples into flat map keys.
void hmm::hmm_algo(int noIterations){
  WordIndex i, j, jj, l, m;
  SentPair sent;
  // Wall-clock accounting per phase (alpha, beta, yita, kesi, d, em-count).
  time_t a_time1, a_time2, b_time1, b_time2, y_time1, y_time2, k_time1, k_time2, d_time1, d_time2, e_time1, e_time2;
  double a_time, b_time, y_time, k_time, d_time, e_time;
  for(int it=1;it <= noIterations; it++){
    cout<<"hidden markov iteration ("<<it<<")"<<endl;
    cout<<"---------------------------------------"<<endl;
    //Initialization
    // Reset all count accumulators that this corpus can touch to zero.
    cout<<"...........Initializing..........."<<endl;
    sHander.new_start();
    while(sHander.getNextSentence(sent)){
      vector<WordIndex>& es = sent.esent;
      vector<WordIndex>& fs = sent.fsent;
      l = es.size() - 1;
      m = fs.size() - 1;
      for(j=0;j <= l;j++){
        count_e[es[j]] = 0;
        count_jl[j*G1+l] = 0;
        for(i=1;i <= m;i++)
          cal_ef[WordPairIds(es[j], fs[i])].count = 0;
        for(jj=0;jj <= l;jj++)
          count_jj_jl[jj*G2+j*G1+l] = 0;
      }
    }
    a_time = 0;b_time = 0;y_time = 0;k_time = 0;d_time = 0;e_time = 0;
    //backward-forward learning
    // E-step: per sentence, compute alpha/beta, then the state posteriors
    // (yita) and transition posteriors (kesi), and accumulate counts.
    cout<<"...........backward-forward learning..........."<<endl;
    sHander.new_start();
    while(sHander.getNextSentence(sent)){
      vector<WordIndex>& es = sent.esent;
      vector<WordIndex>& fs = sent.fsent;
      l = es.size() - 1;
      m = fs.size() - 1;
      double uniform = 1.0/(double)(l+1);  // uniform initial-state probability
      double temp_i, temp_max;
      time(&a_time1);
      //learning alpha parameters
      // alpha(i, j): best-path score of emitting f_1..f_i with f_i aligned to e_j.
      // Base case uses a uniform start distribution times the emission prob.
      array2<double> alpha(m+1, l+1);
      for(j=0;j <= l;j++)
        alpha(1,j) = uniform * cal_ef[WordPairIds(es[j], fs[1])].prob;
      for(i=1;i <= m-1;i++){
        for(jj=0;jj <= l;jj++){
          // Max (not sum) over predecessor alignment positions j.
          temp_max = alpha(i, 0) * p_jj_jl[jj*G2+0*G1+l];
          for(j=1;j <= l;j++){
            temp_i = alpha(i, j) * p_jj_jl[jj*G2+j*G1+l];
            if(temp_i > temp_max)
              temp_max = temp_i;
          }
          alpha(i+1, jj) = temp_max * cal_ef[WordPairIds(es[jj], fs[i+1])].prob;
        }
      }
      time(&a_time2);
      a_time += difftime(a_time2, a_time1);
      time(&b_time1);
      //learning beta parameters
      // beta(i, j): best-path score of the suffix f_{i+1}..f_m given f_i aligned
      // to e_j; initialized to 1 at the last position, filled backwards.
      array2<double> beta(m+1, l+1);
      for(j=0;j <= l;j++)
        beta(m, j) = 1.0;
      for(i=m;i >= 2;i--){
        for(j=0;j <= l;j++){
          temp_max = beta(i, 0) * cal_ef[WordPairIds(es[0], fs[i])].prob * p_jj_jl[0*G2+j*G1+l];
          for(jj=1;jj <= l;jj++){
            temp_i = beta(i, jj) * cal_ef[WordPairIds(es[jj], fs[i])].prob * p_jj_jl[jj*G2+j*G1+l];
            if(temp_i > temp_max)
              temp_max = temp_i;
          }
          beta(i-1, j) = temp_max;
        }
      }
      time(&b_time2);
      b_time += difftime(b_time2, b_time1);
      time(&y_time1);
      //learning yita parameters
      // yita(i, j): posterior of f_i aligned to e_j, normalized over j.
      array2<double> yita(m+1, l+1);
      for(i=1;i <= m;i++){
        double sum_yita = 0.0;
        for(j=0;j <= l;j++)
          sum_yita += alpha(i, j) * beta(i, j);
        for(j=0;j <= l;j++)
          yita(i, j) = alpha(i, j) * beta(i, j) / sum_yita;
      }
      time(&y_time2);
      y_time += difftime(y_time2, y_time1);
      time(&k_time1);
      //learning kesi parameters
      // kesi[(i, j, jj)]: posterior of transitioning from e_j at f_i to e_jj at
      // f_{i+1}, keyed by the packed index i*G2+j*G1+jj.
      // NOTE(review): the key type is WordIndex — verify i*G2+j*G1+jj cannot
      // overflow it for the longest sentences in the corpus.
      map<WordIndex, double> kesi;
      for(i=1;i <= m-1;i++){
        double sum_kesi = 0.0;
        for(j=0;j <= l;j++)
          for(jj=0;jj <= l;jj++)
            sum_kesi += alpha(i, j) * p_jj_jl[jj*G2+j*G1+l] * cal_ef[WordPairIds(es[jj], fs[i+1])].prob * beta(i+1, jj);
        for(j=0;j <= l;j++)
          for(jj=0;jj <= l;jj++)
            kesi[i*G2+j*G1+jj] = alpha(i, j) * p_jj_jl[jj*G2+j*G1+l] * cal_ef[WordPairIds(es[jj], fs[i+1])].prob * beta(i+1, jj) / sum_kesi;
      }
      time(&k_time2);
      k_time += difftime(k_time2, k_time1);
      time(&d_time1);
      //calculate d_jj_jl, d_ij;
      // d_jj_jl(jj, j): expected jump j->jj ratio; d_ij(i, j): expected
      // emission responsibility of e_j for f_i, normalized over i.
      array2<double> d_jj_jl(l+1, l+1);
      for(j=0;j <= l;j++){
        for(jj=0;jj <= l;jj++){
          double sum_nume = 0, sum_deno = 0;
          for(i=1;i <= m-1;i++){
            sum_nume += kesi[i*G2+j*G1+jj];
            sum_deno += yita(i, j);
          }
          d_jj_jl(jj, j) = sum_nume / sum_deno;
        }
      }
      array2<double> d_ij(m+1, l+1, 0.0);
      for(j=0;j <= l;j++){
        double sum_deno = 0;
        for(i=1;i <= m;i++)
          sum_deno += yita(i, j);
        for(i=1;i <= m;i++)
          d_ij(i, j) = yita(i, j) / sum_deno;
      }
      time(&d_time2);
      d_time += difftime(d_time2, d_time1);
      time(&e_time1);
      //em algorithm calculation count
      // Accumulate this sentence's fractional counts into the global tables.
      for(j=0;j <= l;j++){
        for(jj=0;jj <= l;jj++){
          count_jj_jl[jj*G2+j*G1+l] += d_jj_jl(jj, j);
          count_jl[j*G1+l] += d_jj_jl(jj, j);
        }
      }
      for(i=1;i <= m;i++){
        for(j=0;j <= l;j++){
          cal_ef[WordPairIds(es[j], fs[i])].count += d_ij(i, j);
          count_e[es[j]] += d_ij(i, j);
        }
      }
      time(&e_time2);
      e_time += difftime(e_time2, e_time1);
    }//end of backward-forward learning
    printf("alpha time:%.4fs beta time:%.4fs yita time:%.4fs kesi time:%.4fs d time:%.4fs em time:%.4fs", a_time, b_time, y_time, k_time, d_time, e_time);
    //calculate new probability cal_ef and p_jj_jl
    // M-step: renormalize counts into probabilities. Only entries touched by
    // the corpus are updated (iterates the sentences again rather than the
    // full tables).
    cout<<"...........calculate new probability..........."<<endl;
    sHander.new_start();
    while(sHander.getNextSentence(sent)){
      vector<WordIndex>& es = sent.esent;
      vector<WordIndex>& fs = sent.fsent;
      l = es.size() - 1;
      m = fs.size() - 1;
      for(j=0;j <= l;j++){
        for(i=1;i <= m;i++)
          cal_ef[WordPairIds(es[j], fs[i])].prob = cal_ef[WordPairIds(es[j], fs[i])].count / count_e[es[j]];
        for(jj=0;jj <= l;jj++)
          p_jj_jl[jj*G2+j*G1+l] = count_jj_jl[jj*G2+j*G1+l] / count_jl[j*G1+l];
      }
    }//end of update probability cal_ed and p_jj_jl
  }//end of Iterations
}//end of hmm_algo
double HDP_MEDOIDS (vector<Instance*>& data, vector< vector<double> >& means, Lookups* tables, vector<double> lambdas, dist_func df, int FIX_DIM) { // STEP ZERO: validate input and initialization int N = tables->nWords; int D = tables->nDocs; vector< pair<int, int> > doc_lookup = *(tables->doc_lookup); double lambda_global = lambdas[0]; double lambda_local = lambdas[1]; vector< vector<double> > global_means (1, vector<double>(FIX_DIM, 0.0)); vector< vector<int> > k (D, vector<int>(1,0)); // global association vector<int> z (N, 0); // local assignment vector<int> global_asgn (N, 0); // global assignment // STEP ONE: a. set initial *global* medoid as global mean compute_assignment (global_asgn, k, z, tables); compute_means (data, global_asgn, FIX_DIM, global_means); double last_cost = compute_cost (data, global_means, k, z, lambdas, tables, df, FIX_DIM); double new_cost = last_cost; while (true) { // 4. for each point x_ij, for (int j = 0; j < D; j ++) { for (int i = doc_lookup[j].first; i < doc_lookup[j].second; i++) { int num_global_means = global_means.size(); vector<double> d_ij (num_global_means, 0.0); for (int p = 0; p < num_global_means; p ++) { Instance* temp_ins = vec2ins(global_means[p]); double euc_dist = df(data[i], temp_ins, FIX_DIM); d_ij[p] = euc_dist * euc_dist; delete temp_ins; } set<int> temp; for (int p = 0; p < num_global_means; p ++) temp.insert(p); int num_local_means = k[j].size(); for (int q = 0; q < num_local_means; q ++) temp.erase(k[j][q]); set<int>::iterator it; for (it=temp.begin(); it!=temp.end();++it) d_ij[*it] += lambda_local; int min_p = -1; double min_dij = INF; for (int p = 0; p < num_global_means; p ++) if (d_ij[p] < min_dij) { min_p = p; min_dij = d_ij[p]; } if (min_dij > lambda_global + lambda_local) { z[i] = num_local_means; k[j].push_back(num_global_means); vector<double> new_g(FIX_DIM, 0.0); for (int f = 0; f < data[i]->fea.size(); f++) new_g[data[i]->fea[f].first-1] = data[i]->fea[f].second; global_means.push_back(new_g); // 
cout << "global and local increment" << endl; } else { bool c_exist = false; for (int c = 0; c < num_local_means; c ++) if (k[j][c] == min_p) { z[i] = c; c_exist = true; break; } if (!c_exist) { z[i] = num_local_means; k[j].push_back(min_p); // cout << "local increment" << endl; } } } } /* cout << "half..........." << endl; cout << "#global created: " << global_means.size() << ", #global used: " << get_num_global_means(k); */ new_cost = compute_cost (data, global_means, k, z, lambdas, tables, df, FIX_DIM); // 5. for all local clusters, for (int j = 0; j < D; j ++) { int begin_i = doc_lookup[j].first; int end_i = doc_lookup[j].second; int doc_len = doc_lookup[j].second - doc_lookup[j].first; int num_local_means = k[j].size(); // all local clusters are distinct to each other /* set<int> temp; for (int y = 0; y < num_local_means; y++) temp.insert(k[j][y]); cout << temp.size() << " ==? " << num_local_means << endl; assert (temp.size() == num_local_means); */ // compute means of local clusters vector< vector<double> > local_means (num_local_means, vector<double>(FIX_DIM, 0.0)); vector<int> local_asgn (z.begin()+begin_i, z.begin()+end_i); vector<Instance*> local_data (data.begin()+begin_i,data.begin()+end_i); compute_means (local_data, local_asgn, FIX_DIM, local_means); assert (num_local_means == local_means.size()); // pre-compute instances for global means int num_global_means = global_means.size(); vector<Instance*> temp_global_means (num_global_means, NULL); for (int p = 0; p < num_global_means; p ++) temp_global_means[p] = vec2ins (global_means[p]); // pre-compute instances for local means vector<Instance*> temp_local_means (num_local_means, NULL); for (int c = 0; c < num_local_means; c ++) temp_local_means[c] = vec2ins (local_means[c]); for (int c = 0; c < num_local_means; c++) { // compute distance of local clusters to each global cluster num_global_means = global_means.size(); vector<double> d_jcp (num_global_means, 0.0); double sum_d_ijc = 0.0; for (int i = 
doc_lookup[j].first; i < doc_lookup[j].second; i ++) { if (z[i] != c) continue; double local_dist = df (data[i], temp_local_means[c], FIX_DIM); sum_d_ijc += local_dist * local_dist; for (int p = 0; p < num_global_means; p ++) { double dist = df (data[i], temp_global_means[p], FIX_DIM); d_jcp[p] += dist * dist; } } int min_p = -1; double min_d_jcp = INF; for (int p = 0; p < num_global_means; p ++) if (d_jcp[p] < min_d_jcp) { min_p = p; min_d_jcp = d_jcp[p]; } assert (min_p >= 0); // cout << min_d_jcp << " " << lambda_global << " " << sum_d_ijc << endl; if (min_d_jcp > lambda_global + sum_d_ijc) { global_means.push_back(local_means[c]); // push mu_jc temp_global_means.push_back(vec2ins (local_means[c])); k[j][c] = num_global_means; // cout << "global increment" << endl; } else { k[j][c] = min_p; } } for (int c = 0; c < num_local_means; c ++) delete temp_local_means[c]; num_global_means = global_means.size(); for (int p = 0; p < num_global_means; p ++) delete temp_global_means[p]; } // 6. for each global clusters, compute_assignment (global_asgn, k, z, tables); /* cout << "compute global means.." << endl; cout << "#global created: " << global_means.size() << ", #global used: " << get_num_global_means(k); */ compute_means (data, global_asgn, FIX_DIM, global_means); // 7. convergence? new_cost = compute_cost (data, global_means, k, z, lambdas, tables, df, FIX_DIM); if ( new_cost < objmin ) objmin = new_cost; objmin_trace << omp_get_wtime()-start_time << " " << objmin << endl; if (new_cost == last_cost) break; if (new_cost < last_cost) { last_cost = new_cost; } else { cerr << "failure" << endl; return INF; assert(false); } } means = global_means; return last_cost; }// entry main function