// Reads a Bipartiteite graph from stdin, then returns a copy of it. In the future we may want to implement it without any copying. Bipartite *readBipartiteGraph() { unsigned numberOfXVertices, numberOfYVertices, edges; // Läs antal hörn X, Y och kanter scanf("%d %d %d", &numberOfXVertices, &numberOfYVertices, &edges); Bipartite *graph = new Bipartite(numberOfXVertices,numberOfYVertices); // Läs in kanterna for (unsigned i = 0; i < edges; ++i) { unsigned from, to; scanf("%d %d", &from, &to); graph->addEdge(from-1,to-1); } return graph; }
/* * Construct the sorted features (words) for each dataset. * */ int main(int argc, char** argv) { if(argc != 2){ cout<<"please specify scale.."<<endl; exit(0); } int scale = atoi(argv[1]); // read the data source file string dataset = "restaurant"; vector<string> source1 = FileIO::readFileLines("data/"+dataset+"/source_1.txt"); vector<string> source2 = FileIO::readFileLines("data/"+dataset+"/source_2.txt"); int N1 = source1.size(); int N2 = source2.size(); int N = N1+N2; int pair_num = N*N; srand (time(NULL)); cout<<"finish loading source fle..."<<endl; // scan the source file and hash the word from string to int map<string, int> word_id; map<int, string> id_word; for(size_t i=0;i < source1.size();i++){ vector<string> segs = split(source1[i], ' '); for(size_t j=0;j < segs.size();j++){ string word = format(segs[j]); int id = getWordId(word, word_id); word_id[word] = id; id_word[id] = word; } } for(size_t i=0;i < source2.size();i++){ vector<string> segs = split(source2[i], ' '); for(size_t j=0;j < segs.size();j++){ string word = format(segs[j]); int id = getWordId(word, word_id); word_id[word] = id; id_word[id] = word; } } cout<<"finish constructing word id..."<<endl; // construct the inverted lists. Each list is sorted by entity id. vector<set<int> > combine_inv_lists(word_id.size()); vector<set<int> > inv_lists1(word_id.size()); for(size_t i=0;i < source1.size();i++){ vector<string> segs = split(source1[i], ' '); for(size_t j=0;j < segs.size();j++){ string word = format(segs[j]); int id = word_id[word]; inv_lists1[id].insert(i); combine_inv_lists[id].insert(i); } } vector<set<int> > inv_lists2(word_id.size()); for(size_t i=0;i < source2.size();i++){ vector<string> segs = split(source2[i], ' '); for(size_t j=0;j < segs.size();j++){ string word = format(segs[j]); int id = word_id[word]; inv_lists2[id].insert(i); combine_inv_lists[id].insert(i); } } cout<<"finish constructing inverted lists..."<<endl; // refine the inverted lists by removing stop-words. for(size_t wid=0;wid < word_id.size();wid++){ int len = combine_inv_lists[wid].size(); if(len == 1 || len > 0.1*scale*N){ inv_lists1[wid].clear(); inv_lists2[wid].clear(); combine_inv_lists[wid].clear(); } } // construct the bipartite graph between entity-pairs and terms int word_num = word_id.size(); Bipartite *bigraph = new Bipartite(pair_num, word_num); cout<<"finish init bigraph"<<endl; for(size_t wid=0;wid < word_id.size();wid++){ for(set<int>::iterator id1=inv_lists1[wid].begin();id1!=inv_lists1[wid].end();id1++){ for(set<int>::iterator id2=inv_lists2[wid].begin();id2!=inv_lists2[wid].end();id2++){ int pid = (*id1)*N+N1+(*id2); bigraph->addEdge(pid, wid); //cout<<(*id1)<<"\t"<<(*id2)<<"\t"<<pid<<"\t"<<wid<<"\t"<<id_word[wid]<<endl; } } } bigraph->init(); bigraph->iterate(); bigraph->output(id_word, combine_inv_lists); //exit(0); cout<<"edge num: "<<bigraph->activePairNum()<<endl; RandomWalk* walker; for(int iter=0;;iter++){ walker = new RandomWalk(bigraph->p_score, N1+N2, 20, 60, 10); for(int i=0;i < N1;i++){ int id1=i; for(int j=0;j < N2;j++){ int id2=j+N1; if(bigraph->p_score[id1*N+id2]>0){ bigraph->p_score[id2*N+id1]=bigraph->p_score[id1*N+id2]; walker->addEdge(id1,id2); } } } cout<<"edge num: "<<bigraph->activePairNum()<<endl; walker->iterate(); if(iter==5){ break; } bigraph->updatePScore(walker->p_conf); bigraph->output(id_word, combine_inv_lists); } /* RandomWalk* walker = new RandomWalk(bigraph->p_score, N); for(int iter=1;iter <= 200;iter++){ cout<<"iteration "<<iter++<<endl; if(walker->iterate()==0){ break; } } */ set<string> matches = FileIO::readMatch("data/"+dataset+"/match.txt"); int Num=1000; double max_weight=-1; vector<vector<int> > buckets(Num); for(int i=0;i < N1;i++){ int id1=i; for(int j=0;j < N2;j++){ int id2=j+N1; double weight = walker->p_conf[id1*N+id2]*bigraph->p_score[id1*N+id2]; if(weight > max_weight){ max_weight=weight; } } } double seg = max_weight/Num+0.001; for(int i=0;i < N1;i++){ int id1=i; for(int j=0;j < N2;j++){ int id2=j+N1; double weight = walker->p_conf[id1*N+id2]*bigraph->p_score[id1*N+id2]; int idx=(int)(weight/seg); buckets[idx].push_back(id1*N+id2); } } vector<double> conf_vec; int count=0,total_pair=0; for(int i=Num-1;i>0;i--){ for(size_t j=0;j < buckets[i].size();j++){ int key = buckets[i][j]; int id1 = key/N; int id2 = key%N; if(id1 < id2){ stringstream ss; ss<<id1<<"_"<<(id2-N1); cout<<id1<<"\t"<<id2<<"("<<id2-N1<<")\t"<<walker->p_conf[id1*N+id2]<<"\t"<<bigraph->p_score[id1*N+id2]<<"\t"; if(matches.find(ss.str()) != matches.end()){ cout<<"true"; }else{ cout<<"false"; } conf_vec.push_back(walker->p_conf[id1*N+id2]); size_t C=10; if(conf_vec.size() > C){ double avg_conf=0.0; for(size_t z=conf_vec.size()-1;z >= conf_vec.size()-C;z--){ avg_conf += conf_vec[z]; } if(walker->p_conf[id1*N+id2]>0.9999 && avg_conf/C >= 0.98){ total_pair++; if(matches.find(ss.str()) != matches.end()){ count++; } } cout<<"\t"<<avg_conf/C; }else{ total_pair++; if(matches.find(ss.str()) != matches.end()){ count++; } } cout<<endl; } } } /* int count=0, total_pair=0; set<string> results; for(int i=0;i < N1;i++){ int id1=i; for(int j=0;j < N2;j++){ int id2=j+N1; if(walker->p_conf[id1*N+id2]>0.95){ stringstream ss; ss<<id1<<"_"<<j; results.insert(ss.str()); total_pair++; if(matches.find(ss.str()) != matches.end()){ count++; }else{ cout<<"not_match: "<<id1<<"\t"<<j<<"("<<(j+N1)<<")\t"<<bigraph->p_score[id1*N+id2]<<endl; } } } } for(set<string>::iterator iter=matches.begin();iter != matches.end();iter++){ if(results.find(*iter) == results.end()){ cout<<"miss\t"<<*iter<<endl; } } */ double precison = 1.0*count/total_pair; double recall = 1.0*count/matches.size(); double f1 = 2*precison*recall/(precison+recall); cout<<"precision: "<<count<<"\t"<<total_pair<<"\t"<<precison<<"\t"<<recall<<"\t"<<f1<<endl; return 1; }