// PLSACluster.cpp
// Command-line driver: runs iPLSA on the given input and writes
// doc2topic_distribution.txt and topic2word_distribution.txt.
#include <iostream>
#include <stdlib.h>
#include <string>
#include <fstream>
#include <sstream>
#include "iPLSA.hpp"
#include <omp.h>
#include <map>
#include <vector>
#include <stdio.h>
#include <algorithm>
// Ordering predicate for (index, value) pairs.
// Yields 1 when x carries a strictly larger value than y and 0 otherwise, so
// passing it to a sort arranges the pairs by descending value.
int cmp(const std::pair<int, double>& x, const std::pair<int, double>& y)
{
    if (y.second < x.second)
    {
        return 1;
    }
    return 0;
}
// Appends every (key, value) entry of tMap to the end of tVector, then sorts
// the whole of tVector with cmp (largest value first).
// Anything already stored in tVector is kept and takes part in the sort.
void sortMapByValue(std::map<int, double>& tMap, std::vector<std::pair<int, double> >& tVector)
{
    // Range insert copies the entries in the map's iteration order.
    tVector.insert(tVector.end(), tMap.begin(), tMap.end());
    std::sort(tVector.begin(), tVector.end(), cmp);
}
// Reads "<index> <mid>" records, one per line, from `in` and stores each mid
// at index2mid[index].
//
// in        - stream to read; consumed until getline fails.
// index2mid - destination table. It must already have its final size: this
//             function never resizes it.
//
// Lines that do not begin with two integers (blank lines included) are
// skipped. Lines whose index falls outside [0, index2mid.size()) are reported
// on stderr and skipped instead of being written out of bounds.
// Always returns 0.
int loadmidinfo(std::ifstream &in, std::vector<int> &index2mid)
{
    std::string line;
    while (std::getline(in, line))
    {
        std::istringstream fields(line);
        int index = 0;
        int mid = 0;
        if (!(fields >> index >> mid))
        {
            // Blank or malformed line: storing anything here would reuse
            // stale values and clobber a valid slot.
            continue;
        }
        if (index < 0 ||
            static_cast<std::vector<int>::size_type>(index) >= index2mid.size())
        {
            std::cerr << "loadmidinfo: index " << index
                      << " is out of range, line skipped" << std::endl;
            continue;
        }
        index2mid[index] = mid;
    }
    return 0;
}
// Reads "<index> <tag>" records, one per line, from `in` and stores each tag
// at index2tag[index]. The tag is the first whitespace-delimited token after
// the index.
//
// in        - stream to read; consumed until getline fails.
// index2tag - destination table. It must already have its final size: this
//             function never resizes it.
//
// Lines without both an integer index and a tag (blank lines included) are
// skipped. Lines whose index falls outside [0, index2tag.size()) are reported
// on stderr and skipped instead of being written out of bounds.
// Always returns 0.
int loadtaginfo(std::ifstream &in, std::vector<std::string> &index2tag)
{
    std::string line;
    while (std::getline(in, line))
    {
        std::istringstream fields(line);
        int index = 0;
        std::string tag;
        if (!(fields >> index >> tag))
        {
            // Blank or malformed line: storing anything here would reuse
            // stale values and clobber a valid slot.
            continue;
        }
        if (index < 0 ||
            static_cast<std::vector<std::string>::size_type>(index) >= index2tag.size())
        {
            std::cerr << "loadtaginfo: index " << index
                      << " is out of range, line skipped" << std::endl;
            continue;
        }
        index2tag[index] = tag;
    }
    return 0;
}
int main(int argc, char * argv[])
{
if(argc!= 11)
{
cout<<"usage: PLSACluster <inputfile> <indexmidfile> <indextagfile> <crossfolds> <numTopics> <numIters> <anneal> <numBlocks> <top-k words> <pos>"<<endl;
cout<<"./PLSACluster data/inputtagsformat.txt data/indexmediaid.txt data/indextag.txt 10 200 200 100 8 50 0"<<endl;
return 1;
}
char* inputfile=argv[1]; // input file
char* indexmidfile=argv[2]; // mid inverted index table file
char* indextagfile=argv[3]; // tag inverted index table file
int crossfold=atoi(argv[4]); // cross validation dataset 10(1:9)
int numLS=atoi(argv[5]); // topic number
int numIters=atoi(argv[6]); // iterate number
int anneal=atoi(argv[7]); // simulated annealing
int numBlocks=atoi(argv[8]); // block number
int topk=atoi(argv[9]); // number of tags in each topics
int pos=atoi(argv[10]);
int cpu_core_nums = omp_get_num_procs();
omp_set_num_threads(cpu_core_nums);
iPLSA * plsa;
plsa=new iPLSA(inputfile,indexmidfile,indextagfile,crossfold, numLS, numIters, 1, 1, 0.552, anneal, 0.92, cpu_core_nums, numBlocks, pos);
plsa->run();
double ** p_d_z = plsa->get_p_d_z();
double ** p_w_z = plsa->get_p_w_z();
int document_num = plsa->numDocs();
int topic_num = plsa->numCats();
int word_num = plsa->numWords();
int midcount = plsa->numDocs();
vector<int> index2mid(midcount);
vector<string> index2tag(word_num);
ifstream in_inter(indexmidfile);
ifstream in_inter2(indextagfile);
loadmidinfo(in_inter,index2mid);
loadtaginfo(in_inter2,index2tag);
FILE *doc2topic_fp = fopen("doc2topic_distribution.txt","w");
if(doc2topic_fp==NULL) return -1;
for( int i = 0; i < document_num; ++i )
{
fprintf(doc2topic_fp, "%d ", index2mid[i]);
for( int j = 1; j < topic_num; ++j )
{
fprintf(doc2topic_fp, "%f ", p_d_z[i][j]);
}
fprintf(doc2topic_fp, "\n");
}
FILE *topic2word_fp = fopen("topic2word_distribution.txt","w");
if(doc2topic_fp==NULL)
return -1;
for( int i = 0; i < topic_num; ++i )
{
map<int,double> wMap;
for( int w = 0; w<word_num; w++ )
{
wMap[w] = p_w_z[w][i];
}
vector< pair<int, double> > wVector;
sortMapByValue(wMap,wVector);
for( int w = 1; w<=topk; w++ )
{
fprintf(topic2word_fp, "%s:%f ",index2tag[wVector[w].first].c_str(), wVector[w].second);
}
fprintf(topic2word_fp, "\n");
}
return 0;
}