/
CBinaryBayes.cpp
236 lines (202 loc) · 7.05 KB
/
CBinaryBayes.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
#include <heur/CBinaryBayes.h>
#include <map>
#include <data/CDataset.h>
#include <data/CDataset.h>
#include <util/Util.h>
#include <algorithm>
#include <vector>
#include <stdlib.h>
#include <string>
#include <cmath>
//#define DEBUG
#define INTERVAL_TIME 30
/**
 * Creates an untrained classifier.
 *
 * The scalar members that score(), predict() and saveModel() read are given
 * defined values so that using the object before trainOnline()/loadModel()
 * never reads indeterminate memory. With these values an untrained model
 * scores every input as 0.
 *
 * NOTE(review): Cpos and Cneg are intentionally left untouched here because
 * this file never assigns them outside loadModel(); confirm against the class
 * declaration whether they need a default as well.
 */
CBinaryBayes::CBinaryBayes() {
    this->m = 0;
    this->prior = 0.5;          //log(prior) and log(1-prior) cancel out in score()
    this->allZeroSumPos = 0;
    this->allZeroSumNeg = 0;
    this->treshold = 0;         //same value trainOnline() ends with
}
//Used for cross-validation: allows ignoring a certain part of the train file.
/**
 * Trains the model from a sparse text file.
 *
 * Every line is split on spaces: the first token is the class value (must be
 * -1 or 1), every following token has the form "feature:value". The value is
 * not read; a listed feature is treated as being equal to one.
 *
 * @param filename      Path of the training file.
 * @param n             Maximum number of lines read from the file.
 * @param numFeat       Number of features; valid feature indices are 0..numFeat-1.
 * @param ignore_start  Lines with an index i such that ignore_start < i < ignore_end
 * @param ignore_end    are skipped (both bounds are exclusive).
 *
 * Errors are reported through reportError(): unreadable file, class value
 * other than -1/1, feature index outside the valid range.
 */
void CBinaryBayes::trainOnline(std::string filename, int n, int numFeat, int ignore_start, int ignore_end) {
    this->m = numFeat;
    std::ifstream input(filename.c_str());
    std::string line;
    if(!input)
        reportError("File not found ...");
#ifdef DEBUG
    //Timers are only needed for the progress log
    clock_t tbegin = clock();
    clock_t tprev = clock();
    clock_t tend = clock();
#endif
    //Start from a clean model so that training twice does not append to old parameters
    this->P0N.clear();
    this->P1N.clear();
    this->P0P.clear();
    this->P1P.clear();
    this->P0N.reserve(m);
    this->P1N.reserve(m);
    this->P0P.reserve(m);
    this->P1P.reserve(m);
    this->isKnown.assign(m, false);
    //PASS 1: count classes and, per feature, how often it is set within each class
    int classCountPos = 0;
    int classCountNeg = 0;
    std::vector<int> oneCountPos(m, 0); //How many ones for a feature given that the class value is Pos
    std::vector<int> oneCountNeg(m, 0); //How many ones for a feature given that the class value is Neg
    for(int i = 0; i < n; i++) {
        getline( input, line );
        if(!input) break; //Stop at end of file
        if(i > ignore_start && i < ignore_end) continue; //Ignore part of the file
        std::vector<std::string> tokens = lineToTokens(line.c_str(),' ');
        if(tokens.empty()) continue; //Nothing on this line, and tokens[0] would be out of range
        //Update prior probability count
        int classval = atoi(tokens[0].c_str());
        if(classval != -1 && classval != 1)
            reportError("Class values should be either -1 or 1.");
        bool hit = classval > 0;
        if(hit) classCountPos ++;
        else classCountNeg ++;
        for(std::vector<std::string>::size_type j = 1; j < tokens.size(); j++) {
            std::vector<std::string> parts = lineToTokens(tokens[j].c_str(),':');
            if(parts.size() < 2) continue; //extra space element
            int feat = lexical_cast<int,std::string>(parts[0]);
            //parts[1] (the value) is not used, it is assumed to be equal to one
            if(feat >= this->m) { //index m itself is already past the end of the count vectors
                std::stringstream s;
                s << "Found a feature with dimension higher than actual dataset dimension (" << m << " vs " << feat << ").";
                reportError(s.str());
                continue; //never index out of range, whatever reportError does
            }
            if(feat < 0) {
                std::stringstream s;
                s << "Found a feature with a negative index (" << feat << ").";
                reportError(s.str());
                continue;
            }
            if(hit) {
                oneCountPos[feat]++;
            }
            else {
                oneCountNeg[feat]++;
            }
        }
#ifdef DEBUG
        tend = clock();
        if( (tend-tprev)/ CLOCKS_PER_SEC > INTERVAL_TIME ) {
            int done = i;
            int left = 2*n-i;
            double timedone = double(tend-tbegin) / CLOCKS_PER_SEC;
            double timeleft = timedone / done * left;
            LOG << "Time " << done << "/" << 2*n << ":" << timeleft << "s\n";
            tprev = tend;
        }
#endif
    }
    //Intermediate: format all counts to (log)probabilities
    //The prior is taken over the examples that were actually counted; n may be
    //larger than that when a fold is ignored or the file is shorter than n.
    const int trained = classCountPos + classCountNeg;
    this->prior = (trained > 0) ? (classCountPos * 1.0 / trained) : 0.5; //no data: uninformative prior
    this->allZeroSumPos = 0;
    this->allZeroSumNeg = 0;
    for(int i = 0; i < this->m; i++) {
        int zeroCountPos = classCountPos - oneCountPos[i];
        int zeroCountNeg = classCountNeg - oneCountNeg[i];
        //Laplace smoothing: +1 on each of the two outcomes, +2 on the total, so that
        //P(0|class) + P(1|class) == 1 for both classes.
        //Positive class
        double P0Pi = log((1.0*zeroCountPos+1) / (classCountPos+2));
        double P1Pi = log((1.0*oneCountPos[i]+1) / (classCountPos+2));
        this->P0P.push_back(P0Pi);
        this->P1P.push_back(P1Pi);
        //Negative class
        double P0Ni = log((1.0*zeroCountNeg+1) / (classCountNeg+2));
        double P1Ni = log((1.0*oneCountNeg[i]+1) / (classCountNeg+2));
        this->P0N.push_back(P0Ni);
        this->P1N.push_back(P1Ni);
        //A feature only takes part in scoring when it was seen set in both classes
        isKnown[i] = (oneCountPos[i] > 0) && (oneCountNeg[i] > 0);
        //Score of the all-zero vector; score() corrects it for the features that are set
        this->allZeroSumPos += P0Pi;
        this->allZeroSumNeg += P0Ni;
#ifdef DEBUG
        //Timer update
        tend = clock();
        if( (tend-tprev)/ CLOCKS_PER_SEC > INTERVAL_TIME ) {
            int done = n+i;
            int left = 2*n-i;
            float timedone = double(tend-tbegin) / CLOCKS_PER_SEC;
            float timeleft = timedone / done * left;
            LOG << "Time " << done << "/" << n << ":" << timeleft << "s\n";
            tprev = tend;
        }
#endif
    }
    this->treshold = 0;
}
/**
 * Trains on the first `cut` lines of `filename` without skipping any of them.
 *
 * @param filename  Path of the training file.
 * @param cut       Maximum number of lines to read.
 * @param m         Number of features.
 */
void CBinaryBayes::trainOnline(std::string filename,int cut, int m) {
    //The ignore window (cut, cut) is empty: no index is both > cut and < cut
    this->trainOnline(filename, cut, m, cut, cut);
}
//Ranking NB
/**
 * Computes the log-score difference between the positive and negative class.
 *
 * Both sums start from the log prior plus the score of the all-zero vector;
 * for every element of the list whose feature is known, the "feature is zero"
 * term is swapped for the "feature is one" term. Elements whose index is
 * unknown to the model or lies outside the model are skipped.
 *
 * @param list  Sparse example; only the index of each element is read.
 * @return      Positive-class log score minus negative-class log score.
 */
double CBinaryBayes::score(CSparseList * list) {
    //Bind by reference instead of copying the whole element vector on every call
    const std::vector<sparse_element> & elements = *(list->getElements());
    double possum = log(this->prior) + this->allZeroSumPos;
    double negsum = log(1-this->prior) + this->allZeroSumNeg;
    const size_t modelSize = this->isKnown.size();
    for(size_t d = 0; d < elements.size(); d++) {
        const sparse_element & element = elements[d];
        if(element.index < 0) continue; //cannot be a valid feature
        const size_t idx = static_cast<size_t>(element.index);
        if(idx >= modelSize) continue; //feature the model has no parameters for
        if(!this->isKnown[idx]) continue;
        possum += this->P1P[idx];
        possum -= this->P0P[idx];
        negsum += this->P1N[idx];
        negsum -= this->P0N[idx];
    }
    return possum-negsum;
}
/**
 * Classifies an example by comparing its score with the decision threshold.
 *
 * @param list  Example to classify.
 * @return      1 when score(list) minus the threshold is strictly positive, -1 otherwise.
 */
double CBinaryBayes::predict(CSparseList * list) {
    const double margin = this->score(list) - this->treshold;
    return (margin > 0) ? 1 : -1;
}
/**
 * Scores an example and derives the class label from that score in one call.
 *
 * @param list        Example to classify.
 * @param prediction  Receives 1 when the score minus the threshold is strictly positive, -1 otherwise.
 * @param score       Receives the value of score(list).
 */
void CBinaryBayes::scoreAndPredict(CSparseList * list, int & prediction, double & score) {
    //Store the score first, then decide on the stored value
    score = this->score(list);
    if(score - this->treshold > 0) {
        prediction = 1;
    }
    else {
        prediction = -1;
    }
}
void CBinaryBayes::saveModel(std::string fileName) {
std::ofstream fh (fileName.c_str());
//Output general info on first line
fh << this->m <<" "<<this->Cpos <<" "<<this->Cneg <<" "<<this->prior<<" "<<this->allZeroSumPos<<" "<<this->allZeroSumNeg<<"\n";
//Output class probabilities line per line
for(int i = 0; i < m; i++) {
fh << this->isKnown[i]<<" "<<this->P0P[i]<<" "<<this->P0N[i] <<" "<<this->P1P[i]<<" "<<this->P1N[i]<<"\n";
}
fh.close();
}
/**
 * Reads a model from a text file with the layout written by saveModel():
 * a header line "m Cpos Cneg prior allZeroSumPos allZeroSumNeg" followed by
 * m lines "isKnown P0P P0N P1P P1N".
 *
 * Any previously held per-feature parameters are discarded. The decision
 * threshold is not part of the file and is reset to 0, the same value
 * trainOnline() ends with.
 *
 * Errors are reported through reportError(): missing file, missing or short
 * header, negative feature count, missing or short feature line.
 *
 * @param fileName  Path of the model file.
 */
void CBinaryBayes::loadModel(std::string fileName) {
    if(!fileExists(fileName))
        reportError("Couldn't load model file.");
    std::ifstream input (fileName.c_str());
    std::string line;
    //Get general data from first line
    if(!getline(input, line)) {
        reportError("Model file is empty or unreadable.");
        return;
    }
    std::vector<std::string> header = lineToTokens(line.c_str(),' ');
    if(header.size() < 6) {
        reportError("Model file header is malformed.");
        return;
    }
    this->m = lexical_cast<int,std::string>(header[0]);
    this->Cpos = lexical_cast<double,std::string>(header[1]);
    this->Cneg = lexical_cast<double,std::string>(header[2]);
    this->prior= lexical_cast<double,std::string>(header[3]);
    this->allZeroSumPos = lexical_cast<double,std::string>(header[4]);
    this->allZeroSumNeg = lexical_cast<double,std::string>(header[5]);
    if(this->m < 0) {
        reportError("Model file header contains a negative number of features.");
        return;
    }
    //Input feature priors
    this->isKnown.clear();
    this->P0N.clear();
    this->P0P.clear();
    this->P1N.clear();
    this->P1P.clear();
    this->P0N.reserve(m);
    this->P0P.reserve(m);
    this->P1N.reserve(m);
    this->P1P.reserve(m);
    this->isKnown.reserve(m);
    for(int i = 0; i < m; i++) {
        if(!getline(input, line)) {
            reportError("Model file ends before all features were read.");
            return;
        }
        std::vector<std::string> fields = lineToTokens(line.c_str(),' ');
        if(fields.size() < 5) {
            reportError("Model file contains a malformed feature line.");
            return;
        }
        this->isKnown.push_back(lexical_cast<bool,std::string>(fields[0]));
        this->P0P.push_back(lexical_cast<double,std::string>(fields[1]));
        this->P0N.push_back(lexical_cast<double,std::string>(fields[2]));
        this->P1P.push_back(lexical_cast<double,std::string>(fields[3]));
        this->P1N.push_back(lexical_cast<double,std::string>(fields[4]));
    }
    input.close();
    //predict() reads the threshold; give it the value a freshly trained model has
    this->treshold = 0;
}
//Destructor: performs no explicit clean-up.
CBinaryBayes::~CBinaryBayes() {}