-
Notifications
You must be signed in to change notification settings - Fork 1
/
sgd_static_kdtree.cpp
268 lines (217 loc) · 8.78 KB
/
sgd_static_kdtree.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
/**
* @file
* @author Danny Bickson
* @version 1.0
*
* @section LICENSE
*
* Copyright [2012] [Carnegie Mellon University]
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*
* @section DESCRIPTION
*
* Matrix factorization with the Stochastic Gradient Descent (SGD) algorithm.
* Algorithm is described in the papers:
* 1) Matrix Factorization Techniques for Recommender Systems Yehuda Koren, Robert Bell, Chris Volinsky. In IEEE Computer, Vol. 42, No. 8. (07 August 2009), pp. 30-37.
* 2) Takács, G, Pilászy, I., Németh, B. and Tikk, D. (2009). Scalable Collaborative Filtering Approaches for Large Recommender Systems. Journal of Machine Learning Research, 10, 623-656.
*
*
*/
#include "eigen_wrapper.hpp"
#include "common.hpp"
// SGD hyper-parameters. Defaults below; each can be overridden on the
// command line (read via get_option_float in main()).
double sgd_lambda = 1e-3; //sgd regularization (weight decay strength)
double sgd_gamma = 1e-3; //sgd step size (learning rate; decays each iteration)
double sgd_step_dec = 0.9; //sgd step decrement (gamma is multiplied by this after every iteration)
struct vertex_data {
vec pvec; //storing the feature vector
vertex_data() {
pvec = zeros(D);
}
void set_val(int index, float val){
pvec[index] = val;
}
float get_val(int index){
return pvec[index];
}
};
#include "util.hpp"
/**
* Type definitions. Remember to create suitable graph shards using the
* Sharder-program.
*/
typedef vertex_data VertexDataType;
typedef float EdgeDataType; // Edges store the "rating" of user->movie pair
// Engine handles: created in main() and handed to the rmse/topn helper
// routines (run_validation, run_kd_topn_program).
graphchi_engine<VertexDataType, EdgeDataType> * pengine = NULL;
graphchi_engine<VertexDataType, EdgeDataType> * pvalidation_engine = NULL;
// All latent factor vectors kept in memory; indexed by vertex id
// (presumably users occupy [0, M) and items [M, M+N) — see output_sgd_result).
std::vector<vertex_data> latent_factors_inmem;
#include "rmse.hpp"
#include "rmse_engine.hpp"
#include "topn_engine.hpp"
#include "topn_engine_kd2.hpp"
#include "io.hpp"
#include "kdtree.hpp"
/**
 * Compute the predicted rating for a user/item pair and its squared error.
 *
 * @param user       factor vector of the user node
 * @param movie      factor vector of the item node
 * @param rating     observed rating for this edge
 * @param prediction output: clipped dot-product prediction
 * @param extra      unused (kept for the shared prediction-function signature)
 * @return squared prediction error (rating - prediction)^2
 */
float sgd_predict(const vertex_data& user,
const vertex_data& movie,
const float rating,
double & prediction,
void * extra = NULL){
// Prediction is the inner product of the two latent factor vectors.
double raw = dot_prod(user.pvec, movie.pvec);
// Clip into the allowed rating range [minval, maxval]; the lower bound is
// applied last, matching the original min-then-max ordering.
if (raw > maxval)
raw = maxval;
if (raw < minval)
raw = minval;
prediction = raw;
// Squared error of the (clipped) prediction.
float err = rating - prediction;
assert(!std::isnan(err));
return err * err;
}
/**
 * GraphChi programs need to subclass GraphChiProgram<vertex-type, edge-type>
 * class. The main logic is usually in the update function.
 *
 * This program performs one pass of stochastic gradient descent per
 * iteration: for every (user, item) rating edge it takes a gradient step
 * on both the item and the user factor vectors.
 */
struct SGDVerticesInMemProgram : public GraphChiProgram<VertexDataType, EdgeDataType> {
mutex lock;
/**
 * Called before an iteration is started.
 * Resets the per-thread RMSE accumulators for this pass.
 */
void before_iteration(int iteration, graphchi_context &gcontext) {
reset_rmse(gcontext.execthreads);
}
/**
 * Called after an iteration has finished.
 * Decays the learning rate, then reports training RMSE and runs the
 * validation engine (if one was configured).
 */
void after_iteration(int iteration, graphchi_context &gcontext) {
sgd_gamma *= sgd_step_dec;
training_rmse(iteration, gcontext);
run_validation(pvalidation_engine, gcontext);
// Disabled diagnostic: prints the per-dimension bounding box of the item
// factor vectors (useful for sanity-checking the kd-tree index below).
/* std::vector<double> lbo (D,0);
std::vector<double> rbo (D,1);
double copy[N];
for(int j = 0; j < D; j++)
{
for(unsigned int i = M; i < M + N; i++)
copy[i - M] = latent_factors_inmem.at(i).pvec[j];
std::sort(copy, copy+N);
lbo.at(j) = copy[0];
rbo.at(j) = copy[N-1];
}
std::cout << "left bound: (";
for(int i = 0; i < D; i++)
std::cout << lbo.at(i) << ", ";
std::cout << "). right bound: (";
for(int i = 0; i < D; i++)
std::cout << rbo.at(i) << ", ";
std::cout << ")." << std::endl;*/
}
/**
 * Vertex update function.
 * Only vertices with out-edges (user nodes) do work; for each rating edge
 * the current squared error is accumulated into the per-thread RMSE, then
 * a regularized SGD step is taken on the item and user factors.
 */
void update(graphchi_vertex<VertexDataType, EdgeDataType> &vertex, graphchi_context &gcontext) {
//go over all user nodes
if ( vertex.num_outedges() > 0){
vertex_data & user = latent_factors_inmem[vertex.id()];
//go over all ratings
for(int e=0; e < vertex.num_edges(); e++) {
float observation = vertex.edge(e)->get_data();
vertex_data & movie = latent_factors_inmem[vertex.edge(e)->vertex_id()];
double estScore;
// sgd_predict clips estScore into [minval, maxval] and returns the
// squared error, which feeds the training RMSE.
rmse_vec[omp_get_thread_num()] += sgd_predict(user, movie, observation, estScore);
double err = observation - estScore;
if (std::isnan(err) || std::isinf(err))
logstream(LOG_FATAL)<<"SGD got into numerical error. Please tune step size using --sgd_gamma and sgd_lambda" << std::endl;
// lock.lock();
// std::cout << err << std::endl;
// lock.unlock();
//NOTE: the following code is not thread safe, since potentially several
//user nodes may updates this item gradient vector concurrently. However in practice it
//did not matter in terms of accuracy on a multicore machine.
//if you like to defend the code, you can define a global variable
//mutex mymutex;
//
//and then do: mymutex.lock()
movie.pvec += sgd_gamma*(err*user.pvec - sgd_lambda*movie.pvec);
//and here add: mymutex.unlock();
// NOTE: order matters here — the user step below intentionally uses the
// movie vector that was just updated on the previous line.
user.pvec += sgd_gamma*(err*movie.pvec - sgd_lambda*user.pvec);
}
}
}
};
/**
 * Dump the trained factor matrices to disk in matrix market format:
 * <filename>_U.mm holds the M user rows, <filename>_V.mm the N item rows,
 * each row containing the D factors of one node.
 */
void output_sgd_result(std::string filename) {
std::string user_file = filename + "_U.mm";
std::string item_file = filename + "_V.mm";
MMOutputter_mat<vertex_data> user_mat(user_file, 0, M, "This file contains SGD output matrix U. In each row D factors of a single user node.", latent_factors_inmem);
MMOutputter_mat<vertex_data> item_mat(item_file, M ,M+N, "This file contains SGD output matrix V. In each row D factors of a single item node.", latent_factors_inmem);
logstream(LOG_INFO) << "SGD output files (in matrix market format): " << user_file <<
", " << item_file << " " << std::endl;
}
/**
 * Entry point: parses options, shards/loads the training (and optional
 * validation) matrix, trains SGD factors with GraphChi, builds a kd-tree
 * over the item factors and runs a top-N retrieval pass, timing each stage
 * and appending the timings to the `result` file.
 */
int main(int argc, const char ** argv) {
// print_copyright();
write_copyright();
//* GraphChi initialization will read the command line arguments and the configuration file. */
graphchi_init(argc, argv);
/* Metrics object for keeping track of performance counters
and other information. Currently required. */
metrics m("sgd-inmemory-factors");
/* Basic arguments for application. NOTE: File will be automatically 'sharded'. */
sgd_lambda = get_option_float("sgd_lambda", 1e-3);
sgd_gamma = get_option_float("sgd_gamma", 1e-3);
sgd_step_dec = get_option_float("sgd_step_dec", 0.9);
int file_format = get_option_int("ff", 3);
parse_command_line_args();
parse_implicit_command_line();
/* Preprocess data if needed, or discover preprocess files */
int nshards = convert_matrixmarket<EdgeDataType>(training, 0, 0, file_format, TRAINING, false);
// Allocate M+N factor vectors; randomize them unless factors are loaded below.
init_feature_vectors<std::vector<vertex_data> >(M+N, latent_factors_inmem, !load_factors_from_file);
if (validation != ""){
int vshards = convert_matrixmarket<EdgeDataType>(validation, 0, 0, 3, VALIDATION, false);
init_validation_rmse_engine<VertexDataType, EdgeDataType>(pvalidation_engine, vshards, &sgd_predict);
}
/* load initial state from disk (optional) */
if (load_factors_from_file){
load_matrix_market_matrix(training + "_U.mm", 0, D);
load_matrix_market_matrix(training + "_V.mm", M, D);
}
print_config();
/* Run */
SGDVerticesInMemProgram program;
graphchi_engine<VertexDataType, EdgeDataType> engine(training, nshards, false, m);
set_engine_flags(engine);
pengine = &engine;
timer train_timer;
engine.run(program, niters);
// std::cout << "Trn Time for file test: " << std::setw(10) << train_timer.current_time() / niters << std::endl;
// Append "<D> <training time>" to the results file (opened in append mode).
std::ofstream ofs(result.c_str(), std::ofstream::out | std::ofstream::app);
ofs << D << " " << train_timer.current_time() << " ";
/* Run TopN program */
// NOTE(review): option name "n_int" is likely a typo for "n_top" — left
// unchanged because renaming it would break existing command lines; confirm
// against the scripts that invoke this binary.
n_top = get_option_int("n_int", 10);
/*timer test_timer1;
ofs << test_timer1.current_time() << " ";*/
//run_general_topn_program(pengine, &latent_factors_inmem, &sgd_predict);
// Time the kd-tree index construction over the item factor vectors.
timer index_timer;
kd_Node* mroot = init_kdtree(&latent_factors_inmem);
ofs << index_timer.current_time() << " ";
timer test_timer;
/* construct kd tree index */
// ofs << "constructing index: " << test_timer.current_time() << " ";
// Time the kd-tree based top-N retrieval pass.
run_kd_topn_program(pengine, &latent_factors_inmem, mroot);
// std::coua << "Tst Time: " << std::setw(10) << test_timer.current_time() << std::endl;
ofs << test_timer.current_time() << std::endl;
ofs.close();
/* Output latent factor matrices in matrix-market format */
output_sgd_result(training);
test_predictions(&sgd_predict);
/* Report execution metrics */
if (!quiet)
metrics_report(m);
return 0;
}