/
train.cpp
223 lines (195 loc) · 7.71 KB
/
train.cpp
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
#include "mnistreader.h"
#include "predictor.h"
#include <cstdlib>
#include <fenv.h>
#include <fstream>
#include <iostream>
#include <limits>
#include <random>
#include <stdexcept>
#include <string>
#include <utility>
#include <vector>
#include <Eigen/Core>
// Trainer for the network declared in Predictor: reuses the inherited layer
// sizes, weight/bias parameters, input2vec() and feedforward(), and adds
// minibatch backpropagation with AdaGrad per-parameter learning rates.
// Inheritance is protected: a Trainee is not usable as a Predictor by
// callers; it only borrows the implementation.
class Trainee : protected Predictor{
public:
// Ground-truth label of one training sample (MNIST digit, 0-9).
using AnswerType = int;
// n_hid1/n_hid2: widths of the two hidden layers; init_sigma: standard
// deviation of the Gaussian used to initialize the weights.
Trainee(int n_hid1, int n_hid2, float init_sigma);
// Apply one AdaGrad-scaled gradient step computed from `minibatch`.
void train(std::vector<std::pair<InputType, AnswerType>> minibatch, float learning_rate);
// Serialize the trained parameters to `traindatapath`; false on I/O failure.
bool dump(const char *traindatapath);
private:
/* For AdaGrad: running sums of squared gradients, one accumulator per
   weight matrix / bias vector, same shapes as the parameters. */
Eigen::ArrayXXf gsq_w1;
Eigen::ArrayXf gsq_b1;
Eigen::ArrayXXf gsq_w2;
Eigen::ArrayXf gsq_b2;
Eigen::ArrayXXf gsq_w3;
Eigen::ArrayXf gsq_b3;
};
/*
 * Construct a trainee network: n_hid1/n_hid2 neurons in the hidden layers,
 * the input sized to one MNIST image, 10 outputs (one per digit).
 * Weights are drawn i.i.d. from N(0, init_sigma^2); biases and the AdaGrad
 * squared-gradient accumulators start at zero.
 */
Trainee::Trainee(int n_hid1, int n_hid2, float init_sigma)
{
    n_inputvec = MNISTreader::pixelSize;
    n_hid1vec = n_hid1;
    n_hid2vec = n_hid2;
    n_outputvec = 10;

    gsq_w1 = Eigen::ArrayXXf::Zero(n_hid1vec, n_inputvec);
    gsq_b1 = Eigen::ArrayXf::Zero(n_hid1vec);
    gsq_w2 = Eigen::ArrayXXf::Zero(n_hid2vec, n_hid1vec);
    gsq_b2 = Eigen::ArrayXf::Zero(n_hid2vec);
    gsq_w3 = Eigen::ArrayXXf::Zero(n_outputvec, n_hid2vec);
    gsq_b3 = Eigen::ArrayXf::Zero(n_outputvec);

    std::random_device rd;
    std::mt19937 mt(rd());
    std::normal_distribution<float> nd(0.0f, init_sigma);
    // Fill the weight matrices with Gaussian samples via Eigen's NullaryExpr
    // instead of hand-rolled per-element loops.
    auto gauss = [&nd, &mt]() { return nd(mt); };
    weight1 = Eigen::MatrixXf::NullaryExpr(n_hid1vec, n_inputvec, gauss);
    bias1 = Eigen::VectorXf::Zero(n_hid1vec);
    weight2 = Eigen::MatrixXf::NullaryExpr(n_hid2vec, n_hid1vec, gauss);
    bias2 = Eigen::VectorXf::Zero(n_hid2vec);
    weight3 = Eigen::MatrixXf::NullaryExpr(n_outputvec, n_hid2vec, gauss);
    bias3 = Eigen::VectorXf::Zero(n_outputvec);
}
/*
 * Perform one parameter update from a minibatch of (image, label) samples.
 * Gradients are accumulated over the whole minibatch and averaged; each
 * parameter's step is scaled by learning_rate / sqrt(accumulated squared
 * gradient) (AdaGrad).
 */
void Trainee::train(std::vector<std::pair<InputType, AnswerType>> minibatch, float learning_rate)
{
    // Per-minibatch gradient accumulators, same shapes as the parameters.
    Eigen::MatrixXf dweight3 = Eigen::MatrixXf::Zero(n_outputvec, n_hid2vec);
    Eigen::VectorXf dbias3 = Eigen::VectorXf::Zero(n_outputvec);
    Eigen::MatrixXf dweight2 = Eigen::MatrixXf::Zero(n_hid2vec, n_hid1vec);
    Eigen::VectorXf dbias2 = Eigen::VectorXf::Zero(n_hid2vec);
    Eigen::MatrixXf dweight1 = Eigen::MatrixXf::Zero(n_hid1vec, n_inputvec);
    Eigen::VectorXf dbias1 = Eigen::VectorXf::Zero(n_hid1vec);
    /* AdaGrad division; the lhs != 0 test guards against 0/0 for parameters
       that have never received any gradient. */
    auto fn = [](float lhs, float rhs) -> float { return lhs != 0.0f ? lhs / rhs : 0.0f; };
    for(const auto &sample : minibatch){ // const ref: don't copy each image
        Eigen::VectorXf inputvec = input2vec(sample.first);
        Eigen::VectorXf z1 = feedforward(inputvec, 1);
        // NOTE(review): each feedforward() call below restarts from the
        // input; z2 and the layer-3 output could be derived incrementally
        // from z1 to avoid redundant forward passes.
        Eigen::VectorXf z2 = feedforward(inputvec, 2);
        // Output-layer delta: network output minus the one-hot target.
        Eigen::VectorXf delta3 = feedforward(inputvec, 3);
        delta3(sample.second) -= 1.0f;
        {
            Eigen::ArrayXXf e = delta3 * z2.transpose();
            gsq_w3 += e * e;
            gsq_b3 += delta3.array() * delta3.array();
            dweight3 += e.matrix();
            dbias3 += delta3;
        }
        // Second hidden layer delta. The ReLU derivative (1 for z >= 0,
        // else 0) does not depend on k, so hoist it out of the inner sum
        // and skip the sum entirely when it is zero.
        Eigen::VectorXf delta2 = Eigen::VectorXf::Zero(n_hid2vec);
        for(int j=0;j<n_hid2vec;j++){
            if(z2(j) >= 0.f)
                for(int k=0;k<n_outputvec;k++) delta2(j) += delta3(k) * weight3(k, j);
        }
        {
            Eigen::ArrayXXf e = delta2 * z1.transpose();
            gsq_w2 += e * e;
            gsq_b2 += delta2.array() * delta2.array();
            dweight2 += e.matrix();
            dbias2 += delta2;
        }
        // First hidden layer delta, with the same hoisting.
        Eigen::VectorXf delta1 = Eigen::VectorXf::Zero(n_hid1vec);
        for(int j=0;j<n_hid1vec;j++){
            if(z1(j) >= 0.f)
                for(int k=0;k<n_hid2vec;k++) delta1(j) += delta2(k) * weight2(k, j);
        }
        {
            Eigen::ArrayXXf e = delta1 * inputvec.transpose();
            gsq_w1 += e * e;
            gsq_b1 += delta1.array() * delta1.array();
            dweight1 += e.matrix();
            dbias1 += delta1;
        }
    }
    // AdaGrad step: mean gradient divided element-wise by the root of the
    // accumulated squared gradients.
    weight1 -= dweight1.binaryExpr(gsq_w1.sqrt().matrix(), fn) * learning_rate / minibatch.size();
    bias1 -= dbias1.binaryExpr(gsq_b1.sqrt().matrix(), fn) * learning_rate / minibatch.size();
    weight2 -= dweight2.binaryExpr(gsq_w2.sqrt().matrix(), fn) * learning_rate / minibatch.size();
    bias2 -= dbias2.binaryExpr(gsq_b2.sqrt().matrix(), fn) * learning_rate / minibatch.size();
    weight3 -= dweight3.binaryExpr(gsq_w3.sqrt().matrix(), fn) * learning_rate / minibatch.size();
    bias3 -= dbias3.binaryExpr(gsq_b3.sqrt().matrix(), fn) * learning_rate / minibatch.size();
}
bool Trainee::dump(const char *traindatapath)
{
std::ofstream dat(traindatapath);
if(!dat.good()) return false;
dat << n_inputvec << ' ' << n_hid1vec << ' ' << n_hid2vec << ' ' << n_outputvec << '\n';
int i, j;
for(i=0;i<n_hid1vec;i++){
for(j=0;j<n_inputvec-1;j++) dat << weight1(i, j) << ' ';
dat << weight1(i, j) << '\n';
}
for(j=0;j<n_hid1vec-1;j++) dat << bias1(j) << ' ';
dat << bias1(j) << '\n';
for(i=0;i<n_hid2vec;i++){
for(j=0;j<n_hid1vec-1;j++) dat << weight2(i, j) << ' ';
dat << weight2(i, j) << '\n';
}
for(j=0;j<n_hid2vec-1;j++) dat << bias2(j) << ' ';
dat << bias2(j) << '\n';
for(i=0;i<n_outputvec;i++){
for(j=0;j<n_hid2vec-1;j++) dat << weight3(i, j) << ' ';
dat << weight3(i, j) << '\n';
}
for(j=0;j<n_outputvec-1;j++) dat << bias3(j) << ' ';
dat << bias3(j) << '\n';
if(!dat.good()) return false;
return true;
}
/*
 * Train a network on the given MNIST images/labels files and dump the
 * resulting parameters to the file "traindata" in the working directory.
 * Progress is printed to stdout; a dump failure is reported on stderr.
 * @throws std::invalid_argument when the MNIST files cannot be read.
 */
void train(const char *imagespath, const char *labelspath, int n_hid1, int n_hid2, float sigma, float epsilon)
{
    MNISTreader reader(imagespath, labelspath);
    Trainee trainee(n_hid1, n_hid2, sigma);
    //feenableexcept(FE_DIVBYZERO | FE_INVALID | FE_OVERFLOW);
    if(reader.isError()){
        std::string errstring;
        switch(reader.getError()){
        case MNISTreaderError::imagesFileIOFailure:
            errstring = "Failed to open ";
            errstring += imagespath;
            break;
        case MNISTreaderError::labelsFileIOFailure:
            errstring = "Failed to open ";
            errstring += labelspath;
            break;
        case MNISTreaderError::imagesFileMalformed:
        case MNISTreaderError::unexpectedPixelsSize:
            errstring = "Broken or invalid file ";
            errstring += imagespath;
            break;
        case MNISTreaderError::labelsFileMalformed:
            errstring = "Broken or invalid file ";
            errstring += labelspath;
            break;
        case MNISTreaderError::differentNumberOfImage:
            errstring = imagespath;
            errstring += " and ";
            errstring += labelspath;
            errstring += " are not corresponding";
            break;
        default:
            // Keep the message non-empty even for error codes added later.
            errstring = "Unknown MNIST reader error";
            break;
        }
        throw std::invalid_argument(errstring);
    }
    int n_trained = 0;
    while(1){
        auto minibatch = reader.minibatch(50);
        if(minibatch.size() == 0) break;
        trainee.train(minibatch, epsilon);
        // Count the actual batch size: the final batch may hold fewer
        // than 50 samples, so adding a constant would over-report.
        n_trained += static_cast<int>(minibatch.size());
        std::cout << '\r' << n_trained << "/" << reader.length() << std::flush;
    }
    std::cout << std::endl;
    // Don't silently discard a serialization failure.
    if(!trainee.dump("traindata"))
        std::cerr << "Failed to write traindata" << std::endl;
}
/*
 * Entry point. Command-line forms:
 *   train                              -- all defaults
 *   train EPSILON                      -- override the learning rate only
 *   train N_HID1 N_HID2 [EPS [SIGMA]]  -- override layer sizes and, optionally,
 *                                         the learning rate and init stddev
 * Trains on the standard MNIST training files in the working directory.
 */
int main(int argc, char **argv)
{
    // Defaults, overridable from the command line.
    int n_hid1 = 100;
    int n_hid2 = 20;
    float epsilon = 0.8f;
    float sigma = 0.2f;
    if(argc >= 3){
        n_hid1 = std::atoi(argv[1]);
        n_hid2 = std::atoi(argv[2]);
        if(argc >= 4) epsilon = std::atof(argv[3]);
        if(argc >= 5) sigma = std::atof(argv[4]);
    }else if(argc == 2){
        epsilon = std::atof(argv[1]);
    }
    train("train-images-idx3-ubyte", "train-labels-idx1-ubyte", n_hid1, n_hid2, sigma, epsilon);
    return 0;
}