/**
 * Reads a sparse document/token matrix from a text file into `data`.
 *
 * Lines consumed by this function (0-based line numbers):
 *   - line 1: two integers, "<numDocs> <numTokens>".
 *   - lines 3 .. numDocs + 2: one document per line. The first number is the
 *     document's class; it is followed by (delta, value) pairs. The deltas are
 *     accumulated into a running column index, and `value` is stored at that
 *     column. A trailing number that has no partner is ignored.
 * Lines 0 and 2 are not read here.
 *
 * On return `data.numDocs`, `data.numTokens`, `data.classifications`
 * (one entry per document) and `data.features` (numDocs x numTokens, dense)
 * are filled in.
 *
 * Malformed input (too few lines, an empty document line, a negative count,
 * or a column index outside [0, numTokens)) is reported through `assert`,
 * matching the checks this function already performed on the header.
 *
 * @param data      Destination dataset; overwritten.
 * @param filename  Path handed to loadFile().
 */
void readTokenMatrix(Dataset& data, const char* filename) {
    std::vector<string> lines;
    loadFile(lines, filename);
    assert(lines.size() > 1);

    // Header: document count and vocabulary size.
    std::vector<int> counts(2);
    tokenizeLineIntoIntVector(lines[1], counts);
    assert(counts.size() == 2);
    assert(counts[0] >= 0 && counts[1] >= 0);
    data.numDocs = counts[0];
    data.numTokens = counts[1];

    // Document rows start at line 3; make sure the file really contains
    // as many of them as the header promised before indexing into `lines`.
    const uint endDocLine = data.numDocs + 3;
    assert(lines.size() >= endDocLine);

    vec trainClasses(data.numDocs);
    mat features(data.numDocs, data.numTokens);
    rowvec row(data.numTokens);

    for (uint i = 3; i < endDocLine; i++) {
        std::vector<double> tokens(getNumTokens(lines[i]));
        tokenizeLineIntoDoubleVector(lines[i], tokens);
        assert(!tokens.empty());

        trainClasses(i - 3) = tokens[0];

        row.zeros();
        int cumsum = 0;
        // `j + 1 < size()` instead of `j < size() - 1`: identical for
        // non-empty input, but cannot wrap around when `tokens` is empty.
        for (uint j = 1; j + 1 < tokens.size(); j += 2) {
            cumsum += tokens[j];
            // operator[] is used unchecked below, so validate the column here.
            assert(cumsum >= 0 && cumsum < counts[1]);
            row[cumsum] = tokens[j + 1];
        }

        Matrix::setMatrixRowToVector(features, i - 3, row);
    }

    data.classifications = trainClasses;
    data.features = features;
}
Impl(const std::string& learnpath, const std::string& allchars) { std::set<char> goodChars(allchars.begin(), allchars.end()); std::ifstream train((learnpath + "/train.txt").c_str()); std::vector< std::pair<char, std::string> > samples; char symbol; std::string imageFile; while (train >> symbol >> imageFile) { if (goodChars.find(symbol) == goodChars.end()) continue; samples.push_back(std::make_pair(symbol, imageFile)); } cv::Mat trainData(samples.size(), FEATURE_COUNT, cv::DataType<float>::type); cv::Mat trainClasses(samples.size(), 1, cv::DataType<float>::type); for (size_t i = 0; i < samples.size(); ++i) { std::string path = learnpath + samples[i].second; cv::Mat inp = cv::imread(path, 0), out, canny; if (inp.empty()) continue; cv::Canny(inp, canny, 100, 50, 3); std::vector< std::vector<cv::Point> > contours; cv::findContours(canny, contours, CV_RETR_LIST, CV_CHAIN_APPROX_SIMPLE, cv::Point(0, 0)); int maxx = -1, maxy = -1, minx = 1e9, miny = 1e9; for (size_t j = 0; j < contours.size(); ++j) { cv::Rect r = cv::boundingRect(contours[j]); if (r.x + r.width > maxx) maxx = r.x + r.width; if (r.y + r.height > maxy) maxy = r.y + r.height; if (r.x < minx) minx = r.x; if (r.y < miny) miny = r.y; } cv::Rect bound(minx, miny, maxx - minx, maxy - miny); cv::resize(cv::Mat(inp, bound), out, cv::Size(10, 16), 0, 0, cv::INTER_CUBIC); for (int j = 0; j < FEATURE_COUNT; ++j) trainData.at<float>(i, j) = out.data[j]; trainClasses.at<float>(i, 0) = samples[i].first; } oracle.train(trainData, trainClasses); }