int main() {
    fstream trainFile("train.csv");
    vector<vector<double> > data;

    CSV::readDataFromFile(trainFile, data, rowLength);
    std::cout << "Done reading train data. " << data.size() << " records" << std::endl;

    classifier.initContain(data);
    classifier.classify(data);
    std::cout << "Done classifying" << std::endl;

    // rowLength - 1 feature columns; presumably the last column is the target.
    regressionRunner.init(data, classifier, rowLength - 1);
    std::cout << "Done initializing regression runner" << std::endl;

    classifier.save();
    std::cout << "Done saving classifier" << std::endl;

    regressionRunner.saveData();
    std::cout << "Done saving regression data" << std::endl;
    return 0;
}
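// CSV::readDataFromFile is not shown in this listing. A minimal sketch of
// what it might look like, assuming comma-separated numeric rows of width
// rowLength; the signature is taken from the call site, the body is a guess:
#include <cstdlib>
#include <fstream>
#include <sstream>
#include <string>
#include <vector>

namespace CSV {
    void readDataFromFile(std::fstream& file, std::vector<std::vector<double> >& data, int rowLength) {
        std::string line;
        while (std::getline(file, line)) {
            std::vector<double> row;
            row.reserve(rowLength);
            std::stringstream ss(line);
            std::string cell;
            while (std::getline(ss, cell, ',')) {
                row.push_back(std::atof(cell.c_str()));
            }
            if ((int)row.size() == rowLength) { // skip malformed rows
                data.push_back(row);
            }
        }
    }
}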
bool InputReader::readInput(int argc, char** argv) {
    try {
        po::options_description generic("Generic options");
        generic.add_options()
            ("help", "produce help message")
            ("properties-file,p", po::value<std::string>(&propertiesFilename)->implicit_value(""),
             "provide filepath to file with properties")
            ("cross-validate,C", po::value<std::string>(&toCrossValidateFilename)->implicit_value(""),
             "10-fold cross-validate provided dataset")
            ("standardize,S", "standardize data in provided training and testing datasets")
        ;

        po::options_description config("Configuration options");
        config.add_options()
            ("result-file,r", po::value<std::string>(&resultFilename)->implicit_value("Results/results_knn.txt"),
             "provide filepath to file where results should be stored")
            ("training-file,t", po::value<std::string>(&trainFilename)->implicit_value("Datasets/ftrain01.txt"),
             "provide filepath to file with training samples data")
            ("testing-file,T", po::value<std::string>(&testFilename)->implicit_value("Datasets/ftest01.txt"),
             "provide filepath to file with testing samples data")
            ("classifier,c", po::value<std::string>(&classifierName)->implicit_value("Sequential_kNN"),
             "provide name of classifier to use")
            ("number-nn,k", po::value<int>(&k)->default_value(1),
             "provide number of nearest (centroid) neighbours")
            ("largest-k-to-check,l", po::value<int>(&largestK)->default_value(0),
             "provide largest number of nearest (centroid) neighbours to check")
            ("nr-load-train-samples", po::value<int>(&nrLoadTrainSamples)->default_value(0),
             "provide number of training samples to load, default (0) - all")
            ("nr-load-test-samples", po::value<int>(&nrLoadTestSamples)->default_value(0),
             "provide number of testing samples to load, default (0) - all")
            ("nr-load-sample-dims", po::value<int>(&nrLoadSampleDims)->default_value(0),
             "provide number of dimensions for each sample to load, default (0) - all")
            ("threshold", po::value<int>(&threshold)->default_value(-1),
             "provide threshold dimension for premature termination, default (-1) - all")
            ("percentmaxrobustrank", po::value<float>(&percentMaxRobustRank)->default_value(95.0),
             "provide percent of samples from training set for mRobustRank calculation in LimitedV1_kNCN and LimitedV2_kNCN, default (95.0) - all")
            ("nrSamplesInBlock", po::value<int>(&nrSamplesInBlock)->default_value(-1),
             "provide number of samples in block for CacheEfficient_kNCN, default (-1) - all samples")
        ;

        po::options_description cmdline_options;
        cmdline_options.add(generic).add(config);
        po::options_description config_file_options;
        config_file_options.add(config);
        po::options_description visible("Allowed options");
        visible.add(generic).add(config);

        po::positional_options_description p;
        p.add("properties-file", -1);

        po::store(po::command_line_parser(argc, argv).options(cmdline_options).positional(p).run(), vars);
        po::notify(vars);

        if (vars.count("help")) {
            std::cout << visible;
            exit(0);
        }

        if (vars.count("cross-validate")) {
            std::cout << "File with data to cross-validate: " << toCrossValidateFilename + ".txt" << std::endl;
            std::ifstream toCrossValidateFile(toCrossValidateFilename + ".txt");
            if (!toCrossValidateFile.is_open()) {
                std::cerr << "File with data to cross-validate does not exist." << std::endl;
                return false;
            }
            return true;
        }

        if (vars.count("properties-file")) {
            std::ifstream propertiesFile(propertiesFilename);
            if (!propertiesFile.is_open()) {
                std::cerr << "Can not open config file: " << propertiesFilename << std::endl;
                vars.clear();
                std::cerr << "Reading properties unsuccessful." << std::endl;
                return false;
            } else {
                store(parse_config_file(propertiesFile, config_file_options), vars);
                notify(vars);
            }
        }

        // Check file with training samples
        if (vars.count("training-file")) {
            std::cout << "Training file: " << trainFilename << std::endl;
            std::ifstream trainFile(trainFilename);
            if (!trainFile.is_open()) {
                std::cerr << "File with training samples data does not exist." << std::endl;
                return false;
            }
        }

        // Check file with testing samples
        if (vars.count("testing-file")) {
            std::cout << "Testing file: " << testFilename << std::endl;
            std::ifstream testFile(testFilename);
            if (!testFile.is_open()) {
                std::cerr << "File with test samples data does not exist." << std::endl;
                return false;
            }
        }

        std::cout << "Results file: " << resultFilename << std::endl;

        if (vars.count("standardize")) {
            isStandardizationEnabled = true;
        }

        EnumParser<ClassifierType> parser;
        classifier = parser.ParseEnum(classifierName);

        // Number of k nearest neighbours
        if (k < 1) {
            std::cerr << "Number of k nearest neighbours must be larger than or equal to 1." << std::endl;
            return false;
        }
        // Number of training samples to read
        if (nrLoadTrainSamples < 0) {
            std::cerr << "Number of training samples to load must not be negative. (Default: 0 == all)" << std::endl;
            return false;
        }
        // Number of testing samples to read
        if (nrLoadTestSamples < 0) {
            std::cerr << "Number of test samples to read must not be negative. (Default: 0 == all)" << std::endl;
            return false;
        }
        // Number of dimensions to read
        if (nrLoadSampleDims < 0) {
            std::cerr << "Number of test sample dims to read must not be negative. (Default: 0 == all)" << std::endl;
            return false;
        }
    } catch (std::exception& e) {
        std::cerr << e.what() << std::endl;
        vars.clear();
        std::cerr << "Reading properties unsuccessful." << std::endl;
        return false;
    }

    std::cout << "Reading properties successful." << std::endl;
    return true;
}
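// EnumParser<ClassifierType> is referenced above but not defined in this
// listing. A minimal sketch, assuming a map-backed parser; the class name and
// ParseEnum come from the call site, while the enum values shown are
// illustrative stand-ins, not the project's real list:
#include <map>
#include <stdexcept>
#include <string>

enum ClassifierType { Sequential_kNN, CacheEfficient_kNCN }; // stand-in values

template <typename T>
class EnumParser {
public:
    EnumParser(); // specialised per enum type to fill the map
    T ParseEnum(const std::string& name) {
        typename std::map<std::string, T>::const_iterator it = nameToValue.find(name);
        if (it == nameToValue.end()) {
            // Throwing here lets the catch block in readInput report the error.
            throw std::runtime_error("Unknown classifier name: " + name);
        }
        return it->second;
    }
private:
    std::map<std::string, T> nameToValue;
};

template <>
EnumParser<ClassifierType>::EnumParser() {
    nameToValue["Sequential_kNN"] = Sequential_kNN;
    nameToValue["CacheEfficient_kNCN"] = CacheEfficient_kNCN;
}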
void HMM::trainGibbsFromFile(char* inputFile) {
    // Sufficient statistics accumulated over the whole file.
    double **emit_count = createMatrix(_numStates, _numObs);
    double **trans_count = createMatrix(_maxState, _numStates);
    double *state_count = new double[_numStates];
    zeroArray(state_count, _numStates);
    double *obs_count = new double[_numObs];
    zeroArray(obs_count, _numObs);
    double *sequence_count = new double[_maxState];
    zeroArray(sequence_count, _maxState);
    double *init_count = new double[_maxState];
    zeroArray(init_count, _maxState);

    ifstream trainFile(inputFile);
    string line;
    int count = 0;
    while (getline(trainFile, line)) { // for every sentence
        vector<int> words;
        stringstream ss(line);
        string buf;
        while (ss >> buf) {
            words.push_back(atoi(buf.c_str()));
        }
        int len = words.size();
        if (len < 2) continue; // rand() % (len - 1) below needs at least two tokens
        count++;

        // Initialise the hidden state of every position uniformly at random
        // (state 0 is reserved) and record the resulting counts.
        int *stateArray = new int[len];
        for (int i = 0; i < len; i++) {
            int origState = (rand() % (_numStates - 1)) + 1;
            int obs = words[i];
            int prev_sequence = 0;
            int r = 0;
            stateArray[i] = origState;
            // Encode the previous _order states as a base-_numStates number.
            while ((r < _order) && (i - 1 - r) >= 0) {
                prev_sequence += stateArray[(i - 1) - r] * int(pow(_numStates, r));
                r++;
            }
            obs_count[obs]++;
            state_count[origState]++;
            if (i == 0)
                init_count[origState]++;
            else {
                trans_count[prev_sequence][origState]++;
                sequence_count[prev_sequence]++;
            }
            emit_count[origState][obs]++;
        }

        // Resample a random number of positions with Gibbs updates.
        int sampleN = rand() % len;
        for (int i = 0; i < sampleN; i++) {
            int k = rand() % (len - 1);
            int obs = words[k];
            int prev_sequence = 0;
            int r = 0;
            while ((r < _order) && (k - 1 - r) >= 0) {
                prev_sequence += stateArray[(k - 1) - r] * int(pow(_numStates, r));
                r++;
            }
            int origState = stateArray[k];
            int nextState = stateArray[k + 1];
            int next_sequence = _numStates * (prev_sequence % int(pow(_numStates, _order - 1))) + origState;

            // Conditional distribution of the state at position k given its
            // neighbours and the current parameters.
            double *dist = new double[_numStates];
            for (int state = 0; state < _numStates; state++) {
                int state_sequence = _numStates * (prev_sequence % int(pow(_numStates, _order - 1))) + state;
                if (prev_sequence == 0)
                    dist[state] = _pObservation[state][obs] * initial_probability[state] * _pTransition[state_sequence][nextState];
                else
                    dist[state] = _pObservation[state][obs] * _pTransition[prev_sequence][state] * _pTransition[state_sequence][nextState];
            }
            renormalize(dist, _numStates);
            Distribution d(dist, _numStates);
            int sample = d.generate_sample();
            delete[] dist;

            // Move the counts from the old state to the sampled one.
            state_count[origState]--;
            if (k == 0) {
                init_count[origState]--;
                init_count[sample]++;
            } else {
                trans_count[prev_sequence][origState]--;
                trans_count[prev_sequence][sample]++;
            }
            trans_count[next_sequence][nextState]--;
            sequence_count[next_sequence]--;
            emit_count[origState][obs]--;
            stateArray[k] = sample;
            next_sequence = _numStates * (prev_sequence % int(pow(_numStates, _order - 1))) + sample;
            state_count[sample]++;
            trans_count[next_sequence][nextState]++;
            sequence_count[next_sequence]++;
            emit_count[sample][obs]++;
        }
        delete[] stateArray; // free the per-sentence state assignment
    } // end for every sentence
    trainFile.close();

    updateHMM(emit_count, trans_count, state_count, sequence_count, init_count, obs_count);

    freeMatrix(trans_count, _maxState, _numStates);
    freeMatrix(emit_count, _numStates, _numObs);
    delete[] state_count;
    delete[] obs_count;
    delete[] sequence_count;
    delete[] init_count;
}
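// renormalize() and Distribution are used by the Gibbs samplers but are not
// shown in this listing. Minimal sketches consistent with the call sites,
// assuming renormalize rescales the weights to sum to one and Distribution
// draws an index by inverse-CDF sampling; the bodies are guesses:
#include <cstdlib>

void renormalize(double* weights, int n) {
    double total = 0;
    for (int i = 0; i < n; i++) total += weights[i];
    if (total <= 0) return; // degenerate input: leave the weights untouched
    for (int i = 0; i < n; i++) weights[i] /= total;
}

class Distribution {
public:
    // Does not copy the array; callers keep it alive until sampling is done.
    Distribution(const double* probs, int n) : _probs(probs), _n(n) {}
    // Draw one index with probability proportional to _probs[i].
    int generate_sample() const {
        double u = rand() / (RAND_MAX + 1.0);
        double cumulative = 0;
        for (int i = 0; i < _n; i++) {
            cumulative += _probs[i];
            if (u < cumulative) return i;
        }
        return _n - 1; // guard against floating-point rounding drift
    }
private:
    const double* _probs;
    int _n;
};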
void HMM::trainParallel(vector<string> filesList_) {
    int count = filesList_.size();
    // C MPI API throughout (the C++ bindings were deprecated and later
    // removed from the MPI standard).
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    const int root = 0;

    // Each rank processes a contiguous slice of the file list; the last rank
    // also takes the remainder when count is not divisible by size.
    int dist = count / size;
    int start = rank * dist;
    int end = (rank == size - 1) ? count : start + dist;

    // Local sufficient statistics.
    double **emit_count = createMatrix(_numStates, _numObs);
    double **trans_count = createMatrix(_maxState, _numStates);
    double *state_count = new double[_numStates];
    zeroArray(state_count, _numStates);
    double *obs_count = new double[_numObs];
    zeroArray(obs_count, _numObs);
    double *sequence_count = new double[_maxState];
    zeroArray(sequence_count, _maxState);
    double *init_count = new double[_maxState];
    zeroArray(init_count, _maxState);

    // Buffers that receive the statistics summed across all ranks.
    double **temit_count = createMatrix(_numStates, _numObs);
    double **ttrans_count = createMatrix(_maxState, _numStates);
    double *tstate_count = new double[_numStates];
    zeroArray(tstate_count, _numStates);
    double *tobs_count = new double[_numObs];
    zeroArray(tobs_count, _numObs);
    double *tsequence_count = new double[_maxState];
    zeroArray(tsequence_count, _maxState);
    double *tinit_count = new double[_maxState];
    zeroArray(tinit_count, _maxState);

    for (int f = start; f < end; f++) {
        const char* inputFile = filesList_[f].c_str();
        ifstream trainFile(inputFile);
        string line;
        while (getline(trainFile, line)) { // for every sentence
            // Read in training sequence
            vector<int> words;
            stringstream ss(line);
            string buf;
            while (ss >> buf) {
                words.push_back(atoi(buf.c_str()));
            }
            int len = words.size();

            // Forward probability (scaled to avoid underflow).
            double **forward = createMatrix(len, _maxState);
            double *scaleArray = new double[len];
            computeForwardMatrixScaled(words, forward, scaleArray, len);

            // Backward probability, reusing the forward scale factors.
            double **backward = createMatrix(len, _maxState);
            computeBackwardMatrixScaled(words, backward, scaleArray, len);

            // Baum-Welch counts.
            for (int t = 0; t < len - 1; t++) {
                int obs = words[t];
                int next_obs = words[t + 1];
                for (int state_sequence = 0; state_sequence < _maxState; state_sequence++) {
                    int state = state_sequence % _numStates;
                    // Expected occupancy of this state sequence at time t.
                    double gamma = forward[t][state_sequence] * backward[t][state_sequence];
                    if (t == 0) {
                        init_count[state_sequence] += gamma;
                    }
                    emit_count[state][obs] += gamma;
                    obs_count[obs] += gamma;
                    state_count[state] += gamma;
                    sequence_count[state_sequence] += gamma;
                    for (int next_state = 0; next_state < _numStates; next_state++) {
                        int next_sequence = _numStates * (state_sequence % int(pow(_numStates, _order - 1))) + next_state;
                        // Expected transition count; the extra scale factor
                        // compensates for the scaling of forward and backward.
                        double eta = (forward[t][state_sequence]
                                      * _pTransition[state_sequence][next_state]
                                      * _pObservation[next_state][next_obs]
                                      * backward[t + 1][next_sequence]) / scaleArray[t + 1];
                        trans_count[state_sequence][next_state] += eta;
                    }
                }
            }
            // Emission counts for the final position (no outgoing transition).
            for (int state_sequence = 0; state_sequence < _maxState; state_sequence++) {
                int obs = words[len - 1];
                int state = state_sequence % _numStates;
                double gamma = forward[len - 1][state_sequence] * backward[len - 1][state_sequence];
                emit_count[state][obs] += gamma;
                obs_count[obs] += gamma;
                state_count[state] += gamma;
            }

            delete[] scaleArray;
            freeMatrix(forward, len, _maxState);
            freeMatrix(backward, len, _maxState);
        } // end for every sentence
        trainFile.close();
    } // for every file

    // Collect the summed statistics on root...
    for (int state_sequence = 0; state_sequence < _maxState; state_sequence++) {
        MPI_Reduce(trans_count[state_sequence], ttrans_count[state_sequence], _numStates, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
    }
    MPI_Reduce(sequence_count, tsequence_count, _maxState, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
    MPI_Reduce(init_count, tinit_count, _maxState, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
    for (int state = 0; state < _numStates; state++) {
        MPI_Reduce(emit_count[state], temit_count[state], _numObs, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
    }
    MPI_Reduce(state_count, tstate_count, _numStates, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
    MPI_Reduce(obs_count, tobs_count, _numObs, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

    // ...and broadcast them back so every rank applies the same update.
    for (int state_sequence = 0; state_sequence < _maxState; state_sequence++) {
        MPI_Bcast(ttrans_count[state_sequence], _numStates, MPI_DOUBLE, root, MPI_COMM_WORLD);
    }
    MPI_Bcast(tsequence_count, _maxState, MPI_DOUBLE, root, MPI_COMM_WORLD);
    MPI_Bcast(tinit_count, _maxState, MPI_DOUBLE, root, MPI_COMM_WORLD);
    for (int state = 0; state < _numStates; state++) {
        MPI_Bcast(temit_count[state], _numObs, MPI_DOUBLE, root, MPI_COMM_WORLD);
    }
    MPI_Bcast(tstate_count, _numStates, MPI_DOUBLE, root, MPI_COMM_WORLD);
    MPI_Bcast(tobs_count, _numObs, MPI_DOUBLE, root, MPI_COMM_WORLD);

    updateHMM(temit_count, ttrans_count, tstate_count, tsequence_count, tinit_count, tobs_count);

    freeMatrix(trans_count, _maxState, _numStates);
    freeMatrix(emit_count, _numStates, _numObs);
    delete[] state_count;
    delete[] sequence_count;
    delete[] init_count;
    delete[] obs_count;
    freeMatrix(temit_count, _numStates, _numObs);
    freeMatrix(ttrans_count, _maxState, _numStates);
    delete[] tstate_count;
    delete[] tsequence_count;
    delete[] tinit_count;
    delete[] tobs_count;
}
void HMM::trainFromFile(char* inputFile) {
    // Sufficient statistics accumulated over the whole file.
    double **emit_count = createMatrix(_numStates, _numObs);
    double **trans_count = createMatrix(_maxState, _numStates);
    double *state_count = new double[_numStates];
    zeroArray(state_count, _numStates);
    double *obs_count = new double[_numObs];
    zeroArray(obs_count, _numObs);
    double *sequence_count = new double[_maxState];
    zeroArray(sequence_count, _maxState);
    double *init_count = new double[_maxState];
    zeroArray(init_count, _maxState);

    ifstream trainFile(inputFile);
    string line;
    int count = 0;
    while (getline(trainFile, line)) { // for every sentence
        vector<int> words;
        stringstream ss(line);
        string buf;
        while (ss >> buf) {
            words.push_back(atoi(buf.c_str()));
        }
        int len = words.size();
        count++;

        // Forward probability (scaled to avoid underflow).
        double **forward = createMatrix(len, _maxState);
        double *scaleArray = new double[len];
        computeForwardMatrixScaled(words, forward, scaleArray, len);

        // Backward probability, reusing the forward scale factors.
        double **backward = createMatrix(len, _maxState);
        computeBackwardMatrixScaled(words, backward, scaleArray, len);

        // Baum-Welch counts.
        for (int t = 0; t < len - 1; t++) {
            int obs = words[t];
            int next_obs = words[t + 1];
            for (int state_sequence = 0; state_sequence < _maxState; state_sequence++) {
                int state = state_sequence % _numStates;
                // Expected occupancy of this state sequence at time t.
                double gamma = forward[t][state_sequence] * backward[t][state_sequence];
                if (t == 0) {
                    init_count[state_sequence] += gamma; // initial-state counts, mirroring trainParallel
                }
                emit_count[state][obs] += gamma;
                obs_count[obs] += gamma;
                state_count[state] += gamma;
                sequence_count[state_sequence] += gamma;
                for (int next_state = 0; next_state < _numStates; next_state++) {
                    int next_sequence = _numStates * (state_sequence % int(pow(_numStates, _order - 1))) + next_state;
                    // Expected transition count; the extra scale factor
                    // compensates for the scaling of forward and backward.
                    double eta = (forward[t][state_sequence]
                                  * _pTransition[state_sequence][next_state]
                                  * _pObservation[next_state][next_obs]
                                  * backward[t + 1][next_sequence]) / scaleArray[t + 1];
                    trans_count[state_sequence][next_state] += eta;
                }
            }
        }
        // Emission counts for the final position (no outgoing transition).
        for (int state_sequence = 0; state_sequence < _maxState; state_sequence++) {
            int obs = words[len - 1];
            int state = state_sequence % _numStates;
            double gamma = forward[len - 1][state_sequence] * backward[len - 1][state_sequence];
            emit_count[state][obs] += gamma;
            obs_count[obs] += gamma;
            state_count[state] += gamma;
        }

        delete[] scaleArray;
        freeMatrix(forward, len, _maxState);
        freeMatrix(backward, len, _maxState);
    } // end for every sentence
    trainFile.close();

    updateHMM(emit_count, trans_count, state_count, sequence_count, init_count, obs_count);

    freeMatrix(trans_count, _maxState, _numStates);
    freeMatrix(emit_count, _numStates, _numObs);
    delete[] state_count;
    delete[] obs_count;
    delete[] sequence_count;
    delete[] init_count;
}
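// updateHMM() turns the accumulated counts into new model parameters but is
// not shown above. A plausible sketch of that M-step, assuming plain
// maximum-likelihood normalisation; the real implementation may smooth the
// counts, use obs_count, or handle the initial distribution differently:
void HMM::updateHMM(double **emit_count, double **trans_count, double *state_count,
                    double *sequence_count, double *init_count, double *obs_count) {
    // Emission probabilities: expected emissions of obs from state,
    // normalised by the expected occupancy of the state.
    for (int state = 0; state < _numStates; state++) {
        if (state_count[state] <= 0) continue;
        for (int obs = 0; obs < _numObs; obs++) {
            _pObservation[state][obs] = emit_count[state][obs] / state_count[state];
        }
    }
    // Transition probabilities: expected transitions out of each state
    // sequence, normalised by the expected occupancy of the sequence.
    for (int seq = 0; seq < _maxState; seq++) {
        if (sequence_count[seq] <= 0) continue;
        for (int next = 0; next < _numStates; next++) {
            _pTransition[seq][next] = trans_count[seq][next] / sequence_count[seq];
        }
    }
    // init_count and obs_count would feed the initial distribution and
    // observation-level smoothing; both are omitted in this sketch.
    (void)init_count;
    (void)obs_count;
}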
void HMM::trainGibbsParallel(vector<string> files_list) {
    int count = files_list.size();
    int rank, size;
    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
    MPI_Comm_size(MPI_COMM_WORLD, &size);
    const int root = 0;

    // Each rank processes a contiguous slice of the file list; the last rank
    // also takes the remainder when count is not divisible by size.
    int dist = count / size;
    int start = rank * dist;
    int end = (rank == size - 1) ? count : start + dist;

    // Local sufficient statistics.
    double **emit_count = createMatrix(_numStates, _numObs);
    double **trans_count = createMatrix(_maxState, _numStates);
    double *state_count = new double[_numStates];
    zeroArray(state_count, _numStates);
    double *obs_count = new double[_numObs];
    zeroArray(obs_count, _numObs);
    double *sequence_count = new double[_maxState];
    zeroArray(sequence_count, _maxState);
    double *init_count = new double[_maxState];
    zeroArray(init_count, _maxState);

    // Buffers that receive the statistics summed across all ranks.
    double **temit_count = createMatrix(_numStates, _numObs);
    double **ttrans_count = createMatrix(_maxState, _numStates);
    double *tstate_count = new double[_numStates];
    zeroArray(tstate_count, _numStates);
    double *tobs_count = new double[_numObs];
    zeroArray(tobs_count, _numObs);
    double *tsequence_count = new double[_maxState];
    zeroArray(tsequence_count, _maxState);
    double *tinit_count = new double[_maxState];
    zeroArray(tinit_count, _maxState);

    for (int f = start; f < end; f++) {
        const char* inputFile = files_list[f].c_str();
        ifstream trainFile(inputFile);
        string line;
        while (getline(trainFile, line)) { // for every sentence
            // Read in training sequence
            vector<int> words;
            stringstream ss(line);
            string buf;
            while (ss >> buf) {
                words.push_back(atoi(buf.c_str()));
            }
            int len = words.size();
            if (len < 2) continue; // rand() % (len - 1) below needs at least two tokens

            // Initialise the hidden state of every position uniformly at
            // random (state 0 is reserved) and record the resulting counts.
            int *stateArray = new int[len];
            for (int i = 0; i < len; i++) {
                int origState = (rand() % (_numStates - 1)) + 1;
                int obs = words[i];
                int prev_sequence = 0;
                int r = 0;
                stateArray[i] = origState;
                if (i == 0)
                    init_count[origState]++;
                else {
                    // Encode the previous _order states as a base-_numStates number.
                    while ((r < _order) && (i - 1 - r) >= 0) {
                        prev_sequence += stateArray[(i - 1) - r] * int(pow(_numStates, r));
                        r++;
                    }
                    trans_count[prev_sequence][origState]++;
                    sequence_count[prev_sequence]++;
                }
                obs_count[obs]++;
                state_count[origState]++;
                emit_count[origState][obs]++;
            }

            // Resample a random number of positions with Gibbs updates.
            int sampleN = rand() % len;
            for (int i = 0; i < sampleN; i++) {
                int k = rand() % (len - 1);
                int obs = words[k];
                int prev_sequence = 0;
                int r = 0;
                while ((r < _order) && (k - 1 - r) >= 0) {
                    prev_sequence += stateArray[(k - 1) - r] * int(pow(_numStates, r));
                    r++;
                }
                int origState = stateArray[k];
                int nextState = stateArray[k + 1];
                int next_sequence = _numStates * (prev_sequence % int(pow(_numStates, _order - 1))) + nextState;

                // Conditional distribution of the state at position k given
                // its neighbours and the current parameters.
                double *dist = new double[_numStates];
                for (int state = 0; state < _numStates; state++) {
                    int state_sequence = _numStates * (prev_sequence % int(pow(_numStates, _order - 1))) + state;
                    if (prev_sequence == 0)
                        dist[state] = _pObservation[state][obs] * initial_probability[state] * _pTransition[state_sequence][nextState];
                    else
                        dist[state] = _pObservation[state][obs] * _pTransition[prev_sequence][state] * _pTransition[state_sequence][nextState];
                }
                renormalize(dist, _numStates);
                Distribution d(dist, _numStates);
                int sample = d.generate_sample();
                delete[] dist;

                // Move the counts from the old state to the sampled one.
                // Unlike trainGibbsFromFile, this variant leaves the
                // next-transition counts untouched (kept commented out below).
                if (k == 0) {
                    init_count[origState]--;
                    init_count[sample]++;
                } else {
                    trans_count[prev_sequence][origState]--;
                    trans_count[prev_sequence][sample]++;
                }
                state_count[origState]--;
                // trans_count[next_sequence][nextState]--;
                // sequence_count[next_sequence]--;
                emit_count[origState][obs]--;
                stateArray[k] = sample;
                // next_sequence = _numStates * (prev_sequence % int(pow(_numStates, _order - 1))) + sample;
                state_count[sample]++;
                // trans_count[next_sequence][nextState]++;
                // sequence_count[next_sequence]++;
                emit_count[sample][obs]++;
            }
            delete[] stateArray;
        } // end for every sentence
        trainFile.close();
    } // for every file

    // Collect the summed statistics on root...
    for (int state_sequence = 0; state_sequence < _maxState; state_sequence++) {
        MPI_Reduce(trans_count[state_sequence], ttrans_count[state_sequence], _numStates, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
    }
    MPI_Reduce(sequence_count, tsequence_count, _maxState, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
    MPI_Reduce(init_count, tinit_count, _maxState, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
    for (int state = 0; state < _numStates; state++) {
        MPI_Reduce(emit_count[state], temit_count[state], _numObs, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
    }
    MPI_Reduce(state_count, tstate_count, _numStates, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
    MPI_Reduce(obs_count, tobs_count, _numObs, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

    // ...and broadcast them back so every rank applies the same update.
    for (int state_sequence = 0; state_sequence < _maxState; state_sequence++) {
        MPI_Bcast(ttrans_count[state_sequence], _numStates, MPI_DOUBLE, root, MPI_COMM_WORLD);
    }
    MPI_Bcast(tsequence_count, _maxState, MPI_DOUBLE, root, MPI_COMM_WORLD);
    MPI_Bcast(tinit_count, _maxState, MPI_DOUBLE, root, MPI_COMM_WORLD);
    for (int state = 0; state < _numStates; state++) {
        MPI_Bcast(temit_count[state], _numObs, MPI_DOUBLE, root, MPI_COMM_WORLD);
    }
    MPI_Bcast(tstate_count, _numStates, MPI_DOUBLE, root, MPI_COMM_WORLD);
    MPI_Bcast(tobs_count, _numObs, MPI_DOUBLE, root, MPI_COMM_WORLD);

    updateHMM(temit_count, ttrans_count, tstate_count, tsequence_count, tinit_count, tobs_count);

    freeMatrix(trans_count, _maxState, _numStates);
    freeMatrix(emit_count, _numStates, _numObs);
    delete[] state_count;
    delete[] sequence_count;
    delete[] init_count;
    delete[] obs_count;
    freeMatrix(temit_count, _numStates, _numObs);
    freeMatrix(ttrans_count, _maxState, _numStates);
    delete[] tstate_count;
    delete[] tsequence_count;
    delete[] tinit_count;
    delete[] tobs_count;
}
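// A minimal driver sketch for the parallel trainers above. The MPI setup is
// standard; the HMM constructor signature, the sizes, and the fixed epoch
// count are illustrative assumptions:
#include <mpi.h>
#include <string>
#include <vector>

int main(int argc, char** argv) {
    MPI_Init(&argc, &argv);

    // Every rank gets the full file list; the trainers slice it by rank.
    std::vector<std::string> files;
    for (int i = 1; i < argc; i++) {
        files.push_back(argv[i]);
    }

    int numStates = 8, numObs = 10000, order = 2; // illustrative sizes
    HMM hmm(numStates, numObs, order);            // constructor signature is assumed

    const int epochs = 10; // fixed count; real code might test convergence instead
    for (int e = 0; e < epochs; e++) {
        hmm.trainParallel(files);        // Baum-Welch step
        // hmm.trainGibbsParallel(files); // or the Gibbs variant
    }

    MPI_Finalize();
    return 0;
}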
int main(int argc, char *argv[]) {
    QCoreApplication a(argc, argv);
    QStringList args = a.arguments();
    if (args.size() < 3) {
        QStringList usage;
        usage << args.at(0) << "[train data]" << "[test data]";
        qFatal("Too few arguments. Usage:\n%s\n", usage.join(" ").toStdString().c_str());
    }

    QFile trainFile(args.at(1));
    if (!trainFile.open(QIODevice::ReadOnly)) {
        qFatal("Failed to open train file %s.\n", trainFile.fileName().toStdString().c_str());
    }
    QFile testFile(args.at(2));
    if (!testFile.open(QIODevice::ReadOnly)) {
        qFatal("Failed to open test file %s.\n", testFile.fileName().toStdString().c_str());
    }

    QElapsedTimer loadTimer;
    loadTimer.start();
    FeatureImporter trainFeatures;
    FeatureImporter testFeatures;
    // Load both feature files concurrently; "parallel" is required here,
    // since a bare "omp sections" outside a parallel region runs serially.
    #pragma omp parallel sections
    {
        #pragma omp section
        {
            trainFeatures.open(&trainFile);
        }
        #pragma omp section
        {
            testFeatures.open(&testFile);
        }
    }
    int loadMsecs = loadTimer.elapsed();
    qDebug() << "loading took" << loadMsecs << "msecs";
    trainFile.close();
    testFile.close();

    // Map label strings to small integer class indices
    // (qint8 assumes at most 127 distinct labels).
    QVector<QString> hash;
    QVector<qint8> trainClasses;
    for (int i = 0; i < trainFeatures.labels().size(); i++) {
        qint8 index = hash.indexOf(trainFeatures.labels().at(i));
        if (index == -1) {
            hash.append(trainFeatures.labels().at(i));
            index = hash.size() - 1;
        }
        trainClasses.append(index);
    }

    ClassifierInterface *ci = new CpuClassifier();
    QVector<QVector<int> > classes;
    qDebug() << "starting classification";

    // Classify once for every k from the requested maximum down to 1.
    QList<int> k;
    bool ok = true;
    int i = 50;
    if (args.size() >= 4) {
        i = qMax(0, args.at(3).toInt(&ok));
    } else {
        ok = false;
    }
    if (!ok) {
        qDebug() << "no k given, assuming k = 50";
        i = 50;
    }
    qDebug() << "initial k:" << i;
    for (; i >= 1; i--) {
        k.append(i);
    }

    QElapsedTimer timer;
    timer.start();
    classes = ci->classify(trainFeatures.features(), testFeatures.features(),
                           trainClasses.constData(), NULL,
                           testFeatures.featuresPerItem(),
                           trainFeatures.itemCount(), testFeatures.itemCount(), k);
    delete ci;
    int msecs = timer.elapsed();
    qDebug() << "calculations took" << msecs << "msecs";

    for (int w = 0; w < classes.size(); w++) {
        int correct = 0;
        QVector<QVector<qreal> > confusionMatrix;
        confusionMatrix.resize(hash.size());
        for (int i = 0; i < confusionMatrix.size(); i++) {
            confusionMatrix[i].resize(hash.size());
        }
        for (int i = 0; i < classes.at(w).size(); i++) {
            confusionMatrix[hash.indexOf(testFeatures.labels().at(i))][classes.at(w).at(i)]++;
            if (hash.at(classes.at(w).at(i)) == testFeatures.labels().at(i)) {
                correct++;
            }
        }

        // Reorder rows and columns of the confusion matrix alphabetically by label.
        QVector<QPair<QString, int> > sorter;
        for (int i = 0; i < hash.size(); i++) {
            sorter << qMakePair(hash.at(i), i);
        }
        qSort(sorter);
        QStringList l;
        for (int i = 0; i < hash.size(); i++) {
            l << sorter.at(i).first;
        }
        QVector<QVector<qreal> > tempConfusionMatrix;
        tempConfusionMatrix.resize(hash.size());
        for (int j = 0; j < confusionMatrix.size(); j++) {
            for (int i = 0; i < sorter.size(); i++) {
                tempConfusionMatrix[j] << confusionMatrix.at(j).at(sorter.at(i).second);
            }
        }
        confusionMatrix = tempConfusionMatrix;
        for (int j = 0; j < confusionMatrix.size(); j++) {
            tempConfusionMatrix[j] = confusionMatrix.at(sorter.at(j).second);
        }
        confusionMatrix = tempConfusionMatrix;

#ifdef PERCENTAGE_CONFUSION
        // Convert each column to percentages of its column sum.
        for (int i = 0; i < confusionMatrix.size(); i++) {
            qreal sum = 0;
            for (int j = 0; j < confusionMatrix.at(i).size(); j++) {
                sum += confusionMatrix.at(j).at(i);
            }
            for (int j = 0; j < confusionMatrix.at(i).size(); j++) {
                confusionMatrix[j][i] = confusionMatrix.at(j).at(i) / sum * 100.0;
            }
        }
#endif

        // Emit the confusion matrix as LaTeX table rows.
        QTextStream stream(stdout);
        stream << "k: " << k.at(w) << endl;
        stream << "\t&\t" << l.join("\t&\t") << "\\\\" << endl;
        for (int i = 0; i < confusionMatrix.size(); i++) {
            QStringList list;
            list << sorter.at(i).first;
            for (int j = 0; j < confusionMatrix.size(); j++) {
                list << QString::number(confusionMatrix[i][j], 'g', 4);
            }
            const QString joined(list.join("\t&\t"));
            stream << joined << "\\\\" << endl;
        }
        stream << "correct: " << ((float)correct / (float)classes.at(w).size()) * 100 << "%" << endl;
    }
    msecs = timer.elapsed();
    qDebug() << "everything took" << msecs << "msecs";
    return 0;
}
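// ClassifierInterface and CpuClassifier are only visible here through the
// call above. A minimal sketch of the interface consistent with that call
// site; the pointer element types (float features, qint8 labels) are
// assumptions:
#include <QList>
#include <QVector>

class ClassifierInterface {
public:
    virtual ~ClassifierInterface() {}
    // Returns one result vector per requested k, holding the predicted
    // class index of every test item.
    virtual QVector<QVector<int> > classify(const float *trainFeatures,
                                            const float *testFeatures,
                                            const qint8 *trainClasses,
                                            const qint8 *testClasses, // may be NULL
                                            int featuresPerItem,
                                            int trainItemCount,
                                            int testItemCount,
                                            const QList<int> &k) = 0;
};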