int main()
{
    fstream trainFile("train.csv");
    vector<vector<double> > data;
    CSV::readDataFromFile(trainFile, data, rowLength);
    std::cout << "Done reading train data. " << data.size() << " records" << std::endl;

    classifier.initContain(data);
    classifier.classify(data);
    std::cout << "Done classifying" << std::endl;
    regressionRunner.init(data, classifier, rowLength - 1);
    std::cout << "Done initializing regression runner" << std::endl;

    classifier.save();
    std::cout << "Done saving classifier" << std::endl;
    regressionRunner.saveData();
    std::cout << "Done saving regression data" << std::endl;
    return 0;
}
Пример #2
0
	// Parses command-line options (and, optionally, a properties file) into
	// this reader's member fields and validates the resulting configuration.
	// Returns true on success; returns false (after printing a diagnostic to
	// stderr) on invalid or missing input.  Exits the process on --help.
	bool InputReader::readInput(int argc, char** argv) {
		try {
			// Options meaningful only on the command line.
			po::options_description generic("Generic options");
			generic.add_options()
				("help", "produce help message")
				("properties-file,p", po::value<std::string>(&propertiesFilename)->implicit_value(""),
				"provide filepath to file with properties")
				("cross-validate,C", po::value<std::string>(&toCrossValidateFilename)->implicit_value(""),	"10-fold cross-validate provided dataset")
				("standardize,S", "standardize data in provided training and testing datasets")
				;

			// Options accepted both on the command line and in the properties file.
			po::options_description config("Configuration options");
			config.add_options()
				("result-file,r", po::value<std::string>(&resultFilename)->implicit_value("Results/results_knn.txt"),
				"provide filepath to file where results should be stored")
				("training-file,t", po::value<std::string>(&trainFilename)->implicit_value("Datasets/ftrain01.txt"),
				"provide filepath to file with training samples data")
				("testing-file,T", po::value<std::string>(&testFilename)->implicit_value("Datasets/ftest01.txt"),
				"provide filepath to file with testing samples data")
				("classifier,c", po::value<std::string>(&classifierName)->implicit_value("Sequential_kNN"),
				"provide name of classifier to use")
				// Typo fix in user-visible help text: "neares" -> "nearest".
				("number-nn,k", po::value<int>(&k)->default_value(1),
				"provide number of nearest (centroid) neighbours")
				// The original help text here was a copy-paste duplicate of number-nn's.
				("largest-k-to-check,l", po::value<int>(&largestK)->default_value(0),
				"provide largest number of nearest (centroid) neighbours to check")
				("nr-load-train-samples", po::value<int>(&nrLoadTrainSamples)->default_value(0),
				"provide number of training samples to load, default (0) - all")
				("nr-load-test-samples", po::value<int>(&nrLoadTestSamples)->default_value(0),
				"provide number of testing samples to load, default (0) - all")
				("nr-load-sample-dims", po::value<int>(&nrLoadSampleDims)->default_value(0),
				"provide number of dimensions for each sample to load, default (0) - all")
				("threshold", po::value<int>(&threshold)->default_value(-1),
				"provide threshold dimension for premature termination, default (-1) - all")
				("percentmaxrobustrank", po::value<float>(&percentMaxRobustRank)->default_value(95.0),
				"provide percent of samples from training set for mRobustRank calculation in LimitedV1_kNCN and LimitedV2_kNCN, default (95.0) - all")
				("nrSamplesInBlock", po::value<int>(&nrSamplesInBlock)->default_value(-1),
				"provide number of samples in block for CacheEfficient_kNCN, default (-1) - all samples")
				;

			po::options_description cmdline_options;
			cmdline_options.add(generic).add(config);

			po::options_description config_file_options;
			config_file_options.add(config);

			po::options_description visible("Allowed options");
			visible.add(generic).add(config);

			// A bare positional argument is interpreted as the properties file.
			po::positional_options_description p;
			p.add("properties-file", -1);

			po::store(po::command_line_parser(argc, argv).
				options(cmdline_options).positional(p).run(), vars);
			po::notify(vars);

			if (vars.count("help")) {
				std::cout << visible;
				exit(0);
			}

			// Cross-validation mode: only the dataset file needs to exist;
			// none of the remaining options are validated in this mode.
			if (vars.count("cross-validate")) {
				std::cout << "File with data to cross-validate: " << toCrossValidateFilename  + ".txt" << std::endl;
				std::ifstream toCrossValidateFile(toCrossValidateFilename + ".txt");
				if (!toCrossValidateFile.is_open()) {
					std::cerr << "File with data to cross-validate does not exist." << std::endl;
					return false;
				}
				return true;
			}

			// Values from the properties file do not override values already
			// set on the command line (po::store keeps the first occurrence).
			if (vars.count("properties-file")) {
				std::ifstream propertiesFile(propertiesFilename);
				if (!propertiesFile.is_open()) {
					std::cerr << "can not open config file: " << propertiesFilename << std::endl;
					vars.clear();
					std::cerr << "Reading properties unsuccessful." << std::endl;
					return false;
				} else {
					store(parse_config_file(propertiesFile, config_file_options), vars);
					notify(vars);
				}
			}

			// Check file with training samples
			if (vars.count("training-file")) {
				std::cout << "Training file: " << trainFilename << std::endl;
				std::ifstream trainFile(trainFilename);
				if (!trainFile.is_open()) {
					std::cerr << "File with training samples data does not exist." << std::endl;
					return false;
				}
			}

			// Check file with testing samples
			if (vars.count("testing-file")) {
				std::cout << "Testing file: " << testFilename << std::endl;
				std::ifstream testFile(testFilename);
				if (!testFile.is_open()) {
					std::cerr << "File with test samples data does not exist" << std::endl;
					return false;
				}
			}

			std::cout << "Results file: " << resultFilename << std::endl;

			if (vars.count("standardize")) {
				isStandardizationEnabled = true;
			}

			// Resolve the classifier name to its enum value.
			EnumParser<ClassifierType> parser;
			classifier = parser.ParseEnum(classifierName);

			// Number of k Nearest Neighbours
			if (k < 1) {
				std::cerr << "Number of k Nearest Neighbours must be larger than or equal to 1" << std::endl;
				return false;
			}

			// Number of training samples to read (0 means "all").  Message
			// fixed to match the check: zero is accepted.
			if (nrLoadTrainSamples < 0) {
				std::cerr << "Number of training samples to load must be larger than or equal to 0. (Default: 0 == all)" << std::endl;
				return false;
			}

			// Number of testing samples to read (0 means "all").
			if (nrLoadTestSamples < 0) {
				std::cerr << "Number of test samples to read must be larger than or equal to 0. (Default: 0 == all)" << std::endl;
				return false;
			}
			// Number of dimensions to read (0 means "all").
			if (nrLoadSampleDims < 0) {
				std::cerr << "Number of test sample dims to read must be larger than or equal to 0. (Default: 0 == all)" << std::endl;
				return false;
			}
		} catch (std::exception& e) {
			std::cerr << e.what() << std::endl;
			vars.clear();
			std::cerr << "Reading properties unsuccessful." << std::endl;
			return false;
		}

		std::cout << boost::format("Reading properties successful.") << std::endl;
		return true;
	}
Пример #3
0
// Trains the HMM on the sentences in inputFile with one Gibbs-sampling pass:
// every position first receives a random state, then a random number of
// positions per sentence is re-sampled from the conditional distribution,
// and the accumulated counts are finally applied via updateHMM().
void HMM::trainGibbsFromFile(char* inputFile)
{

	// Sufficient-statistic accumulators, zero-initialized below.
	double **emit_count;      // emit_count[state][obs]
	double **trans_count;     // trans_count[state_sequence][state]
	double *state_count;
	double *sequence_count;
	double *init_count;       // counts of sentence-initial states
	double *obs_count;

	emit_count = createMatrix(_numStates, _numObs);
	trans_count = createMatrix(_maxState, _numStates);
	state_count = new double[_numStates];
	zeroArray(state_count, _numStates);
	obs_count = new double[_numObs];
	zeroArray(obs_count, _numObs);
	sequence_count = new double[_maxState];
	zeroArray(sequence_count, _maxState);
	init_count = new double[_maxState];
	zeroArray(init_count, _maxState);

	ifstream trainFile(inputFile);
	string line;
	int count = 0;
	while (getline(trainFile, line))
	{ // for every sentence
		// Tokenize the line into integer observation ids.
		vector<int> words;
		stringstream ss(line);
		string buf;

		while (ss >> buf)
		{
			words.push_back(atoi(buf.c_str()));
		}
		int len = words.size();
		count++;
		// BUG FIX: skip blank lines — "rand() % len" below would otherwise
		// divide by zero.
		if (len == 0)
			continue;
		// Random initial state assignment (state 0 appears reserved, hence
		// the +1), accumulating counts as we go.
		int *stateArray;
		stateArray = new int[len];
		for (int i = 0; i < len; i++)
		{
			int origState = (rand() % (_numStates - 1)) + 1;
			int obs = words[i];
			int prev_sequence = 0;
			int r = 0;
			stateArray[i] = origState;
			// Encode the previous _order states as a base-_numStates index.
			while ((r < _order) && (i - 1 - r) >= 0)
			{
				prev_sequence += stateArray[(i - 1) - r] * int(pow(_numStates, r));
				r++;
			}
			obs_count[obs]++;
			state_count[origState]++;
			if (i == 0)
				init_count[origState]++;
			else
			{
				trans_count[prev_sequence][origState]++;
				sequence_count[prev_sequence]++;
			}
			emit_count[origState][obs]++;

		}
		// Re-sample a random number of random positions.
		int sampleN = rand() % len;
		for (int i = 0; i < sampleN; i++)
		{
			// Never the last position: k + 1 must exist.  When len == 1,
			// sampleN is 0 and this body never executes.
			int k = rand() % (len - 1);
			int obs = words[k];
			int prev_sequence = 0;
			int r = 0;
			//	cout << "Compute prev_seq" << endl;
			while ((r < _order) && (k - 1 - r) >= 0)
			{
				prev_sequence += stateArray[(k - 1) - r] * int(pow(_numStates, r));
				r++;
			}
			//	cout << "Done Compute prev_seq" << endl;
			int origState = stateArray[k];
			int nextState = stateArray[k + 1];
			// Sequence index of the window ending at position k (old state).
			int next_sequence = _numStates * (prev_sequence % int(pow(_numStates, _order - 1))) + origState;
			// Conditional distribution over the state at position k.
			double *dist = new double[_numStates];
			double totalp = 0;
			for (int state = 0; state < _numStates; state++)
			{
				int state_sequence = _numStates * (prev_sequence % int(pow(_numStates, _order - 1))) + state;
				if (prev_sequence == 0)
					dist[state] = _pObservation[state][obs] * initial_probability[state]
							* _pTransition[state_sequence][nextState];
				else
					dist[state] = _pObservation[state][obs] * _pTransition[prev_sequence][state]
							* _pTransition[state_sequence][nextState];
				totalp += dist[state];
			}
			renormalize(dist, _numStates);
			Distribution d(dist, _numStates);
			int sample = d.generate_sample();
			delete[] dist;

			// Move the counts from the old state to the sampled one.
			state_count[origState]--;
			if (k == 0)
			{
				init_count[origState]--;
				init_count[sample]++;
			}
			else
			{
				trans_count[prev_sequence][origState]--;
				trans_count[prev_sequence][sample]++;
			}
			trans_count[next_sequence][nextState]--;
			sequence_count[next_sequence]--;
			emit_count[origState][obs]--;
			stateArray[k] = sample;
			next_sequence = _numStates * (prev_sequence % int(pow(_numStates, _order - 1))) + sample;
			state_count[sample]++;
			trans_count[next_sequence][nextState]++;
			sequence_count[next_sequence]++;
			emit_count[sample][obs]++;

			//	cout << "Done Update params" << endl;

		}
		// BUG FIX: stateArray was allocated once per sentence but never freed.
		delete[] stateArray;
	}//end for every sentence
	trainFile.close();
	updateHMM(emit_count, trans_count, state_count, sequence_count, init_count, obs_count);

	freeMatrix(trans_count, _maxState, _numStates);
	freeMatrix(emit_count, _numStates, _numObs);
	delete[] state_count;
	delete[] obs_count;
	delete[] sequence_count;
	// BUG FIX: init_count was leaked (every other accumulator was released).
	delete[] init_count;

}
Пример #4
0
// Runs one distributed Baum-Welch (EM) accumulation pass over the sentence
// files in filesList_.  Each MPI rank processes a contiguous slice of the
// file list and accumulates expected counts locally; the counts are then
// summed onto the root with MPI_Reduce, broadcast back to every rank, and
// applied to the model via updateHMM().
// NOTE(review): with the block partition below, the trailing
// (count % size) files are assigned to no rank and are silently skipped —
// confirm this is intended.
void HMM::trainParallel(vector<string> filesList_)
{

	int count = filesList_.size();
	int rank = MPI::COMM_WORLD.Get_rank();
	int size = MPI::COMM_WORLD.Get_size();
	const int root = 0;
	// Static block partition: this rank handles files [start, end).
	int dist = count / size;
	int start = rank * dist;
	int end = rank * dist + dist;

	// Per-rank expected-count accumulators (zeroed below).
	double **emit_count;      // emit_count[state][obs]
	double **trans_count;     // trans_count[state_sequence][next_state]
	double *state_count;
	double *sequence_count;
	double *init_count;
	double *obs_count;
	emit_count = createMatrix(_numStates, _numObs);
	trans_count = createMatrix(_maxState, _numStates);
	state_count = new double[_numStates];
	zeroArray(state_count, _numStates);
	obs_count = new double[_numObs];
	zeroArray(obs_count, _numObs);
	sequence_count = new double[_maxState];
	zeroArray(sequence_count, _maxState);
	init_count = new double[_maxState];
	zeroArray(init_count, _maxState);

	// Global totals: receive buffers for MPI_Reduce, then re-broadcast.
	double **temit_count;
	double **ttrans_count;
	double *tstate_count;
	double *tsequence_count;
	double *tinit_count;
	double *tobs_count;

	temit_count = createMatrix(_numStates, _numObs);
	ttrans_count = createMatrix(_maxState, _numStates);
	tstate_count = new double[_numStates];
	// NOTE(review): tobs_count is the only t* buffer never passed to
	// zeroArray; it appears to be used purely as a reduce/broadcast target,
	// which would make this harmless — confirm.
	tobs_count = new double[_numObs];
	zeroArray(tstate_count, _numStates);
	tsequence_count = new double[_maxState];
	zeroArray(tsequence_count, _maxState);
	tinit_count = new double[_maxState];
	zeroArray(tinit_count, _maxState);

	for (int i = start; i < end; i++)
	{
		const char* inputFile = filesList_[i].c_str();
		//		cout << "opening file "<<files_list[i].c_str()<< " On Process " << rank<<endl;
		ifstream trainFile(inputFile);
		string line;
		while (getline(trainFile, line))
		{ // for every sentence

			// Read in training sequence
			vector<int> words;
			stringstream ss(line);
			string buf;
			while (ss >> buf)
			{
				words.push_back(atoi(buf.c_str()));
			}
			// NOTE(review): a blank line gives len == 0, and words[len - 1]
			// below would then read out of bounds — confirm the input files
			// contain no blank lines.
			int len = words.size();

			//COMPUTE FORWARD PROBABILITY
			double **forward;
			double * scaleArray = new double[len];
			forward = createMatrix(len, _maxState);
			computeForwardMatrixScaled(words, forward, scaleArray, len);
			//printMatrix(forward, len, _maxState);

			//COMPUTE_BACKWARD PROBABILITY
			double **backward;
			backward = createMatrix(len, _maxState);
			computeBackwardMatrixScaled(words, backward, scaleArray, len);
			//printMatrix(backward, len, _maxState);

			//BAUM WELCH COUNTS
			// NOTE: this loop index i shadows the outer file-index i.
			for (int i = 0; i < len - 1; i++)
			{
				int obs = words[i];
				int next_obs = words[i + 1];
				for (int state_sequence = 0; state_sequence < _maxState; state_sequence++)
				{
					// Last state of the encoded state-sequence window.
					int state = state_sequence % _numStates;
					// Expected occupancy of this window at position i.
					double gamma = (forward[i][state_sequence] * backward[i][state_sequence]);
					if (i == 0)
					{
						init_count[state_sequence] += gamma;
					}
					emit_count[state][obs] += gamma;
					obs_count[obs] += gamma;
					state_count[state] += gamma;
					sequence_count[state_sequence] += gamma;
					for (int next_state = 0; next_state < _numStates; next_state++)
					{
						// Window obtained by shifting in next_state.
						int next_sequence = _numStates * (state_sequence % int(pow(_numStates, _order - 1)))
								+ next_state;
						// Expected transition count (rescaled by the shared
						// scale factor at position i + 1).
						double eta = (forward[i][state_sequence] * _pTransition[state_sequence][next_state]
								* _pObservation[next_state][next_obs] * backward[i + 1][next_sequence]) / scaleArray[i
								+ 1];
						trans_count[state_sequence][next_state] += eta;
					}
				}
			}
			// Final position: emission/occupancy counts only (no outgoing
			// transition).
			for (int state_sequence = 0; state_sequence < _maxState; state_sequence++)
			{
				int obs = words[len - 1];
				int state = state_sequence % _numStates;
				double gamma = (forward[len - 1][state_sequence] * backward[len - 1][state_sequence]);
				emit_count[state][obs] += gamma;
				obs_count[obs] += gamma;
				state_count[state] += gamma;
				//	sequence_count[state_sequence] += gamma;
			}
			delete[] scaleArray;
			freeMatrix(forward, len, _maxState);
			freeMatrix(backward, len, _maxState);
		}//end for every sentence

		trainFile.close();
		//	cout << " Training File Close, Updating Parameters " << inputFile << endl;
	} // for every file


	//Collect parameters on root
	for (int state_sequence = 0; state_sequence < _maxState; state_sequence++)
	{
		MPI_Reduce(trans_count[state_sequence], ttrans_count[state_sequence], _numStates, MPI_DOUBLE, MPI_SUM, root,
				MPI_COMM_WORLD);
	}
	MPI_Reduce(sequence_count, tsequence_count, _maxState, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
	MPI_Reduce(init_count, tinit_count, _maxState, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
	for (int state = 0; state < _numStates; state++)
	{
		MPI_Reduce(emit_count[state], temit_count[state], _numObs, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
	}
	MPI_Reduce(state_count, tstate_count, _numStates, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
	MPI_Reduce(obs_count, tobs_count, _numObs, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

	//Send updated parameters too all children
	for (int state_sequence = 0; state_sequence < _maxState; state_sequence++)
	{
		MPI_Bcast(ttrans_count[state_sequence], _numStates, MPI_DOUBLE, root, MPI_COMM_WORLD);
	}
	MPI_Bcast(tsequence_count, _maxState, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(tinit_count, _maxState, MPI_DOUBLE, root, MPI_COMM_WORLD);
	for (int state = 0; state < _numStates; state++)
	{
		MPI_Bcast(temit_count[state], _numObs, MPI_DOUBLE, root, MPI_COMM_WORLD);
	}
	MPI_Bcast(tstate_count, _numStates, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(tobs_count, _numObs, MPI_DOUBLE, root, MPI_COMM_WORLD);

	//cout << "Update Step" << endl;
	// Every rank applies the identical global totals.
	updateHMM(temit_count, ttrans_count, tstate_count, tsequence_count, tinit_count, tobs_count);

	freeMatrix(trans_count, _maxState, _numStates);
	freeMatrix(emit_count, _numStates, _numObs);
	delete[] state_count;
	delete[] sequence_count;
	delete[] init_count;
	delete[] obs_count;

	freeMatrix(temit_count, _numStates, _numObs);
	freeMatrix(ttrans_count, _maxState, _numStates);
	delete[] tstate_count;
	delete[] tsequence_count;
	delete[] tinit_count;
	delete[] tobs_count;

}
Пример #5
0
// Runs one (single-process) Baum-Welch accumulation pass over the sentences
// in inputFile: computes scaled forward/backward matrices per sentence,
// accumulates expected counts, and applies them via updateHMM().
void HMM::trainFromFile(char* inputFile)
{

	// Sufficient-statistic accumulators, zero-initialized below.
	double **emit_count;      // emit_count[state][obs]
	double **trans_count;     // trans_count[state_sequence][next_state]
	double *state_count;
	double *sequence_count;
	double *init_count;       // expected sentence-initial window counts
	double *obs_count;
	emit_count = createMatrix(_numStates, _numObs);
	trans_count = createMatrix(_maxState, _numStates);
	state_count = new double[_numStates];
	zeroArray(state_count, _numStates);
	obs_count = new double[_numObs];
	zeroArray(obs_count, _numObs);
	sequence_count = new double[_maxState];
	zeroArray(sequence_count, _maxState);
	init_count = new double[_maxState];
	zeroArray(init_count, _maxState);

	ifstream trainFile(inputFile);
	string line;
	int count = 0;
	while (getline(trainFile, line))
	{ // for every sentence
		// Tokenize the line into integer observation ids.
		vector<int> words;
		stringstream ss(line);
		string buf;

		while (ss >> buf)
		{
			words.push_back(atoi(buf.c_str()));
		}
		int len = words.size();
		count++;
		// BUG FIX: skip blank lines — words[len - 1] below would otherwise
		// read out of bounds.
		if (len == 0)
			continue;
		//COMPUTE FORWARD PROBABILITY
		double **forward;
		double * scaleArray = new double[len];
		forward = createMatrix(len, _maxState);
		computeForwardMatrixScaled(words, forward, scaleArray, len);

		//COMPUTE_BACKWARD PROBABILITY
		double **backward;
		backward = createMatrix(len, _maxState);
		computeBackwardMatrixScaled(words, backward, scaleArray, len);

		//BAUM WELCH COUNTS
		for (int i = 0; i < len - 1; i++)
		{
			int obs = words[i];
			int next_obs = words[i + 1];
			for (int state_sequence = 0; state_sequence < _maxState; state_sequence++)
			{
				// Last state of the encoded state-sequence window.
				int state = state_sequence % _numStates;
				// Expected occupancy of this window at position i.
				double gamma = (forward[i][state_sequence] * backward[i][state_sequence]);
				if (i == 0)
				{
					// BUG FIX: the original wrote this gamma straight into
					// _pTransition[0][state_sequence], mutating the model
					// while it is still being read for eta below, and left
					// init_count permanently zero even though it is passed
					// to updateHMM().  trainParallel accumulates into
					// init_count at i == 0; do the same here.
					init_count[state_sequence] += gamma;
				}
				emit_count[state][obs] += gamma;
				obs_count[obs] += gamma;
				state_count[state] += gamma;
				sequence_count[state_sequence] += gamma;
				for (int next_state = 0; next_state < _numStates; next_state++)
				{
					// Window obtained by shifting in next_state.
					int next_sequence = _numStates * (state_sequence % int(pow(_numStates, _order - 1))) + next_state;
					// Expected transition count, rescaled by the shared
					// scale factor at position i + 1.
					double eta = (forward[i][state_sequence] * _pTransition[state_sequence][next_state]
							* _pObservation[next_state][next_obs] * backward[i + 1][next_sequence]) / scaleArray[i + 1];
					trans_count[state_sequence][next_state] += eta;
				}
			}
		}
		// Final position: emission/occupancy counts only (no outgoing
		// transition).
		for (int state_sequence = 0; state_sequence < _maxState; state_sequence++)
		{
			int obs = words[len - 1];
			int state = state_sequence % _numStates;
			double gamma = (forward[len - 1][state_sequence] * backward[len - 1][state_sequence]);
			emit_count[state][obs] += gamma;
			obs_count[obs] += gamma;
			state_count[state] += gamma;
		}
		delete[] scaleArray;
		freeMatrix(forward, len, _maxState);
		freeMatrix(backward, len, _maxState);

	}//end for every sentence
	trainFile.close();
	updateHMM(emit_count, trans_count, state_count, sequence_count, init_count, obs_count);

	freeMatrix(trans_count, _maxState, _numStates);
	freeMatrix(emit_count, _numStates, _numObs);
	delete[] state_count;
	delete[] obs_count;
	delete[] sequence_count;
	// BUG FIX: init_count was leaked (every other accumulator was released).
	delete[] init_count;

}
Пример #6
0
// Distributed Gibbs-sampling training pass: each MPI rank runs the random
// state assignment + re-sampling procedure over its slice of files_list,
// then the local counts are summed on the root with MPI_Reduce, broadcast
// back, and applied via updateHMM().
void HMM::trainGibbsParallel(vector<string> files_list)
{

	// NOTE: count is also incremented per sentence below; the loop bounds
	// only depend on start/end so this does not affect the partition.
	int count = files_list.size();
	int rank = MPI::COMM_WORLD.Get_rank();
	int size = MPI::COMM_WORLD.Get_size();
	const int root = 0;
	// Static block partition: this rank handles files [start, end).
	int dist = count / size;
	int start = rank * dist;
	int end = rank * dist + dist;

	// Per-rank accumulators (zeroed below).
	double **emit_count;      // emit_count[state][obs]
	double **trans_count;     // trans_count[state_sequence][state]
	double *state_count;
	double *sequence_count;
	double *init_count;
	double *obs_count;
	emit_count = createMatrix(_numStates, _numObs);
	trans_count = createMatrix(_maxState, _numStates);

	state_count = new double[_numStates];
	zeroArray(state_count, _numStates);

	obs_count = new double[_numObs];
	zeroArray(obs_count, _numObs);

	sequence_count = new double[_maxState];
	zeroArray(sequence_count, _maxState);

	init_count = new double[_maxState];
	zeroArray(init_count, _maxState);

	// Global totals: receive buffers for MPI_Reduce, then re-broadcast.
	double **temit_count;
	double **ttrans_count;
	double *tstate_count;
	double *tsequence_count;
	double *tinit_count;
	double *tobs_count;

	temit_count = createMatrix(_numStates, _numObs);
	ttrans_count = createMatrix(_maxState, _numStates);

	tstate_count = new double[_numStates];
	zeroArray(tstate_count, _numStates);

	tobs_count = new double[_numObs];
	zeroArray(tobs_count, _numObs);

	tsequence_count = new double[_maxState];
	zeroArray(tsequence_count, _maxState);

	tinit_count = new double[_maxState];
	zeroArray(tinit_count, _maxState);

	for (int i = start; i < end; i++)
	{
		const char* inputFile = files_list[i].c_str();
		ifstream trainFile(inputFile);
		string line;
		while (getline(trainFile, line))
		{ // for every sentence

			// Read in training sequence
			vector<int> words;
			stringstream ss(line);
			string buf;

			while (ss >> buf)
			{
				words.push_back(atoi(buf.c_str()));
			}
			int len = words.size();
			count++;
			// BUG FIX: skip blank lines — "rand() % len" below would
			// otherwise divide by zero.
			if (len == 0)
				continue;
			// Random initial state assignment (state 0 appears reserved,
			// hence the +1), accumulating counts as we go.
			int *stateArray;
			stateArray = new int[len];
			for (int i = 0; i < len; i++)
			{
				int origState = (rand() % (_numStates - 1)) + 1;
				int obs = words[i];
				int prev_sequence = 0;
				int r = 0;
				stateArray[i] = origState;

				if (i == 0)
					init_count[origState]++;
				else
				{
					// Encode the previous _order states as a base-_numStates
					// index.
					while ((r < _order) && (i - 1 - r) >= 0)
					{
						prev_sequence += stateArray[(i - 1) - r] * int(pow(_numStates, r));
						r++;
					}
					trans_count[prev_sequence][origState]++;
					sequence_count[prev_sequence]++;
				}
				obs_count[obs]++;
				state_count[origState]++;
				emit_count[origState][obs]++;

			}
			// Re-sample a random number of random positions.
			int sampleN = rand() % len;
			for (int i = 0; i < sampleN; i++)
			{
				// Never the last position: k + 1 must exist.  When len == 1,
				// sampleN is 0 and this body never executes.
				int k = rand() % (len - 1);
				int obs = words[k];
				int prev_sequence = 0;
				int r = 0;
				while ((r < _order) && (k - 1 - r) >= 0)
				{
					prev_sequence += stateArray[(k - 1) - r] * int(pow(_numStates, r));
					r++;
				}
				int origState = stateArray[k];
				int nextState = stateArray[k + 1];
				// Computed for the commented-out updates below; currently
				// unused otherwise.
				int next_sequence = _numStates * (prev_sequence % int(pow(_numStates, _order - 1))) + nextState;
				// Conditional distribution over the state at position k.
				double *dist = new double[_numStates];
				double totalp = 0;
				for (int state = 0; state < _numStates; state++)
				{
					int state_sequence = _numStates * (prev_sequence % int(pow(_numStates, _order - 1))) + state;
					if (prev_sequence == 0)
						dist[state] = _pObservation[state][obs] * initial_probability[state]
								* _pTransition[state_sequence][nextState];
					else
						dist[state] = _pObservation[state][obs] * _pTransition[prev_sequence][state]
								* _pTransition[state_sequence][nextState];
					totalp += dist[state];
				}
				renormalize(dist, _numStates);
				Distribution d(dist, _numStates);
				int sample = d.generate_sample();
				delete[] dist;

				// Move the counts from the old state to the sampled one.
				if (k == 0)
				{
					init_count[origState]--;
					init_count[sample]++;
				}
				else
				{
					trans_count[prev_sequence][origState]--;
					trans_count[prev_sequence][sample]++;
				}
				state_count[origState]--;
				//			trans_count[next_sequence][nextState]--;
				//			sequence_count[next_sequence]--;
				emit_count[origState][obs]--;
				stateArray[k] = sample;
				//			next_sequence = _numStates*(prev_sequence  % int(pow(_numStates, _order-1))) + sample;
				state_count[sample]++;
				//			trans_count[next_sequence][nextState]++;
				//			sequence_count[next_sequence]++;
				emit_count[sample][obs]++;
			}
			// BUG FIX: stateArray was allocated once per sentence but never
			// freed.
			delete[] stateArray;
		}//end for every sentence

		trainFile.close();

	} // for every file

	//Collect parameters on root
	for (int state_sequence = 0; state_sequence < _maxState; state_sequence++)
	{
		MPI_Reduce(trans_count[state_sequence], ttrans_count[state_sequence], _numStates, MPI_DOUBLE, MPI_SUM, root,
				MPI_COMM_WORLD);
	}
	MPI_Reduce(sequence_count, tsequence_count, _maxState, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
	MPI_Reduce(init_count, tinit_count, _maxState, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
	for (int state = 0; state < _numStates; state++)
	{
		MPI_Reduce(emit_count[state], temit_count[state], _numObs, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
	}
	MPI_Reduce(state_count, tstate_count, _numStates, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);
	MPI_Reduce(obs_count, tobs_count, _numObs, MPI_DOUBLE, MPI_SUM, root, MPI_COMM_WORLD);

	//Send updated parameters too all children
	for (int state_sequence = 0; state_sequence < _maxState; state_sequence++)
	{
		MPI_Bcast(ttrans_count[state_sequence], _numStates, MPI_DOUBLE, root, MPI_COMM_WORLD);
	}
	MPI_Bcast(tsequence_count, _maxState, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(tinit_count, _maxState, MPI_DOUBLE, root, MPI_COMM_WORLD);
	for (int state = 0; state < _numStates; state++)
	{
		MPI_Bcast(temit_count[state], _numObs, MPI_DOUBLE, root, MPI_COMM_WORLD);
	}
	MPI_Bcast(tstate_count, _numStates, MPI_DOUBLE, root, MPI_COMM_WORLD);
	MPI_Bcast(tobs_count, _numObs, MPI_DOUBLE, root, MPI_COMM_WORLD);

	// Every rank applies the identical global totals.
	updateHMM(temit_count, ttrans_count, tstate_count, tsequence_count, tinit_count, tobs_count);

	freeMatrix(trans_count, _maxState, _numStates);
	freeMatrix(emit_count, _numStates, _numObs);
	delete[] state_count;
	delete[] sequence_count;
	delete[] init_count;
	delete[] obs_count;

	freeMatrix(temit_count, _numStates, _numObs);
	freeMatrix(ttrans_count, _maxState, _numStates);
	delete[] tstate_count;
	delete[] tsequence_count;
	delete[] tinit_count;
	delete[] tobs_count;
}
Пример #7
0
int main(int argc, char *argv[])
{
    QCoreApplication a(argc, argv);
	QStringList args = a.arguments();
	if (args.size() < 3) {
		QStringList usage;
		usage << args.at(0)
			  << "[train data]"
			  << "[test data]";
		qFatal("Too few arguments. Usage:\n%s\n", usage.join(" ").toStdString().c_str());
	}

	QFile trainFile(args.at(1));
	if (!trainFile.open(QIODevice::ReadOnly)) {
		qFatal("Failed to open train file %s.\n", trainFile.fileName().toStdString().c_str());
	}
	QFile testFile(args.at(2));
	if (!testFile.open(QIODevice::ReadOnly)) {
		qFatal("Failed to open test file %s.\n", testFile.fileName().toStdString().c_str());
	}

    QElapsedTimer loadTimer;
    loadTimer.start();
    FeatureImporter trainFeatures;
    FeatureImporter testFeatures;
#pragma omp sections
    {
#pragma omp section
        {
            trainFeatures.open(&trainFile);
        }
#pragma omp section
        {
            testFeatures.open(&testFile);
        }
    }
    int loadMsecs = loadTimer.elapsed();
    qDebug() << "loading took" << loadMsecs << "msecs";

	trainFile.close();
	testFile.close();

	QVector<QString> hash;
    QVector<qint8> trainClasses;
	for (int i = 0; i < trainFeatures.labels().size(); i++) {
        qint8 index = hash.indexOf(trainFeatures.labels().at(i));
		if (index == -1) {
			QString dbg("Appending label \"%1\" to hash at position %2. It has now value \"%3\"");
			hash.append(trainFeatures.labels().at(i));
			index = hash.size() - 1;
			//qDebug() << dbg.arg(trainFeatures.labels().at(i), QString::number(index), hash.at(index));
		}
		trainClasses.append(index);
	}

    ClassifierInterface *ci = new CpuClassifier();
    QVector<QVector<int> > classes;
    qDebug() << "starting classification";
    QList<int> k;

    bool ok = true;
    int i = 50;
    if (args.size() >= 4) {
        i = qMax(0, args.at(3).toInt(&ok));
    } else {
        ok = false;
    }
    if (!ok) {
        qDebug() << "no k given, assuming k = 50";
        i = 50;
    }
    qDebug() << "initial k:" << i;
    for (; i >= 1; i--) {
        k.append(i);
    }
    QElapsedTimer timer;
    timer.start();
	classes = ci->classify(trainFeatures.features(), testFeatures.features(),
						   trainClasses.constData(), NULL,
						   testFeatures.featuresPerItem(),
						   trainFeatures.itemCount(), testFeatures.itemCount(),
                           k);
    delete ci;
	int msecs = timer.elapsed();
	qDebug() << "calculations took" << msecs << "msecs";
    for (int w = 0; w < classes.size(); w++) {
        int correct = 0;
        QVector<QVector<qreal> > confusionMatrix;
        confusionMatrix.resize(hash.size());
        for (int i = 0; i < confusionMatrix.size(); i++) {
            confusionMatrix[i].resize(hash.size());
        }
        for (int i = 0; i < classes.at(w).size(); i++) {
            /*qDebug() << i;
            qDebug() << classes.at(i);
            qDebug() << hash.at(classes.at(i));
            qDebug() << testFeatures.labels().at(i);*/
            confusionMatrix[hash.indexOf(testFeatures.labels().at(i))][classes.at(w).at(i)]++;
            /*if (hash.at(classes.at(w).at(i)) == QString("5")) {
                qDebug() << "is 5, should be " << testFeatures.labels().at(i);
            }*/
            if (hash.at(classes.at(w).at(i)) == testFeatures.labels().at(i)) {
                correct++;
            }
        }
        QVector<QPair<QString, int> > sorter;
        for (int i = 0; i < hash.size(); i++) {
            sorter << qMakePair(hash.at(i), i);
        }
        qSort(sorter);
        QStringList l;
        for (int i = 0; i < hash.size(); i++) {
            l << sorter.at(i).first;
        }
        QVector<QVector<qreal> > tempConfusionMatrix;
        tempConfusionMatrix.resize(hash.size());
        for (int j = 0; j < confusionMatrix.size(); j++) {
            for (int i = 0; i < sorter.size(); i++) {
                tempConfusionMatrix[j] << confusionMatrix.at(j).at(sorter.at(i).second);
            }
        }
        confusionMatrix = tempConfusionMatrix;
        for (int j = 0; j < confusionMatrix.size(); j++) {
            tempConfusionMatrix[j] = confusionMatrix.at(sorter.at(j).second);
        }
        confusionMatrix = tempConfusionMatrix;
#ifdef PERCENTAGE_CONFUSION
        for (int i = 0; i < confusionMatrix.size(); i++) {
            qreal sum = 0;
            for (int j = 0; j < confusionMatrix.at(i).size(); j++) {
                sum += confusionMatrix.at(j).at(i);
            }
            for (int j = 0; j < confusionMatrix.at(i).size(); j++) {
                confusionMatrix[j][i] = confusionMatrix.at(j).at(i) / sum * 100.0;
            }
        }
#endif
        QTextStream stream(stdout);
        stream << "k: " << k.at(w) << endl;
        stream << "\t&\t" << l.join("\t&\t") << "\\\\" << endl;
        for (int i = 0; i < confusionMatrix.size(); i++) {
            QStringList list;
            list << sorter.at(i).first;
            for (int j = 0; j < confusionMatrix.size(); j++) {
                list << QString::number(confusionMatrix[i][j], 'g', 4);
            }
            const QString joined(list.join("\t&\t"));
            stream << joined << "\\\\" << endl;
        }
        stream << "correct: " << ((float)correct / (float)classes.at(w).size()) * 100 << "%" << endl;
    }
    msecs = timer.elapsed();
    qDebug() << "everything took" << msecs << "msecs";
	return 0;
}