Beispiel #1
0
/**
 * Read paired x/y training words from a text file.
 *
 * Each non-empty line is split into fields according to myParam.inFormat:
 *   - "l2p"  : fields are produced by splitBySpace(line)
 *   - "news" : fields are produced by Tokenize(line, ..., "\t")
 * Field 0 is tokenized with an empty delimiter and appended to *wordX;
 * field 1 is split by space and appended to *wordY.
 *
 * Lines yielding fewer than two fields (including every line when inFormat
 * is unrecognised) are reported and skipped, so *wordX and *wordY always
 * grow in lock-step.
 *
 * @param myParam   run-time options; only inFormat is read here
 * @param filename  path of the file to read
 * @param wordX     output: one token vector per accepted line (source side)
 * @param wordY     output: one token vector per accepted line (target side)
 *
 * Terminates the process with exit(-1) if the file cannot be opened.
 */
void mmEM::readFileXY(param myParam, string filename, vector_2str *wordX, vector_2str *wordY)
{
	cout << "Reading file: " << filename << endl;
	ifstream INPUTFILE;
	INPUTFILE.open(filename.c_str());
	if (! INPUTFILE)
	{
		cerr << "error: unable to open file " << filename << endl;
		exit(-1);
	}

	string line;
	// Loop on the result of getline rather than on eof(): a stream that
	// fails without reaching EOF would otherwise spin forever.
	while (getline(INPUTFILE, line))
	{
		// ignore empty line
		if (line == "")
		{
			continue;
		}

		// split the line into its source and target fields //
		vector<string> lineList;
		if (myParam.inFormat == "l2p")
		{
			lineList = splitBySpace(line);
		}
		else if (myParam.inFormat == "news")
		{
			Tokenize(line, lineList, "\t");
		}
		else
		{
			cerr << "ERROR: can't find input format type, plz. check --inFormat" << endl << endl;
		}

		if (lineList.size() != 2)
		{
			cerr << "Warning : missing either x or y word here, so skipped:" << endl << line << endl;
		}

		// Both fields are indexed below, so a short line must really be
		// skipped; previously it was read out of bounds.
		// NOTE(review): lines with MORE than two fields are still stored
		// (as before) although the warning above says "skipped" - confirm
		// which behaviour is intended.
		if (lineList.size() < 2)
		{
			continue;
		}

		// lhuang: no space on source (letters)
		vector<string> t0;
		Tokenize(lineList[0], t0, "");
		wordX->push_back(t0);
		wordY->push_back(splitBySpace(lineList[1]));
	}
	// close file //
	INPUTFILE.close();
}
Beispiel #2
0
  void readSentFile(const string &file, vector<vector<intern_string> > &sentences)
  {
    cerr << "Reading sentences from: " << file << "...";

    ifstream TRAININ;
    TRAININ.open(file.c_str());
    if (! TRAININ)
      {
	cerr << "Error: can't read from file " << file << endl;
	exit(-1);
      }

    string line;
    while (getline(TRAININ, line))
      {
	vector<string> words;
	splitBySpace(line, words);
	vector<intern_string> intern_words(words.begin(), words.end());
	sentences.push_back(intern_words);
	if (sentences.size() % 1000000 == 0)
	  cerr << sentences.size() << "...";
      }
    cerr << endl;

    TRAININ.close();
  }
Beispiel #3
0
/**
 * Handle an incoming CTCP message and open a DCC chat session on request.
 *
 * Only messages whose first three space-separated tokens are
 * "DCC", "CHAT", "chat" are acted on; everything else is ignored.
 * If a session keyed by "<server>/<nick>" already exists the user is told so
 * via a notice; otherwise tokens 3 and 4 are forwarded to dccConnect
 * (presumably the peer address and port - confirm against dccConnect).
 *
 * The message comes from a remote user, so the token count is validated
 * before any token is indexed; malformed requests are silently dropped.
 *
 * @param server   identifier of the server the message arrived on
 * @param nick     nickname of the sender
 * @param message  CTCP payload
 */
void m_dccchat::onUserCTCP(std::string server, std::string nick, std::string message) {
	std::vector<std::string> messageParts = splitBySpace(message);

	// The three keyword tokens must exist before they can be compared.
	if (messageParts.size() < 3)
		return;
	if (messageParts[0] != "DCC" || messageParts[1] != "CHAT" || messageParts[2] != "chat")
		return;

	if (activeConnections.find(server + "/" + nick) != activeConnections.end()) {
		sendNotice(server, nick, "You already have an active DCC chat session!");
		return;
	}

	// A new session needs both connection arguments; without them the
	// request is malformed and there is nothing to connect to.
	if (messageParts.size() < 5)
		return;
	dccConnect(server, nick, messageParts[3], messageParts[4]);
}
Beispiel #4
0
/**
 * Populate limitSet from the initial mapping file, if one is configured.
 *
 * When myParam.initFile is non-empty, every non-empty line of that file is
 * split by space and the pair (column 0, column 1) is inserted into limitSet
 * as "<x><sepChar><y>". Further columns (e.g. a probability) are not needed
 * here and are ignored. Lines with fewer than two columns are reported and
 * skipped.
 *
 * @param myParam  run-time options; initFile and sepChar are read
 *
 * Terminates the process with exit(-1) if the file cannot be opened.
 */
void mmEM::readInitFile(param myParam)
{
	// if there is an initial mapping file //
	// read the initFile, fill up the limit set //
	if (myParam.initFile == "")
	{
		return;
	}

	cout << "Reading the initial file: " << myParam.initFile << endl;
	ifstream INITFILE;
	INITFILE.open(myParam.initFile.c_str());
	if (! INITFILE)
	{
		cerr << "error: unable to open file " << myParam.initFile << endl;
		exit(-1);
	}

	string line;
	// Loop on getline itself so a failed stream cannot loop forever.
	while (getline(INITFILE, line))
	{
		if (line == "")
		{
			continue;
		}

		// should read it as the model format //
		vector<string> lineList = splitBySpace(line);

		// Both columns are indexed below; a short line used to be read
		// out of bounds.
		if (lineList.size() < 2)
		{
			cerr << "Warning : malformed line in initial file, so skipped:" << endl << line << endl;
			continue;
		}

		// Build the key with the configured separator so it matches the
		// lookup in mmEM::initialization (ssX + myParam.sepChar + ssY).
		// The previous literal "|" only matched when sepChar was "|".
		limitSet.insert(lineList[0] + myParam.sepChar + lineList[1]);
	}
	INITFILE.close();
}
Beispiel #5
0
// Read one float weight per line from TRAININ and append each to `weights`.
// Reading stops at end of input or at the first empty line.
// A line that does not split into exactly one item aborts with exit(-1).
void readWeightsFile(ifstream &TRAININ, vector<float> &weights) {
  string text;
  for (;;)
  {
    if (!getline(TRAININ, text))
      break;
    if (text == "")
      break;

    vector<string> columns;
    splitBySpace(text, columns);
    if (columns.size() != 1)
    {
      cerr << "Error: weights file should have only one weight per line" << endl;
      exit(-1);
    }

    const float weight = boost::lexical_cast<float>(columns[0]);
    weights.push_back(weight);
  }
}
Beispiel #6
0
  // Read a vocabulary listing (one word per line) from TRAININ into word_list.
  // Reading stops at end of input or at the first empty line.
  // A line that does not split into exactly one word aborts with exit(-1).
  void readWordsFile(ifstream &TRAININ, vector<string> &word_list)
  {
    string text;
    for (;;)
    {
      if (!getline(TRAININ, text))
        break;
      if (text == "")
        break;

      vector<string> tokens;
      splitBySpace(text, tokens);
      if (tokens.size() != 1)
      {
        cerr << "Error: vocabulary file must have only one word per line" << endl;
        exit(-1);
      }

      const string &entry = tokens[0];
      word_list.push_back(entry);
    }
  }
Beispiel #7
0
// Read a data file of unknown size into a flat vector<int>.
// If this takes too much memory, we should create a vector of minibatches.
void readSentFile(const string &filename, 
				vector<vector <int> > &data, 
				int minibatch_size,
				data_size_t &num_tokens)
{
  cerr << "Reading input sentences from file " << filename << ": ";

  ifstream DATAIN(filename.c_str());
  if (!DATAIN)
  {
    cerr << "Error: can't read data from file " << filename<< endl;
    exit(-1);
  }

  vector<int> data_vector;

  string line;
  long long int n_lines = 0;
  while (getline(DATAIN, line))
  {
    vector<string> ngram;
    splitBySpace(line, ngram);
	
	/*
    if (ngram_size == 0)
        ngram_size = ngram.size();

    if (ngram.size() != ngram_size)
    {
        cerr << "Error: expected " << ngram_size << " fields in instance, found " << ngram.size() << endl;
	exit(-1);
    }
	*/
	vector<int> int_ngram;
    for (int i=0;i<ngram.size();i++)
        int_ngram.push_back(boost::lexical_cast<int>(ngram[i]));

	data.push_back(int_ngram);
	num_tokens += int_ngram.size();
	
    n_lines++;
    if (minibatch_size && n_lines % (minibatch_size * 10000) == 0)
      cerr << n_lines/minibatch_size << "...";
  }
  cerr << "done." << endl;
  DATAIN.close();
}
Beispiel #8
0
/**
 * Parse the "key value" configuration section of a model file and size the
 * model accordingly.
 *
 * Lines are consumed until end of input or the first empty line. Recognised
 * keys: ngram_size, vocab_size (sets both input and output vocabulary size),
 * input_vocab_size, output_vocab_size, input_embedding_dimension, num_hidden,
 * output_embedding_dimension, activation_function and version (must be 1,
 * otherwise the process exits with status 1). Unknown keys produce a warning.
 *
 * After parsing, resize() and set_activation_function() are called with the
 * collected values. A dimension whose key is absent is passed as 0 (it used
 * to be an uninitialised value); the activation function falls back to the
 * model's current one.
 *
 * @param config_file  stream positioned at the start of the config section
 */
void model::readConfig(ifstream &config_file)
{
    string line;
    // Zero-initialise so a config missing a key gives deterministic
    // behaviour instead of feeding indeterminate ints to resize().
    int ngram_size = 0;
    int input_embedding_dimension = 0;
    int num_hidden = 0;
    int output_embedding_dimension = 0;
    activation_function_type activation_function = this->activation_function;

    while (getline(config_file, line) && line != "")
    {
        vector<string> fields;
        splitBySpace(line, fields);

        // Every recognised entry needs a key and a value; indexing fields[1]
        // on a shorter line was an out-of-bounds read.
        if (fields.size() < 2)
        {
            cerr << "warning: ignoring malformed config line: " << line << endl;
            continue;
        }

        const string &key = fields[0];
        const string &value = fields[1];

        if (key == "ngram_size")
            ngram_size = lexical_cast<int>(value);
        else if (key == "vocab_size")
            input_vocab_size = output_vocab_size = lexical_cast<int>(value);
        else if (key == "input_vocab_size")
            input_vocab_size = lexical_cast<int>(value);
        else if (key == "output_vocab_size")
            output_vocab_size = lexical_cast<int>(value);
        else if (key == "input_embedding_dimension")
            input_embedding_dimension = lexical_cast<int>(value);
        else if (key == "num_hidden")
            num_hidden = lexical_cast<int>(value);
        else if (key == "output_embedding_dimension")
            output_embedding_dimension = lexical_cast<int>(value);
        else if (key == "activation_function")
            activation_function = string_to_activation_function(value);
        else if (key == "version")
        {
            int version = lexical_cast<int>(value);
            if (version != 1)
            {
                cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl;
                exit(1);
            }
        }
        else
            cerr << "warning: unrecognized field in config: " << key << endl;
    }

    resize(ngram_size,
        input_vocab_size,
        output_vocab_size,
        input_embedding_dimension,
        num_hidden,
        output_embedding_dimension);
    set_activation_function(activation_function);
}
Beispiel #9
0
/**
 * Load a previously written aligner model into `probs`.
 *
 * Clears `probs` and `counts`, then reads myParam.alignerIn line by line.
 * Each non-empty line must split by space into exactly three fields
 * "<x> <y> <probability>"; the probability is stored as probs[x][y].
 *
 * @param myParam  run-time options; only alignerIn is read here
 *
 * Terminates the process with exit(-1) if the file cannot be opened or a
 * line does not have exactly three fields.
 */
void mmEM::readAlignerFromFile(param myParam)
{
	cout << "Reading aligner model from file : " << myParam.alignerIn << endl;

	//clear model parameters 
	probs.clear();
	counts.clear();

	ifstream ALIGNERIN;
	ALIGNERIN.open(myParam.alignerIn.c_str());
	if (! ALIGNERIN)
	{
		cerr << "Error : can't read aligner model from file " << myParam.alignerIn << endl;
		exit(-1);
	}

	string line;
	// Loop on getline itself: testing eof() never terminates when the stream
	// goes bad without reaching EOF (e.g. the path names a directory).
	while (getline(ALIGNERIN, line))
	{
		if (line == "")
		{
			continue;
		}

		vector<string> lineList = splitBySpace(line);

		if (lineList.size() != 3)
		{
			cerr << "Error : aligner model is in the wrong format " << endl << line << endl;
			exit(-1);
		}

		// problem with long double when reading from string, so the value is
		// parsed as double via atof and then widened //
		probs[lineList[0]][lineList[1]] = static_cast<long double>(atof(lineList[2].c_str()));
	}
	ALIGNERIN.close();
}
/**
 * Register phoneme candidates listed in a file.
 *
 * Does nothing unless limitCandidate is true. Otherwise every line of
 * `filename` is split by space, and for each line with at least two fields
 * addPhoneme(field 1, field 0, limitCandidate) is called. Lines with fewer
 * fields (including blank lines) are ignored.
 *
 * @param filename        path of the file to read
 * @param limitCandidate  when false the file is not read at all
 *
 * Terminates the process with exit(-1) if the file cannot be opened; the
 * previous eof()-driven loop never terminated in that situation.
 */
void allPhonemeSet::addFromFile(string filename, bool limitCandidate)
{
	if (! limitCandidate)
	{
		return;
	}

	ifstream FILEIN;
	FILEIN.open(filename.c_str());
	if (! FILEIN)
	{
		cerr << "error: unable to open file " << filename << endl;
		exit(-1);
	}

	string line;
	// Loop on getline itself so a failed read ends the loop instead of
	// re-processing a stale line.
	while (getline(FILEIN, line))
	{
		vector<string> lineList = splitBySpace(line);

		if (lineList.size() > 1)
		{
			addPhoneme(lineList[1],lineList[0],limitCandidate);
		}
	}
	FILEIN.close();
}
Beispiel #11
0
/**
 * Seed the `counts` table before EM training.
 *
 * Steps:
 *   1. If myParam.limitPair is set, readInitFile() is called first (it fills
 *      limitSet, which is consulted in step 3).
 *   2. stringX and stringY must have the same number of entries, otherwise
 *      the process exits with -1.
 *   3. For every pair and every end position (xl, yl), every sub-sequence of
 *      up to maxX tokens ending at xl and up to maxY tokens ending at yl gets
 *      counts[ssX][ssY] = 1. With delX / delY, mappings to / from
 *      myParam.nullChar are seeded as well. With limitPair, only pairs whose
 *      key "<ssX><sepChar><ssY>" is in limitSet are seeded.
 *   4. If myParam.initFile is non-empty, its entries ("x y [prob]", prob
 *      defaulting to 1) are sorted with initTableSortedFn and added on top of
 *      the seeded counts until the accumulated probability of entries with
 *      prob != 1 reaches myParam.initProbCut.
 *
 * @param myParam  run-time options (limitPair, delX, delY, maxX, maxY,
 *                 nullChar, sepChar, initFile, initProbCut are read)
 * @param stringX  source-side token sequences
 * @param stringY  target-side token sequences, parallel to stringX
 */
void mmEM::initialization(param myParam, vector_2str stringX, vector_2str stringY)
{

	if (myParam.limitPair)
	{
		readInitFile(myParam);
	}

	if (stringX.size() != stringY.size())
	{
		cerr << "error: data are not in pairs of x and y " << endl;
		cerr << "# of x instances : " << stringX.size() << endl;
		cerr << "# of y instances : " << stringY.size() << endl;
		exit(-1);
	}

	// initialization with uniform distribution all possible alignments //

	// for each x and y pair //
	for (int i=0; i < stringX.size(); i++)
	{
		// over lengths of x and y
		for (int xl = 0; xl < stringX[i].size(); xl++)
		{
			for (int yl = 0; yl < stringY[i].size(); yl++)
			{
				if (myParam.delX)
				{
					// x sub-sequences ending at xl may align to null
					for (int j=0; (j < myParam.maxX) && (xl-j >= 0); j++)
					{
						string ssX = join(stringX[i], xl-j , j+1);
						counts[ssX][myParam.nullChar] = 1;
					}
				}

				if (myParam.delY)
				{
					// y sub-sequences ending at yl may align to null
					for (int k=0; (k < myParam.maxY) && (yl-k >=0); k++)
					{
						string ssY = join(stringY[i], yl-k, k+1);
						counts[myParam.nullChar][ssY] = 1;
					}
				}

				for (int j = 0; (j < myParam.maxX) && (xl-j >= 0); j++)
				{
					for (int k=0; (k < myParam.maxY) && (yl-k >=0); k++)
					{
						string ssX = join(stringX[i], xl-j, j+1);
						string ssY = join(stringY[i], yl-k, k+1);
						
						// if it defines a limit set //
						if (myParam.limitPair)
						{
							if (limitSet.find(ssX + myParam.sepChar + ssY) == limitSet.end())
							{
								continue;
							}
						}
						counts[ssX][ssY] = 1;
					}
				}
			}
		}
	}

	// if there is an initial mapping file, boost the counts it lists //
	if (myParam.initFile != "")
	{
		cout << "Reading the initial file: " << myParam.initFile << endl;
		ifstream INITFILE;
		INITFILE.open(myParam.initFile.c_str());
		if (! INITFILE)
		{
			cerr << "error: unable to open file " << myParam.initFile << endl;
			exit(-1);
		}

		vector_initTable initCount;
		string line;
		// Loop on getline itself so a failed stream cannot loop forever.
		while (getline(INITFILE, line))
		{
			if (line == "")
			{
				continue;
			}

			// should read it as the model format //
			vector<string> lineList = splitBySpace(line);

			// Both columns are indexed below; a short line used to be read
			// out of bounds.
			if (lineList.size() < 2)
			{
				cerr << "Warning : malformed line in initial file, so skipped:" << endl << line << endl;
				continue;
			}

			initTable initTmp;
			initTmp.xstring = lineList[0];
			initTmp.ystring = lineList[1];
			
			if (lineList.size() > 2)
			{
				initTmp.prob = (long double)atof(lineList[2].c_str());
			}
			else
			{
				// no probability column: treat the pair as certain
				initTmp.prob = 1;
			}
			initCount.push_back(initTmp);
		}
		INITFILE.close();

		// sort initCount //
		cout << "Sorting the initial count table" << endl;
		sort(initCount.begin(), initCount.end(), initTableSortedFn);
		
		long double total_add_prob = 0;
		for (vector_initTable::iterator pos = initCount.begin(); pos != initCount.end(); pos++)
		{
			if ((total_add_prob < myParam.initProbCut) || (pos->prob ==	1))
			{
				// Compute the increment BEFORE touching counts[x][y]:
				// operator[] may insert y and so change counts[x].size().
				// Before C++17 the operand order of += was unspecified;
				// this pins the C++17 order (right-hand side first).
				long double boost = ((counts[pos->xstring].size() * 10) + stringX.size()) * pos->prob;
				counts[pos->xstring][pos->ystring] += boost;
			}
			else
			{
				break;
			}

			// entries with prob exactly 1 do not use up the probability budget
			if (pos->prob != 1)
			{
				total_add_prob += pos->prob;
			}
		}
	}

}