void mmEM::readFileXY(param myParam, string filename, vector_2str *wordX, vector_2str *wordY) { cout << "Reading file: " << filename << endl; ifstream INPUTFILE; INPUTFILE.open(filename.c_str()); if (! INPUTFILE) { cerr << "error: unable to open file " << filename << endl; exit(-1); } while (! INPUTFILE.eof() ) { string line; vector<string> lineList; // read each line and split column by space // getline(INPUTFILE, line); // ignore empty line if (line == "") { continue; } // split by tab to get source and target // if (myParam.inFormat == "l2p") { lineList = splitBySpace(line); vector<string> t0,t1; Tokenize(lineList[0], t0, ""); // lhuang //Tokenize(lineList[1], t1, ""); wordX->push_back(t0); wordY->push_back(splitBySpace(lineList[1])); } else if (myParam.inFormat == "news") { Tokenize(line, lineList, "\t"); // lhuang: no space on source (letters) vector<string> t0; Tokenize(lineList[0], t0, ""); wordX->push_back(t0); wordY->push_back(splitBySpace(lineList[1])); } else { cerr << "ERROR: can't find input format type, plz. check --inFormat" << endl << endl; } if (lineList.size() != 2) { cerr << "Warning : missing either x or y word here, so skipped:" << endl << line << endl; } } // close file // INPUTFILE.close(); }
void readSentFile(const string &file, vector<vector<intern_string> > &sentences) { cerr << "Reading sentences from: " << file << "..."; ifstream TRAININ; TRAININ.open(file.c_str()); if (! TRAININ) { cerr << "Error: can't read from file " << file << endl; exit(-1); } string line; while (getline(TRAININ, line)) { vector<string> words; splitBySpace(line, words); vector<intern_string> intern_words(words.begin(), words.end()); sentences.push_back(intern_words); if (sentences.size() % 1000000 == 0) cerr << sentences.size() << "..."; } cerr << endl; TRAININ.close(); }
// Handle a CTCP message received from `nick` on `server`.
//
// Only messages whose first three space-separated words are "DCC", "CHAT",
// "chat" are acted on:
//   - if a connection keyed by "<server>/<nick>" is already active, the user
//     is told so via a notice;
//   - otherwise words 3 and 4 of the message are handed to dccConnect
//     (presumably the peer address and port -- confirm against dccConnect).
//
// The message comes from a remote user, so its word count is checked before
// any element is indexed; messages that are too short are silently ignored.
void m_dccchat::onUserCTCP(std::string server, std::string nick, std::string message) {
    std::vector<std::string> messageParts = splitBySpace(message);

    // Need at least the three keywords before they can be compared.
    if (messageParts.size() < 3)
        return;
    if (messageParts[0] != "DCC" || messageParts[1] != "CHAT" || messageParts[2] != "chat")
        return;

    if (activeConnections.find(server + "/" + nick) != activeConnections.end()) {
        sendNotice(server, nick, "You already have an active DCC chat session!");
        return;
    }

    // A connection request must also carry the two connection arguments.
    if (messageParts.size() < 5)
        return;
    dccConnect(server, nick, messageParts[3], messageParts[4]);
}
// Fill `limitSet` from the initial mapping file named by myParam.initFile.
//
// Does nothing when myParam.initFile is empty. Otherwise every non-empty line
// is split with splitBySpace; the first two fields (x and y) are joined with
// myParam.sepChar and inserted into limitSet. Any further field on the line
// is not needed here and is ignored.
//
// The key is built with myParam.sepChar (not a hard-coded "|") so that it
// matches the lookup done in mmEM::initialization, which searches for
// x + myParam.sepChar + y.
//
// Lines with fewer than two fields are reported on stderr and skipped.
// Exits the process with status -1 when the file cannot be opened.
void mmEM::readInitFile(param myParam)
{
    // if there is no initial mapping file, there is nothing to read //
    if (myParam.initFile == "")
    {
        return;
    }

    cout << "Reading the initial file: " << myParam.initFile << endl;

    ifstream INITFILE;
    INITFILE.open(myParam.initFile.c_str());
    if (! INITFILE)
    {
        cerr << "error: unable to open file " << myParam.initFile << endl;
        exit(-1);
    }

    // Loop on getline's result so a failed stream cannot spin forever.
    string line;
    while (getline(INITFILE, line))
    {
        if (line == "")
        {
            continue;
        }

        // should read it as the model format //
        vector<string> lineList = splitBySpace(line);
        if (lineList.size() < 2)
        {
            cerr << "Warning : initial file line has fewer than two fields, so skipped:" << endl << line << endl;
            continue;
        }

        limitSet.insert(lineList[0] + myParam.sepChar + lineList[1]);
    }

    INITFILE.close();
}
// Append weights read from TRAININ to `weights`, one per line.
// Reading stops at end of stream or at the first empty line.
// A line that does not split into exactly one item is a fatal error
// (message on stderr, exit status -1). Each item is converted with
// boost::lexical_cast<float>.
void readWeightsFile(ifstream &TRAININ, vector<float> &weights)
{
    for (string line; getline(TRAININ, line) && line != ""; )
    {
        vector<string> items;
        splitBySpace(line, items);

        if (items.size() == 1)
        {
            weights.push_back(boost::lexical_cast<float>(items[0]));
            continue;
        }

        cerr << "Error: weights file should have only one weight per line" << endl;
        exit(-1);
    }
}
// Append vocabulary entries read from TRAININ to `word_list`, one per line.
// Reading stops at end of stream or at the first empty line.
// A line that does not split into exactly one word is a fatal error
// (message on stderr, exit status -1).
void readWordsFile(ifstream &TRAININ, vector<string> &word_list)
{
    for (string line; getline(TRAININ, line) && line != ""; )
    {
        vector<string> words;
        splitBySpace(line, words);

        if (words.size() == 1)
        {
            word_list.push_back(words[0]);
            continue;
        }

        cerr << "Error: vocabulary file must have only one word per line" << endl;
        exit(-1);
    }
}
// Read a data file of unknown size into a flat vector<int>. // If this takes too much memory, we should create a vector of minibatches. void readSentFile(const string &filename, vector<vector <int> > &data, int minibatch_size, data_size_t &num_tokens) { cerr << "Reading input sentences from file " << filename << ": "; ifstream DATAIN(filename.c_str()); if (!DATAIN) { cerr << "Error: can't read data from file " << filename<< endl; exit(-1); } vector<int> data_vector; string line; long long int n_lines = 0; while (getline(DATAIN, line)) { vector<string> ngram; splitBySpace(line, ngram); /* if (ngram_size == 0) ngram_size = ngram.size(); if (ngram.size() != ngram_size) { cerr << "Error: expected " << ngram_size << " fields in instance, found " << ngram.size() << endl; exit(-1); } */ vector<int> int_ngram; for (int i=0;i<ngram.size();i++) int_ngram.push_back(boost::lexical_cast<int>(ngram[i])); data.push_back(int_ngram); num_tokens += int_ngram.size(); n_lines++; if (minibatch_size && n_lines % (minibatch_size * 10000) == 0) cerr << n_lines/minibatch_size << "..."; } cerr << "done." << endl; DATAIN.close(); }
// Parse the "key value" configuration section of a model file and apply it.
//
// Lines are read from `config_file` until end of stream or the first empty
// line. Each line is split with splitBySpace; the first field is the key and
// the second its value. Recognized keys:
//   ngram_size, vocab_size (sets both input and output vocabulary sizes),
//   input_vocab_size, output_vocab_size, input_embedding_dimension,
//   num_hidden, output_embedding_dimension, activation_function, version.
// Unrecognized keys and lines with fewer than two fields produce a warning.
// A `version` other than 1 is fatal (exit status 1).
//
// ngram_size, input_embedding_dimension, num_hidden and
// output_embedding_dimension have no fallback value, so if any of them is
// absent the function reports it and exits with status 1 instead of passing
// an uninitialized value to resize(). The activation function falls back to
// the model's current one when not specified.
void model::readConfig(ifstream &config_file)
{
    string line;
    vector<string> fields;

    int ngram_size = 0;
    int input_embedding_dimension = 0;
    int num_hidden = 0;
    int output_embedding_dimension = 0;
    bool have_ngram_size = false;
    bool have_input_embedding_dimension = false;
    bool have_num_hidden = false;
    bool have_output_embedding_dimension = false;

    activation_function_type activation_function = this->activation_function;

    while (getline(config_file, line) && line != "")
    {
        splitBySpace(line, fields);

        // Guard against whitespace-only lines and keys without a value.
        if (fields.size() < 2)
        {
            cerr << "warning: malformed line in config: " << line << endl;
            continue;
        }

        const string &key = fields[0];
        const string &value = fields[1];

        if (key == "ngram_size")
        {
            ngram_size = lexical_cast<int>(value);
            have_ngram_size = true;
        }
        else if (key == "vocab_size")
            input_vocab_size = output_vocab_size = lexical_cast<int>(value);
        else if (key == "input_vocab_size")
            input_vocab_size = lexical_cast<int>(value);
        else if (key == "output_vocab_size")
            output_vocab_size = lexical_cast<int>(value);
        else if (key == "input_embedding_dimension")
        {
            input_embedding_dimension = lexical_cast<int>(value);
            have_input_embedding_dimension = true;
        }
        else if (key == "num_hidden")
        {
            num_hidden = lexical_cast<int>(value);
            have_num_hidden = true;
        }
        else if (key == "output_embedding_dimension")
        {
            output_embedding_dimension = lexical_cast<int>(value);
            have_output_embedding_dimension = true;
        }
        else if (key == "activation_function")
            activation_function = string_to_activation_function(value);
        else if (key == "version")
        {
            int version = lexical_cast<int>(value);
            if (version != 1)
            {
                cerr << "error: file format mismatch (expected 1, found " << version << ")" << endl;
                exit(1);
            }
        }
        else
            cerr << "warning: unrecognized field in config: " << key << endl;
    }

    if (!have_ngram_size || !have_input_embedding_dimension ||
        !have_num_hidden || !have_output_embedding_dimension)
    {
        cerr << "error: config is missing required field(s):";
        if (!have_ngram_size) cerr << " ngram_size";
        if (!have_input_embedding_dimension) cerr << " input_embedding_dimension";
        if (!have_num_hidden) cerr << " num_hidden";
        if (!have_output_embedding_dimension) cerr << " output_embedding_dimension";
        cerr << endl;
        exit(1);
    }

    resize(ngram_size,
           input_vocab_size,
           output_vocab_size,
           input_embedding_dimension,
           num_hidden,
           output_embedding_dimension);
    set_activation_function(activation_function);
}
// Replace the current aligner model with the one stored in myParam.alignerIn.
//
// `probs` and `counts` are cleared first. Every non-empty line of the file
// must split (splitBySpace) into exactly three fields f0 f1 f2, which are
// stored as probs[f0][f1] = f2.
//
// Fatal errors (message on stderr, exit status -1): the file cannot be
// opened, or a non-empty line does not have exactly three fields.
void mmEM::readAlignerFromFile(param myParam)
{
    cout << "Reading aligner model from file : " << myParam.alignerIn << endl;

    // clear model parameters
    probs.clear();
    counts.clear();

    ifstream ALIGNERIN;
    ALIGNERIN.open(myParam.alignerIn.c_str());
    if (! ALIGNERIN)
    {
        cerr << "Error : can't read aligner model from file " << myParam.alignerIn << endl;
        exit(-1);
    }

    // Loop on getline's result: testing eof() alone never terminates when a
    // read fails for a reason other than end-of-file.
    string line;
    while (getline(ALIGNERIN, line))
    {
        if (line == "")
        {
            continue;
        }

        vector<string> lineList = splitBySpace(line);
        if (lineList.size() != 3)
        {
            cerr << "Error : aligner model is in the wrong format " << endl << line << endl;
            exit(-1);
        }

        // problem with long double when reading from string //
        // atof parses a double, so the value is read at double precision and
        // then widened to long double.
        probs[lineList[0]][lineList[1]] = static_cast<long double>(atof(lineList[2].c_str()));
    }

    ALIGNERIN.close();
}
// Add phonemes listed in `filename` to this set.
//
// Nothing is read unless `limitCandidate` is true. Each line is split with
// splitBySpace; lines with at least two fields result in a call to
// addPhoneme(field[1], field[0], limitCandidate). Other lines are ignored.
//
// Exits with status -1 if the file cannot be opened. Without this check a
// missing file left the stream in a failed, non-EOF state and the read loop
// never terminated.
void allPhonemeSet::addFromFile(string filename, bool limitCandidate)
{
    if (!limitCandidate)
    {
        return;
    }

    ifstream FILEIN;
    FILEIN.open(filename.c_str());
    if (! FILEIN)
    {
        cerr << "error: unable to open file " << filename << endl;
        exit(-1);
    }

    string line;
    while (getline(FILEIN, line))
    {
        vector<string> lineList = splitBySpace(line);
        if (lineList.size() > 1)
        {
            addPhoneme(lineList[1], lineList[0], limitCandidate);
        }
    }

    FILEIN.close();
}
// Initialize the `counts` table from the paired training data.
//
// Steps:
//  1. If myParam.limitPair is set, readInitFile(myParam) is called first
//     (it fills limitSet).
//  2. stringX and stringY must contain the same number of entries; otherwise
//     the sizes are reported and the process exits with status -1.
//  3. For every pair i and every end position (xl, yl), each x-substring of
//     up to myParam.maxX tokens ending at xl and each y-substring of up to
//     myParam.maxY tokens ending at yl (built with join) gets
//     counts[ssX][ssY] = 1. With myParam.delX / myParam.delY set, the
//     substrings are additionally paired with myParam.nullChar. With
//     myParam.limitPair set, pairs whose key ssX + sepChar + ssY is not in
//     limitSet are left out.
//  4. If myParam.initFile is non-empty, its lines "x y [prob]" are read
//     (prob defaults to 1), sorted with initTableSortedFn, and used to boost
//     counts[x][y]. Processing stops at the first entry with prob != 1 once
//     the accumulated probability has reached myParam.initProbCut.
//     Lines with fewer than two fields are reported and skipped.
//
// Exits with status -1 when the initial file cannot be opened.
void mmEM::initialization(param myParam, vector_2str stringX, vector_2str stringY)
{
    if (myParam.limitPair)
    {
        readInitFile(myParam);
    }

    if (stringX.size() != stringY.size())
    {
        cerr << "error: data are not in pairs of x and y " << endl;
        cerr << "# of x instances : " << stringX.size() << endl;
        cerr << "# of y instances : " << stringY.size() << endl;
        exit(-1);
    }

    // initialization with uniform distribution all possible alignments //

    // for each x and y pair //
    for (size_t i = 0; i < stringX.size(); i++)
    {
        // Lengths as int: the loops below rely on signed arithmetic (xl-j >= 0).
        const int lenX = static_cast<int>(stringX[i].size());
        const int lenY = static_cast<int>(stringY[i].size());

        // over lengths of x and y
        for (int xl = 0; xl < lenX; xl++)
        {
            for (int yl = 0; yl < lenY; yl++)
            {
                if (myParam.delX)
                {
                    for (int j = 0; (j < myParam.maxX) && (xl-j >= 0); j++)
                    {
                        string ssX = join(stringX[i], xl-j, j+1);
                        counts[ssX][myParam.nullChar] = 1;
                    }
                }

                if (myParam.delY)
                {
                    for (int k = 0; (k < myParam.maxY) && (yl-k >= 0); k++)
                    {
                        string ssY = join(stringY[i], yl-k, k+1);
                        counts[myParam.nullChar][ssY] = 1;
                    }
                }

                for (int j = 0; (j < myParam.maxX) && (xl-j >= 0); j++)
                {
                    // ssX does not depend on k, so build it once per j.
                    const string ssX = join(stringX[i], xl-j, j+1);

                    for (int k = 0; (k < myParam.maxY) && (yl-k >= 0); k++)
                    {
                        const string ssY = join(stringY[i], yl-k, k+1);

                        // if it defines a limit set //
                        if (myParam.limitPair)
                        {
                            if (limitSet.find(ssX + myParam.sepChar + ssY) == limitSet.end())
                            {
                                continue;
                            }
                        }
                        counts[ssX][ssY] = 1;
                    }
                }
            }
        }
    }

    // if there is an initial mapping file //
    if (myParam.initFile != "")
    {
        cout << "Reading the initial file: " << myParam.initFile << endl;

        ifstream INITFILE;
        INITFILE.open(myParam.initFile.c_str());
        if (! INITFILE)
        {
            cerr << "error: unable to open file " << myParam.initFile << endl;
            exit(-1);
        }

        vector_initTable initCount;

        // Loop on getline's result so a failed stream cannot spin forever.
        string line;
        while (getline(INITFILE, line))
        {
            if (line == "")
            {
                continue;
            }

            // should read it as the model format //
            vector<string> lineList = splitBySpace(line);
            if (lineList.size() < 2)
            {
                cerr << "Warning : initial file line has fewer than two fields, so skipped:" << endl << line << endl;
                continue;
            }

            initTable initTmp;
            initTmp.xstring = lineList[0];
            initTmp.ystring = lineList[1];
            if (lineList.size() > 2)
            {
                initTmp.prob = static_cast<long double>(atof(lineList[2].c_str()));
            }
            else
            {
                initTmp.prob = 1;
            }
            initCount.push_back(initTmp);
        }
        INITFILE.close();

        // sort initCount //
        cout << "Sorting the initial count table" << endl;
        sort(initCount.begin(), initCount.end(), initTableSortedFn);

        long double total_add_prob = 0;
        for (vector_initTable::iterator pos = initCount.begin(); pos != initCount.end(); ++pos)
        {
            if ((total_add_prob < myParam.initProbCut) || (pos->prob == 1))
            {
                // Compute the boost BEFORE indexing counts[x][y]: indexing may
                // insert y and change counts[x].size(). In a single "+="
                // expression the order of those two steps was unspecified
                // before C++17; this fixes it to the C++17 order
                // (right-hand side first).
                const long double boost_amount =
                    ((counts[pos->xstring].size() * 10) + stringX.size()) * pos->prob;
                counts[pos->xstring][pos->ystring] += boost_amount;
            }
            else
            {
                break;
            }

            if (pos->prob != 1)
            {
                total_add_prob += pos->prob;
            }
        }
    }
}