/**
 * Construct a language model by reading an ARPA-style n-gram file (the
 * text format the error message below expects SRILM to have produced).
 *
 * Layout understood by this loader:
 *   - "\data\" starts the header; "ngram N=COUNT" lines are stored in total[N].
 *   - "\N-grams:" (N = 1..9) starts the section for n-grams of that order.
 *   - Section lines are tab-separated: log10 probability, the n-gram text and
 *     an optional log10 backoff weight. Both values are converted to natural
 *     logarithms before being stored in ngrams / backoff.
 *   - "\end\" is accepted and ignored.
 *
 * The literal n-gram "<unk>" is stored under UNKPATTERN. Any other n-gram
 * that the encoder maps to UNKPATTERN is dropped silently.
 *
 * The member `order` is left at the value of the last section header read.
 *
 * @param filename      path of the language model file
 * @param encoder       class encoder used to turn n-gram text into a Pattern
 * @param classdecoder  stored in this->classdecoder; not used during loading
 * @param debug         when true, every loaded entry is echoed to stderr
 *
 * Terminates the process with exit status 3 when the file cannot be opened
 * or when the model contains no "<unk>" entry.
 */
LanguageModel::LanguageModel(const std::string & filename, ClassEncoder & encoder, ClassDecoder * classdecoder, bool debug) {
    this->DEBUG = debug;
    this->classdecoder = classdecoder;
    order = 0;
    bool hasunk = false;
    const double log10_to_ln = log(10); //multiplying by log(10) does log10 to log_e conversion

    ifstream f;
    f.open(filename.c_str(), ios::in);
    if ((!f) || (!f.good())) {
        cerr << "File does not exist: " << filename << endl;
        exit(3);
    }

    string line;
    //Loop on the result of getline rather than on eof(): this avoids processing
    //a phantom line at the end and cannot spin forever on a failed stream.
    while (getline(f, line)) {
        //tolerate CRLF line endings, otherwise no section marker would ever match
        if ((!line.empty()) && (line[line.size() - 1] == '\r')) line.erase(line.size() - 1);
        if (line.empty()) continue;

        if (line == "\\data\\") {
            order = 0;
        } else if (line == "\\end\\") {
            //end-of-model marker: nothing to load, and `order` must keep its value
        } else if ((line.size() == 9) && (line[0] == '\\') && (line[1] >= '1') && (line[1] <= '9') && (line.compare(2, 7, "-grams:") == 0)) {
            //section header "\N-grams:" for a single digit N
            order = line[1] - '0';
        } else if (order == 0) {
            //header section: only "ngram N=COUNT" lines carry information
            if (line.compare(0, 5, "ngram") == 0) {
                const string::size_type eq = line.find('=', 6);
                if ((line.size() > 6) && (eq != string::npos)) {
                    const int n = atoi(line.substr(6, eq - 6).c_str());
                    const int v = atoi(line.substr(eq + 1).c_str());
                    total[n] = v;
                } else {
                    //too short or no '=': unchecked substr() calls would throw here
                    cerr << "WARNING: Ignoring line: " << line << endl;
                }
            }
        } else if (order > 0) {
            //split on tabs: field 0 = logprob, field 1 = n-gram, field 2 = backoff (optional)
            string logprob_s;
            string ngramcontent;
            string backofflogprob_s;
            int fields = 0;
            string::size_type begin = 0;
            for (string::size_type i = 0; i <= line.length(); i++) {
                if ((i == line.length()) || (line[i] == '\t') || (line[i] == '\n')) {
                    if (fields == 0) {
                        logprob_s = line.substr(begin, i - begin);
                    } else if (fields == 1) {
                        ngramcontent = line.substr(begin, i - begin);
                    } else if (fields == 2) {
                        backofflogprob_s = line.substr(begin, i - begin);
                    }
                    begin = i + 1;
                    fields++;
                }
            }

            if ((logprob_s.empty()) || (ngramcontent.empty())) {
                cerr << "WARNING: Ignoring line: " << line << endl;
            } else if (ngramcontent == "<unk>") {
                ngrams[UNKPATTERN] = atof(logprob_s.c_str()) * log10_to_ln;
                hasunk = true;
                if (DEBUG) {
                    cerr << " Adding UNKNOWN to LM: " << (int) UNKPATTERN.n() << "\t" << ngramcontent << "\t" << ngrams[UNKPATTERN] << endl;
                }
            } else {
                Pattern ngram = encoder.buildpattern(ngramcontent);
                if (ngram != UNKPATTERN) {
                    ngrams[ngram] = atof(logprob_s.c_str()) * log10_to_ln;
                    if (!backofflogprob_s.empty()) {
                        backoff[ngram] = atof(backofflogprob_s.c_str()) * log10_to_ln;
                        if (DEBUG) cerr << " Adding to LM: " << (int) ngram.n() << "\t" << ngramcontent << "\t" << ngrams[ngram] << "\t" << backoff[ngram] << endl;
                    } else {
                        if (DEBUG) cerr << " Adding to LM: " << (int) ngram.n() << "\t" << ngramcontent << "\t" << ngrams[ngram] << endl;
                    }
                }
            }
        } else {
            cerr << "WARNING: Don't know what to do with line: " << line << endl;
        }
    }
    f.close();

    if (!hasunk) {
        cerr << "ERROR: Language Model has no value <unk>, make sure to generate SRILM model with -unk parameter" << endl;
        exit(3);
    }
}
void loadmosesphrasetable(PatternAlignmentModel<double> & model, const std::string & filename, ClassEncoder & sourceencoder, ClassEncoder & targetencoder, PatternSetModel * constrainsourcemodel = NULL, PatternSetModel * constraintargetmodel = NULL, int max_sourcen =0, const double pts=0, const double pst=0, const double joinedthreshold=0, const double divergencefrombestthreshold=0.0, const std::string delimiter = "|||", const int score_column=3, const int pstfield = 0, const int ptsfield=2, const int maxscores = 10) { unsigned int added = 0; unsigned int skipped = 0; unsigned int constrained = 0; unsigned int count = 0; PatternSetModel firstwords; if (constrainsourcemodel != NULL) { cerr << "(Inferring extra contraints from source model, for faster discarding of patterns)" << endl; for (PatternSetModel::iterator iter = constrainsourcemodel->begin(); iter != constrainsourcemodel->end(); iter++) { const Pattern pattern = *iter; const Pattern firstword = pattern[0]; firstwords.insert(firstword); } cerr << "(added " << firstwords.size() << " unigrams)" << endl; } //load from moses-style phrasetable file istream * f = NULL; if (filename.substr(filename.size()-3) == ".gz") { cerr << "(Reading from gzip)" << endl; f = new igzstream(filename.c_str(), ios::in | ios::binary); } else { f = new ifstream(filename.c_str(), ios::in | ios::binary); } if ((f == NULL) || (!f->good())) { cerr << "File does not exist: " << filename << endl; exit(2); } vector<BufferItem> buffer; string firstword; bool skipsamesource = false; string prevsource; string skipfirstword; string source = ""; string target = ""; string scores_s; bool abort = false; int mode = 0; int begin = 0; bool updated = false; string line; vector<double> scores; while (!f->eof()) { line.clear(); getline(*f, line); count++; if (count % 100000 == 0) { cerr << endl; cerr << "Loading and encoding phrase-table: @" << count << " total added: " << added << ", skipped because of threshold: " << skipped << ", skipped because of 
constraints: " << constrained; } if (count % 1000 == 0) { if (updated) { cerr << ":"; } else { cerr << "."; } updated = false; } mode = 0; abort = false; begin = 0; source.clear(); target.clear(); scores_s.clear(); const int linesize = line.size(); if (linesize == 0) continue; for (unsigned int i = 0; i < linesize; i++) { if (line.substr(i,5) == " ||| ") { if (mode == 0) { source = line.substr(begin, i - begin); int j = 0; firstword = source; for (auto c : source) { if (c == ' ') { firstword = source.substr(0,j); break; } j++; } if (firstword == skipfirstword) { abort = true; break; } } else if (mode == 1) { target = line.substr(begin, i - begin); } else if (mode == 2) { scores_s = line.substr(begin, i - begin); } begin = i+5; mode++; } } if (mode == 2) { scores_s = line.substr(begin); mode++; } if ((abort) || (firstword == skipfirstword)) { constrained++; continue; } else if ((skipsamesource) && (source == prevsource)) { constrained++; continue; } else { skipsamesource = false; skipfirstword = ""; } if (mode < 3) { cerr << endl << "WARNING: Error in input format, line " << count << " (length=" << linesize << "): " << endl; cerr << line << endl; cerr << "SKIPPING..." 
<< endl; } scores.clear(); scores_s = scores_s + " "; begin = 0; //cerr << "DEBUG: scores_s=" << scores_s << endl; for (unsigned int i = 0; i < scores_s.size(); i++) { if ((scores_s[i] == ' ') && (i > begin)) { double score = atof(scores_s.substr(begin, i - begin).c_str()); //cerr << scores_s.substr(begin, i - begin) << " -> " << score << endl; scores.push_back(score); begin = i + 1; } } if (((!buffer.empty()) && (source != prevsource))) { if (buffer.size() >= 100) { cerr << "!"; } if (divergencefrombestthreshold > 0) { double bestscore = 0; for (auto bufferitem : buffer) { if (bufferitem.scores[ptsfield] > bestscore) bestscore = bufferitem.scores[ptsfield]; } double p = bestscore * divergencefrombestthreshold; for (auto bufferitem : buffer) { if (bufferitem.scores[ptsfield] >= p) { model.add( bufferitem.sourcepattern, bufferitem.targetpattern, bufferitem.scores, false ); added++; updated = true; } else { skipped++; } } } else { for (auto bufferitem : buffer) { model.add( bufferitem.sourcepattern, bufferitem.targetpattern, bufferitem.scores, false ); added++; updated = true; } } buffer.clear(); } //check score threshold if ( ((pst > 0) && (scores[pstfield] < pst)) || ((pts > 0) && (scores[ptsfield] < pts)) || ((joinedthreshold > 0) && (scores[ptsfield] * scores[pstfield] < joinedthreshold)) ) { skipped++; } else { //add to phrasetable try{ Pattern sourcepattern = sourceencoder.buildpattern(source); if ((constrainsourcemodel != NULL) && (!constrainsourcemodel->has(sourcepattern))) { const int _n = sourcepattern.n(); if (_n == 1) { skipfirstword = firstword; } else if (!firstwords.has(sourcepattern[0])) { skipfirstword = firstword; } constrained++; skipsamesource = true; continue; } if ((max_sourcen > 0) && (sourcepattern.n() > max_sourcen)) { skipped++; skipsamesource = true; continue; } Pattern targetpattern = targetencoder.buildpattern(target); if ((constraintargetmodel != NULL) && (!constraintargetmodel->has(targetpattern))) { constrained++; continue; } if 
(scores.size() > maxscores) { cerr << endl << "*** WARNING: Unexpectedly large number of scores in line " << count << ", something wrong? ***" << endl; } buffer.push_back( BufferItem(sourcepattern, targetpattern, scores) ); } catch (const UnknownTokenError &e) { cerr << endl << "*** WARNING: UnknownTokenError in encoding of source or target fragment on line " << count << " skipping: " << source << " ||| " << target <<endl; } } } //don't forget last one in buffer: if (!buffer.empty()) { if (divergencefrombestthreshold > 0) { double bestscore = 0; for (auto bufferitem : buffer) if (bufferitem.scores[ptsfield] > bestscore) bestscore = bufferitem.scores[ptsfield]; double p = bestscore * divergencefrombestthreshold; for (auto bufferitem : buffer) { if (bufferitem.scores[ptsfield] >= p) { model.add( bufferitem.sourcepattern, bufferitem.targetpattern, bufferitem.scores, false ); added++; } else { skipped++; } } } else { for (auto bufferitem : buffer) { model.add( bufferitem.sourcepattern, bufferitem.targetpattern, bufferitem.scores,false ); added++; } } buffer.clear(); } cerr << "Read " << count << " lines" << endl; cerr << "Added: " << added << " -- skipped due to threshold: " << skipped << " -- skipped by constraint: " << constrained << endl; cerr << "Source patterns: " << model.size() << endl; }