void Eval::doBP() { XenOption* opt = XenOption::getInstance(); double lower = 999999999999; int lBound = 0; int hBound = 0; for(EvalMap::iterator it = ptrDist->begin() ; it != ptrDist->end() ; ++it) if (it->second < lower) { lower = it->second; EvalMap::iterator itH = it; if (it != ptrDist->begin()) --itH; EvalMap::iterator itL = it; if (it != ptrDist->end()) ++itL; lBound = itL->first; hBound = itH->first; if (lBound > it->first) { lBound = 0; } if (hBound < it->first) { hBound = opt->getMaxEvalPC(); } } opt->setStep(opt->getStep() / 2); std::cout << "Requiring eval from " + XenCommon::toString(hBound) + " to " + XenCommon::toString(lBound) + " with step " + XenCommon::toString(opt->getStep()) << std::endl; doEval(hBound, lBound); if (opt->getStep() > 1) { doBP(); } }
/**
 * Default constructor.
 *
 * Copies the LM order, the memory setting and the temporary location from the
 * global XenOption instance, clears the text and LM file members and resets
 * the pc member to 0.
 */
XenLMken::XenLMken() {
    XenOption* options = XenOption::getInstance();

    // No files are attached yet.
    textFile = NULL;
    lmFile = NULL;
    pc = 0;

    // Settings taken from the command-line options.
    order = static_cast<unsigned int>(options->getOrder());
    mempc = options->getMemPc();
    temp = options->getTemp();
}
void Similarity::buildIDVector() { std::cout << "Building in-domain vector." << std::endl; XenOption* opt = XenOption::getInstance(); std::map<std::string, float> meanTFIDF; for (unsigned int i = 0; i < ptrWords->size(); i++) meanTFIDF[ptrWords->operator[](i)] = (float)(ptrIdTfIdf->operator[](ptrWords->operator[](i)) + ptrOodTfIdf->operator[](ptrWords->operator[](i))) / (float)2; std::multimap<float, std::string, std::greater<float> > meanTFIDFByScore = XenCommon::flip_map(meanTFIDF); int count = 0; float idFreqThres = (float)ptrID->getSize() / 300; // Threshold 0,33% of words is the same float oodFreqThres = (float)ptrOOD->getSize() / 300; // Threshold 0,33% of words is the same for (std::multimap<float, std::string, std::greater<float> >::iterator it = meanTFIDFByScore.begin(); count < opt->getVecSize(); ++it) { count++; int idTf = ptrIdTf->operator[](it->second); int oodTf = ptrOodTf->operator[](it->second); if (idTf > idFreqThres || oodTf > oodFreqThres) { count--; } else { ptrIdVecTfIdf->operator[](count - 1) = it->first; ptrVecWords->operator[](count - 1) = it->second; } } /** @todo DEBUG, needs more testing */ std::string outName = ptrID->getXenFile()->getPrefix() + ".vec"; std::ofstream out(outName.c_str(), std::ios::out | std::ios::trunc); for (int i = 0; i < opt->getVecSize(); i++) { out << ptrVecWords->operator[](i) << "\t\t" << ptrIdVecTfIdf->operator[](i) << std::endl; } out.close(); std::cout << "Done building in-domain vector." << std::endl; }
void XenIO::writeNewPT(boost::shared_ptr<PhraseTable> ptrPT, boost::shared_ptr<Score> ptrScore) { XenOption* opt = XenOption::getInstance(); Score sc2; if (opt->getLocal()) { std::vector<SourcePhrase> srcPh = ptrPT->getSrcPhrases(); for (unsigned int i = 0; i < srcPh.size(); i++) { SourcePhrase sP = srcPh[i]; for (unsigned int j = 0; j < sP.getScoresXE()->getSize(); j++) { sc2.addScore(sP.getScoresXE()->getScore(j)); } } } try { std::cout << "Writing new phrase-table to " + opt->getOutName() << std::endl; std::string oF = opt->getOutName(); boost::iostreams::filtering_ostream out; out.push(boost::iostreams::gzip_compressor()); out.push(boost::iostreams::file_sink(oF.c_str(), std::ios_base::out | std::ios_base::binary)); if (!out.good()) throw XenCommon::XenCEption("Can't write to " + opt->getOutName() + ".gz"); for (unsigned int i = 0; i < ptrScore->getSize(); i++) { out << ptrPT->getSource(i) << " ||| " << ptrPT->getTarget(i) << " ||| " << ptrPT->getScores(i) << " " << XenCommon::toString(ptrScore->getScore(i)); if (opt->getLocal()) out << " " << XenCommon::toString(sc2.getScore(i)); out << " ||| " << ptrPT->getAlignment(i) << " ||| " << ptrPT->getCounts(i) << std::endl; if (out.bad()) throw XenCommon::XenCEption("Something went wrong in output stream..."); } out.flush(); out.reset(); } catch (XenCommon::XenCEption &e) { throw; } }
void Eval::doEval(int high, int low) { XenOption* opt = XenOption::getInstance(); StaticData* sD = StaticData::getInstance(); if (opt->getSVocab()->getFileName().compare("") == 0) { sD->getVocabs()->getPtrSourceVoc()->initialize(sD->getSourceCorps()->getPtrInCorp()); } else { sD->getVocabs()->getPtrSourceVoc()->initialize(opt->getSVocab()); } pool threadPool(opt->getThreads()); int pc = low; if (pc == 0) { pc = opt->getStep(); } std::vector<std::string> parts; while (pc <= high) { EvalMap::iterator found = ptrDist->find(pc); if (found == ptrDist->end()) { std::string partName = sD->getXenResult()->getXenFile()->getDirName() + "/" + sD->getXenResult()->getXenFile()->getPrefix() + "-" + XenCommon::toString(pc) + "pc.gz"; XenIO::writeXRpart(sD->getXenResult(), pc, partName); boost::shared_ptr<Corpus> c = boost::make_shared<Corpus>(); c->initialize(partName, "xx"); threadPool.schedule( boost::bind(taskEval, pc, c, sD->getVocabs()->getPtrSourceVoc(), sD->getDevCorp(), ptrDist)); parts.push_back(partName); } pc += opt->getStep(); } threadPool.wait(); for (int i = 0; i < parts.size(); i++) XenIO::delFile(parts[i]); std::cout << "Evaluation done." << std::endl; }
int SimplePPL::launch() { XenOption* opt = XenOption::getInstance(); StaticData* sD = StaticData::getInstance(); sD->getSourceCorps()->getPtrInCorp()->initialize(opt->getInSData(), opt->getSLang()); sD->getSourceCorps()->getPtrOutCorp()->initialize(opt->getOutSData(), opt->getSLang()); if (opt->getSVocab()->getFileName().compare("") == 0) { if (opt->getFullVocab()) sD->getVocabs()->getPtrSourceVoc()->initialize(sD->getSourceCorps()->getPtrInCorp(), sD->getSourceCorps()->getPtrOutCorp()); else sD->getVocabs()->getPtrSourceVoc()->initialize(sD->getSourceCorps()->getPtrInCorp()); } else sD->getVocabs()->getPtrSourceVoc()->initialize(opt->getSVocab()); if (opt->getInSLM()->getFileName().compare("") == 0) { sD->getSourceLMs()->getPtrInLM()->initialize(sD->getSourceCorps()->getPtrInCorp(), sD->getVocabs()->getPtrSourceVoc()); sD->getSourceLMs()->getPtrInLM()->createLM(); sD->getSourceLMs()->getPtrInLM()->writeLM(); } else { sD->getSourceLMs()->getPtrInLM()->initialize(opt->getInSLM(), sD->getVocabs()->getPtrSourceVoc()); sD->getSourceLMs()->getPtrInLM()->loadLM(); } if (!boost::filesystem::exists(sD->getSourceLMs()->getPtrInLM()->getFileName())) { std::cout << "Error: LM file " + sD->getSourceLMs()->getPtrInLM()->getFileName() + " does not exists!" 
<< std::endl; return 1; } sD->getSourcePPLs()->getPtrInPPL()->initialize(sD->getSourceCorps()->getPtrOutCorp(), sD->getSourceLMs()->getPtrInLM()); sD->getSourcePPLs()->getPtrInPPL()->calcPPLCorpus(); if (opt->getWFile()->getFileName().compare("") != 0) sD->getWeightsFile()->initialize(opt->getWFile()); for (unsigned int i = 0; i < sD->getSourcePPLs()->getPtrInPPL()->getSize(); i++) { double res = sD->getSourcePPLs()->getPtrInPPL()->getPPL(i); if (opt->getWFile()->getFileName().compare("") != 0) res = res * sD->getWeightsFile()->getWeight(i); sD->getScHold()->getPtrScores()->addScore(res); } sD->getScHold()->getPtrScores()->calibrate(); if (opt->getInv()) { sD->getScHold()->getPtrScores()->inverse(); } if (opt->getTLang().compare("") == 0) { std::cout << "NB Scores: " + XenCommon::toString(sD->getScHold()->getPtrScores()->getSize()) + " NB Source corp (unclean): " + XenCommon::toString(sD->getSourceCorps()->getPtrOutCorp()->getSize()) << std::endl; XenIO::cleanCorpusMono(sD->getSourceCorps()->getPtrOutCorp(), sD->getScHold()->getPtrScores()); std::cout << "NB Scores: " + XenCommon::toString(sD->getScHold()->getPtrScores()->getSize()) + " NB Source corp (clean): " + XenCommon::toString(sD->getSourceCorps()->getPtrOutCorp()->getSize()) << std::endl; XenIO::writeMonoOutput(sD->getSourceCorps()->getPtrOutCorp(), sD->getScHold()->getPtrScores()); } else { boost::shared_ptr<Corpus> ptrOTCorp = boost::make_shared<Corpus>(); ptrOTCorp->initialize(opt->getOutTData(), opt->getTLang()); std::cout << "NB Scores: " + XenCommon::toString(sD->getScHold()->getPtrScores()->getSize()) + " NB Source corp (unclean): " + XenCommon::toString(sD->getSourceCorps()->getPtrOutCorp()->getSize()) + " NB Target corp (unclean): " + XenCommon::toString(ptrOTCorp->getSize()) << std::endl; XenIO::cleanCorpusBi(sD->getSourceCorps()->getPtrOutCorp(), ptrOTCorp, sD->getScHold()->getPtrScores()); std::cout << "NB Scores: " + XenCommon::toString(sD->getScHold()->getPtrScores()->getSize()) + " NB Source 
corp (clean): " + XenCommon::toString(sD->getSourceCorps()->getPtrOutCorp()->getSize()) + " NB Target corp (clean): " + XenCommon::toString(ptrOTCorp->getSize()) << std::endl; XenIO::writeBiOutput(sD->getSourceCorps()->getPtrOutCorp(), ptrOTCorp, sD->getScHold()->getPtrScores()); } return 0; }
void XenIO::writeMonoOutput(boost::shared_ptr<Corpus> ptrCorp, boost::shared_ptr<Score> ptrScore) { XenOption* opt = XenOption::getInstance(); std::string scoredName = opt->getOutName() + ".scored.gz"; std::string sortedName = opt->getOutName() + ".sorted.gz"; std::multimap<double, std::string> sortMap; for (unsigned int i = 0; i < ptrCorp->getSize(); i++) if (ptrCorp->getPrint(i) && ptrScore->getPrint(i)) { std::pair<double, std::string> p(ptrScore->getScore(i), ptrCorp->getLine(i)); sortMap.insert(p); } try { if (!opt->getSortOnly()) { std::cout << "Writing scored output to " + scoredName << std::endl; boost::iostreams::filtering_ostream out; out.push(boost::iostreams::gzip_compressor()); out.push(boost::iostreams::file_sink(scoredName.c_str(), std::ios_base::out | std::ios_base::binary)); out.setf(std::ios::fixed | std::ios::showpoint); out.precision(15); if (!out.good()) throw XenCommon::XenCEption("Something went wrong in output stream..."); for (unsigned int i = 0; i < ptrCorp->getSize(); i++) { if (ptrCorp->getPrint(i) && ptrScore->getPrint(i)) out << XenCommon::toString(ptrScore->getScore(i)) << '\t' << ptrCorp->getLine(i) << std::endl; if (out.bad()) throw XenCommon::XenCEption("Something went wrong in output stream..."); } out.flush(); out.reset(); } std::cout << "Writing sorted output to " + sortedName << std::endl; boost::iostreams::filtering_ostream out; out.push(boost::iostreams::gzip_compressor()); out.push(boost::iostreams::file_sink(sortedName.c_str(), std::ios_base::out | std::ios_base::binary)); out.setf(std::ios::fixed | std::ios::showpoint); out.precision(15); if (!out.good()) throw XenCommon::XenCEption("Something went wrong in output stream..."); if (opt->getRev()) { for (std::multimap<double, std::string>::reverse_iterator it = sortMap.rbegin(); it != sortMap.rend(); ++it) { out << XenCommon::toString(it->first) << '\t' << it->second << std::endl; if (out.bad()) throw XenCommon::XenCEption("Something went wrong in output stream..."); } } 
else { for (std::multimap<double, std::string>::iterator it = sortMap.begin(); it != sortMap.end(); ++it) { out << XenCommon::toString(it->first) << '\t' << it->second << std::endl; if (out.bad()) throw XenCommon::XenCEption("Something went wrong in output stream..."); } } out.flush(); out.reset(); } catch (XenCommon::XenCEption &e) { throw; } }
int main(int argc, char* argv[]) { po::options_description desc("XenC options", 200); Options opt; try { desc.add_options() ("source,s", po::value<std::string>(&opt.sLang)->required(), "source language (fr, en, ...)") ("target,t", po::value<std::string>(&opt.tLang)->default_value(""), "target language (if relevant)") ("in-stext,i", po::value<std::string>(&opt.inSData)->required(), "in-domain source text filename (plain text or gzipped file)") ("out-stext,o", po::value<std::string>(&opt.outSData)->required(), "out-of-domain source text filename (plain text or gzipped file)") ("mode,m", po::value<int>(&opt.mode)->required()->default_value(2), "filtering mode (1, 2, 3 or 4). Default is 2 (monolingual cross-entropy)") ("eval,e", po::value<bool>(&opt.eval)->zero_tokens()->default_value(false), "add this switch to evaluate a filtered file after computation. Eval is always done on source language") ("best-point,b", po::value<bool>(&opt.bp)->zero_tokens()->default_value(false), "add this switch to determinate the best point of a filtered file (eval option is implicit)") ("dev,d", po::value<std::string>(&opt.dev)->default_value(""), "source language dev file for eval or best point (all modes), if different from in-domain text") ("in-ttext", po::value<std::string>(&opt.inTData)->default_value(""), "in-domain target text filename, if target language (plain text or gzipped file)") ("out-ttext", po::value<std::string>(&opt.outTData)->default_value(""), "out-of-domain target text filename, if target language (plain text or gzipped file)") ("mono", po::value<bool>(&opt.mono)->zero_tokens()->default_value(false), "switch to force monolingual mode (if no target language)") ("stem", po::value<bool>(&opt.stem)->zero_tokens()->default_value(false), "switch to activate stem models computation and scoring from stem files") ("in-sstem", po::value<std::string>(&opt.inSStem)->default_value(""), "in-domain source stem filename (plain text or gzipped file)") ("in-tstem", 
po::value<std::string>(&opt.inTStem)->default_value(""), "in-domain target stem filename (plain text or gzipped file)") ("out-sstem", po::value<std::string>(&opt.outSStem)->default_value(""), "out-of-domain source stem filename (plain text or gzipped file)") ("out-tstem", po::value<std::string>(&opt.outTStem)->default_value(""), "out-of-domain target stem filename (plain text or gzipped file)") ("in-ptable", po::value<std::string>(&opt.iPTable)->default_value(""), "in-domain phrase table filename used in mode 4 scoring") ("out-ptable", po::value<std::string>(&opt.oPTable)->default_value(""), "out-of-domain phrase table filename used in mode 4 scoring") ("local", po::value<bool>(&opt.local)->zero_tokens()->default_value(false), "add a 7th score (local cross-entropy regarding the source phrase)") ("mean", po::value<bool>(&opt.mean)->zero_tokens()->default_value(false), "mean score from 3 OOD sample LMs instead of 1 in mode 2 & 3 (3 times slower + EXPERIMENTAL)") ("sim", po::value<bool>(&opt.sim)->zero_tokens()->default_value(false), "add similarity measures to score computing (EXPERIMENTAL, mode 2 only)") ("sim-only", po::value<bool>(&opt.simOnly)->zero_tokens()->default_value(false), "use only similarity measures (no cross-entropy)") ("vector-size", po::value<int>(&opt.vecSize)->default_value(150), "size of vector for similarity scores, default is 150 (WARNING: the more the slower)") ("step", po::value<int>(&opt.step)->default_value(10), "percentage steps for evaluation. Default is 10 (100%, 90%, ...)") ("s-vocab", po::value<std::string>(&opt.sVocab)->default_value(""), "source language vocab filename for LMs estimation. Default is in-domain source text vocab") ("t-vocab", po::value<std::string>(&opt.tVocab)->default_value(""), "target language vocab filename for LMs estimation. 
Default is in-domain target text vocab") ("full-vocab", po::value<bool>(&opt.fullVoc)->zero_tokens()->default_value(false), "use in-domain + out-of-domain vocabularies instead of in-domain only") ("in-slm", po::value<std::string>(&opt.inSLM)->default_value(""), "in-domain source language model (LM). Will be estimated if not present") ("out-slm", po::value<std::string>(&opt.outSLM)->default_value(""), "out-of-domain source language model (LM). Will be estimated if not present") ("in-tlm", po::value<std::string>(&opt.inTLM)->default_value(""), "in-domain target language model (LM). Will be estimated if not present") ("out-tlm", po::value<std::string>(&opt.outTLM)->default_value(""), "out-of-domain target language model (LM). Will be estimated if not present") ("order", po::value<int>(&opt.order)->default_value(4), "order for LMs. Default is 4") ("discount", po::value<int>(&opt.discount)->default_value(0), "discounting method for LM estimation. Default is modified KneserNey (0). 1 is GoodTuring, 2 is WittenBell.") ("to-lower", po::value<bool>(&opt.toLower)->default_value(false), "maps vocabulary to lower case for LM estimation. Useful for ASR. Default is false.") ("no-unkisword", po::value<bool>(&opt.noUnkIsWord)->default_value(false), "DO NOT consider <unk> and its probability as a word. Default is false, with respect to common practice.") ("bin-lm", po::value<int>(&opt.binLM)->default_value(1), "whether you want to estimate arpa.gz (0) or binary (1) LMs. 
Default is 1 (binary)") ("w-file", po::value<std::string>(&opt.wFile)->default_value(""), "filename for weighting the final score (one value per line)") ("log", po::value<bool>(&opt.log)->zero_tokens()->default_value(false), "switch to consider weights in w-file as log values") ("rev", po::value<bool>(&opt.rev)->zero_tokens()->default_value(false), "switch to require descending order sorted output") ("inv", po::value<bool>(&opt.inv)->zero_tokens()->default_value(false), "switch to require inversed calibrated scores (1 - score)") ("threads", po::value<int>(&opt.threads)->default_value(2), "number of threads to run for various operations (eval, sim, ...). Default is 2") ("sorted-only", po::value<bool>(&opt.sortOnly)->zero_tokens()->default_value(false), "switch to save space & time by only outputing the sorted scores file") ("help,h", "displays this help message") ("version,v", "displays program version"); po::variables_map vm; try { po::store(po::parse_command_line(argc, argv, desc), vm); if (vm.count("help") || argc == 1) { std::cout << "XenC version " + version + " PUBLIC RELEASE. Copyright 2013, Anthony Rousseau, LIUM, University of Le Mans, France." << std::endl << std::endl; std::cout << desc << std::endl; std::cout << "Filtering modes:" << std::endl << std::endl; std::cout << "For all modes (excepted 4), you must provide at least a source language, and in-domain and out-of-domain bitexts. Bitexts MUST NOT contain tabs." << std::endl; std::cout << "For every text file used, max words per line is 16384 and max chars per line is max words * 16." << std::endl << std::endl; std::cout << "Also, if no vocabularies and no language models are provided, they will be generated with the following parameters:" << std::endl; std::cout << "\t- vocabs:\tvocabularies will be created from words of in-domain bitexts." << std::endl; std::cout << "\t- LMs:\t\torder 4, modified kn-int smoothing, 0-0-0-0 cut-offs, sblm (binary) output format." 
<< std::endl << std::endl; std::cout << "\t1:" << std::endl; std::cout << "\tSimple source language perplexity filtering. (Gao & al. 2002)" << std::endl; std::cout << "\tWill sort the out-of-domain bitext sentences (ascending order)" << std::endl; std::cout << "\tbased on perplexity scores given by a in-domain language model." << std::endl << std::endl; std::cout << "\t2:" << std::endl; std::cout << "\tSource language cross-entropy (Xen) difference filtering. (Moore & Lewis 2010)" << std::endl; std::cout << "\tWill sort the out-of-domain bitext sentences (ascending order)" << std::endl; std::cout << "\tbased on (in-source Xen - out-source Xen)." << std::endl << std::endl; std::cout << "\t3:" << std::endl; std::cout << "\tBilingual cross-entropy difference filtering. (Axelrod & al. 2011)" << std::endl; std::cout << "\tWill sort the out-of-domain bitext sentences (ascending order)" << std::endl; std::cout << "\tbased on (in-source Xen - out-source Xen) + (in-target Xen - out-target Xen)." << std::endl << std::endl; std::cout << "\t4:" << std::endl; std::cout << "\tPhrase-table scoring mode. (EXPERIMENTAL)" << std::endl; std::cout << "\tAdds the cross-entropy score of each phrase pair" << std::endl; std::cout << "\tin a phrase-table as a sixth feature of the table." << std::endl << std::endl; std::cout << "\tYou must provide:" << std::endl; std::cout << "\t\t- in-domain and out-of-domain phrase tables." << std::endl; std::cout << "\t\t- source and target vocabularies." << std::endl << std::endl; return 0; } if (vm.count("version")) { std::cout << "XenC version " + version + " PUBLIC RELEASE. Copyright 2013, Anthony Rousseau, LIUM, University of Le Mans, France." 
<< std::endl; return 0; } po::notify(vm); } catch (po::error& e) { std::cout << desc << std::endl; std::cout << e.what() << std::endl; } } catch (std::exception& e) { std::cout << desc << std::endl; std::cout << e.what() << std::endl; } opt.pc = 0; opt.inToks = 0; opt.outToks = 0; if (opt.dev.compare("") == 0) { if (boost::filesystem::exists(opt.inSData)) { opt.dev = opt.inSData; } else { std::cerr << "You must at least specify a source in-domain corpus." << std::endl; return 1; } } // ----------------------------------------------------- // Create singletons & mode XenOption* xOpt = XenOption::getInstance(&opt); StaticData* sD = StaticData::getInstance(); boost::shared_ptr<Mode> mode; switch (xOpt->getMode()) { case 1: mode = boost::make_shared<SimplePPL>(); break; case 2: mode = boost::make_shared<MonoXEntropy>(); break; case 3: mode = boost::make_shared<BiXEntropy>(); break; case 4: mode = boost::make_shared<PTScoring>(); break; default: break; } // ----------------------------------------------------- opt.outName = getOutName(xOpt); std::string sC = sanityCheck(xOpt); // Check if all mandatory are here // ----------------------------------------------------- // LAST CHECK BEFORE OPERATIONS if (sC.compare("0") == 0) { std::cout << "Source language: " << opt.sLang << std::endl; if (!opt.mono) { std::cout << "Target language: " << opt.tLang << std::endl; } if (opt.mode != 4) { std::cout << "In-domain source data: " << opt.inSData << std::endl; std::cout << "Out-of-domain source data: " << opt.outSData << std::endl; if (opt.stem) { std::cout << "In-domain source stem file: " << opt.inSStem << std::endl; std::cout << "Out-of-domain source stem file: " << opt.outSStem << std::endl; } if (!opt.mono) { std::cout << "In-domain target data: " << opt.inTData << std::endl; std::cout << "Out-of-domain target data: " << opt.outTData << std::endl; if (opt.stem) { std::cout << "In-domain target stem file: " << opt.inTStem << std::endl; std::cout << "Out-of-domain target stem 
file: " << opt.outTStem << std::endl; } } } else { std::cout << "Source vocabulary: " << opt.sVocab << std::endl; std::cout << "Target vocabulary: " << opt.tVocab << std::endl; std::cout << "In-domain phrase table: " << opt.iPTable << std::endl; std::cout << "Out-of-domain phrase table: " << opt.oPTable << std::endl; std::cout << "Output phrase table: " << opt.outName << std::endl; } std::cout << "Mode: " << opt.mode << std::endl; } else { std::cerr << std::endl << sC << std::endl; sD->deleteInstance(); xOpt->deleteInstance(); return 1; } // ----------------------------------------------------- try { // Normal mode if (!xOpt->getEval()&& !xOpt->getBp()) { int ret = mode->launch(); if (ret == 0) { xOpt->deleteInstance(); sD->deleteInstance(); return 0; } else { std::cerr << "Something went wrong." << std::endl; xOpt->deleteInstance(); sD->deleteInstance(); return 1; } } // Eval or BP else { std::string sortedName = xOpt->getOutName() + ".sorted.gz"; std::string distName = xOpt->getOutName() + ".dist"; std::string bpName = xOpt->getOutName() + ".bp"; std::cout << "Sorted output used: " + sortedName << std::endl; // ----------------------------------------------------- // Proceed to normal mode if not done before if (!boost::filesystem::exists(sortedName)) { int ret = mode->launch(); if (ret != 0) { std::cerr << "Something went wrong." 
<< std::endl; xOpt->deleteInstance(); sD->deleteInstance(); return 1; } } // ----------------------------------------------------- boost::shared_ptr<XenFile> sorted = boost::make_shared<XenFile>(); sorted->initialize(sortedName); sD->getXenResult()->initialize(sorted); sD->getDevCorp()->initialize(xOpt->getDev(), xOpt->getSLang()); // Eval if (xOpt->getEval()) { boost::shared_ptr<Eval> ptrEval = boost::make_shared<Eval>(); ptrEval->doEval(100, 0); int oldStep = xOpt->getStep(); xOpt->setStep(2); ptrEval->doEval(8, 0); xOpt->setStep(oldStep); XenIO::writeEval(ptrEval->getDist(), distName); } // BP else if (xOpt->getBp()) { boost::shared_ptr<Eval> ptrEval; if (boost::filesystem::exists(distName.c_str())) ptrEval = boost::make_shared<Eval>(distName); else { ptrEval = boost::make_shared<Eval>(); ptrEval->doEval(100, 0); int oldStep = xOpt->getStep(); xOpt->setStep(2); ptrEval->doEval(8, 0); xOpt->setStep(oldStep); XenIO::writeEval(ptrEval->getDist(), distName); } ptrEval->doBP(); XenIO::writeEval(ptrEval->getDist(), bpName); } else { return 1; } } } catch (XenCommon::XenCEption &e) { throw; } xOpt->deleteInstance(); sD->deleteInstance(); return 0; }
int XenLMken::createLM() { XenOption* opt = XenOption::getInstance(); if (boost::filesystem::exists(lmFile)) std::cout << "LM file already here, reusing..." << std::endl; else { lm::builder::PipelineConfig pipeline; std::string text, intermediate, arpa; std::vector<std::string> discount_fallback; discount_fallback.push_back("0.5"); discount_fallback.push_back("1"); discount_fallback.push_back("1.5"); bool verbose_header = true; pipeline.order = (size_t) opt->getOrder(); pipeline.initial_probs.interpolate_unigrams = true; pipeline.sort.temp_prefix = temp; pipeline.sort.total_memory = opt->getMemPc(); pipeline.minimum_block = opt->getMinBlk(); pipeline.sort.buffer_size = opt->getSortBlk(); pipeline.block_count = 2; pipeline.vocab_estimate = 1000000; pipeline.prune_vocab_file = ptrVoc->getXenFile()->getFullPath(); pipeline.prune_vocab = true; pipeline.vocab_size_for_unk = 0; pipeline.disallowed_symbol_action = lm::THROW_UP; lm::builder::Discount dis; dis.amount[0] = 0.0; for (unsigned i = 0; i < 3; ++i) { float discount = boost::lexical_cast<float>(discount_fallback[i < discount_fallback.size() ? 
i : (discount_fallback.size() - 1)]); UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "]."); dis.amount[i + 1] = discount; } pipeline.discount.fallback = dis; pipeline.discount.bad_action = lm::COMPLAIN; pipeline.prune_thresholds.resize(order, 0); util::NormalizeTempPrefix(pipeline.sort.temp_prefix); lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; initial.adder_in.total_memory = 32768; initial.adder_in.block_count = 2; initial.adder_out.total_memory = 32768; initial.adder_out.block_count = 2; pipeline.read_backoffs = initial.adder_out; pipeline.renumber_vocabulary = false; pipeline.output_q = false; text = textFile; arpa = lmFile; util::scoped_fd in(util::OpenReadOrThrow(text.c_str())); util::scoped_fd out(util::CreateOrThrow(arpa.c_str())); try { lm::builder::Output output(pipeline.sort.temp_prefix, false, pipeline.output_q); output.Add(new lm::builder::PrintHook(out.release(), verbose_header)); lm::builder::Pipeline(pipeline, in.release(), output); } catch (const util::MallocException &e) { std::cerr << e.what() << std::endl; std::cerr << "Try rerunning with a more conservative -S setting than " << XenCommon::toString(mempc) << std::endl; } std::cout << "LM estimation done." << std::endl; } return 0; }