Ejemplo n.º 1
0
void Eval::doBP() {
    XenOption* opt = XenOption::getInstance();
    
	double lower = 999999999999;
	int lBound = 0;
	int hBound = 0;

	for(EvalMap::iterator it = ptrDist->begin() ; it != ptrDist->end() ; ++it)
        if (it->second < lower) {
            lower = it->second;
            EvalMap::iterator itH = it;
            if (it != ptrDist->begin())
                --itH;
            EvalMap::iterator itL = it;
            if (it != ptrDist->end())
                ++itL;
            lBound = itL->first;
            hBound = itH->first;
            if (lBound > it->first) { lBound = 0; }
            if (hBound < it->first) { hBound = opt->getMaxEvalPC(); }
        }
    
	opt->setStep(opt->getStep() / 2);
    std::cout << "Requiring eval from " + XenCommon::toString(hBound) + " to " + XenCommon::toString(lBound) + " with step " + XenCommon::toString(opt->getStep()) << std::endl;
	doEval(hBound, lBound);
    
	if (opt->getStep() > 1) {
		doBP();
	}
}
Ejemplo n.º 2
0
XenLMken::XenLMken() {
    XenOption* opt = XenOption::getInstance();

    order = (unsigned int) opt->getOrder();
    mempc = opt->getMemPc();
    temp = opt->getTemp();

    textFile = NULL;
    lmFile = NULL;
    
    pc = 0;
}
Ejemplo n.º 3
0
void Similarity::buildIDVector() {
    std::cout << "Building in-domain vector." << std::endl;
    
    XenOption* opt = XenOption::getInstance();
    
    std::map<std::string, float> meanTFIDF;
    
    for (unsigned int i = 0; i < ptrWords->size(); i++)
        meanTFIDF[ptrWords->operator[](i)] = (float)(ptrIdTfIdf->operator[](ptrWords->operator[](i)) + ptrOodTfIdf->operator[](ptrWords->operator[](i))) / (float)2;
    
    std::multimap<float, std::string, std::greater<float> > meanTFIDFByScore = XenCommon::flip_map(meanTFIDF);
    
    int count = 0;
    float idFreqThres = (float)ptrID->getSize() / 300; // Threshold 0,33% of words is the same
    float oodFreqThres = (float)ptrOOD->getSize() / 300; // Threshold 0,33% of words is the same
    
    for (std::multimap<float, std::string, std::greater<float> >::iterator it = meanTFIDFByScore.begin(); count < opt->getVecSize(); ++it) {
        count++;
        
        int idTf = ptrIdTf->operator[](it->second);
        int oodTf = ptrOodTf->operator[](it->second);
        
        if (idTf > idFreqThres || oodTf > oodFreqThres) {
            count--;
        }
        else {
            ptrIdVecTfIdf->operator[](count - 1) = it->first;
            ptrVecWords->operator[](count - 1) = it->second;
        }
    }
    
    /** @todo DEBUG, needs more testing */
    std::string outName = ptrID->getXenFile()->getPrefix() + ".vec";
    
    std::ofstream out(outName.c_str(), std::ios::out | std::ios::trunc);
    
    for (int i = 0; i < opt->getVecSize(); i++) {
        out << ptrVecWords->operator[](i) << "\t\t" << ptrIdVecTfIdf->operator[](i) << std::endl;
    }
    
    out.close();
    
    std::cout << "Done building in-domain vector." << std::endl;
}
Ejemplo n.º 4
0
void XenIO::writeNewPT(boost::shared_ptr<PhraseTable> ptrPT, boost::shared_ptr<Score> ptrScore) {
    XenOption* opt = XenOption::getInstance();
    Score sc2;
    
    if (opt->getLocal()) {
        std::vector<SourcePhrase> srcPh = ptrPT->getSrcPhrases();
    
        for (unsigned int i = 0; i < srcPh.size(); i++) {
            SourcePhrase sP = srcPh[i];
        
            for (unsigned int j = 0; j < sP.getScoresXE()->getSize(); j++) {
                sc2.addScore(sP.getScoresXE()->getScore(j));
            }
        }
    }
    
    try {
        std::cout << "Writing new phrase-table to " + opt->getOutName() << std::endl;
        
        std::string oF = opt->getOutName();
        
        boost::iostreams::filtering_ostream out;
        out.push(boost::iostreams::gzip_compressor());
        out.push(boost::iostreams::file_sink(oF.c_str(), std::ios_base::out | std::ios_base::binary));
        
        if (!out.good())
            throw XenCommon::XenCEption("Can't write to " + opt->getOutName() + ".gz");

        for (unsigned int i = 0; i < ptrScore->getSize(); i++) {
            out << ptrPT->getSource(i) << " ||| " << ptrPT->getTarget(i) << " ||| " << ptrPT->getScores(i) << " " << XenCommon::toString(ptrScore->getScore(i));
            if (opt->getLocal())
                out << " " << XenCommon::toString(sc2.getScore(i));
            out << " ||| " << ptrPT->getAlignment(i) << " ||| " << ptrPT->getCounts(i) << std::endl;
            
            if (out.bad())
                throw XenCommon::XenCEption("Something went wrong in output stream...");
        }
        
        out.flush();
        out.reset();
    } catch (XenCommon::XenCEption &e) {
        throw;
    }
}
Ejemplo n.º 5
0
void Eval::doEval(int high, int low) {
    XenOption* opt = XenOption::getInstance();
    StaticData* sD = StaticData::getInstance();
    
    if (opt->getSVocab()->getFileName().compare("") == 0) { sD->getVocabs()->getPtrSourceVoc()->initialize(sD->getSourceCorps()->getPtrInCorp()); }
    else { sD->getVocabs()->getPtrSourceVoc()->initialize(opt->getSVocab()); }
    
    pool threadPool(opt->getThreads());
    
    int pc = low;
    if (pc == 0) { pc = opt->getStep(); }

    std::vector<std::string> parts;

	while (pc <= high) {
        EvalMap::iterator found = ptrDist->find(pc);
        if (found == ptrDist->end()) {
            std::string partName = sD->getXenResult()->getXenFile()->getDirName() + "/" + sD->getXenResult()->getXenFile()->getPrefix() + "-" + XenCommon::toString(pc) + "pc.gz";
            XenIO::writeXRpart(sD->getXenResult(), pc, partName);

            boost::shared_ptr<Corpus> c = boost::make_shared<Corpus>();
            c->initialize(partName, "xx");

            threadPool.schedule(
                    boost::bind(taskEval, pc, c, sD->getVocabs()->getPtrSourceVoc(), sD->getDevCorp(),
                                ptrDist));

            parts.push_back(partName);
        }
        pc += opt->getStep();
	}
    
    threadPool.wait();

    for (int i = 0; i < parts.size(); i++)
        XenIO::delFile(parts[i]);

    std::cout << "Evaluation done." << std::endl;
}
Ejemplo n.º 6
0
int SimplePPL::launch() {
    XenOption* opt = XenOption::getInstance();
    StaticData* sD = StaticData::getInstance();
    
    sD->getSourceCorps()->getPtrInCorp()->initialize(opt->getInSData(), opt->getSLang());
    sD->getSourceCorps()->getPtrOutCorp()->initialize(opt->getOutSData(), opt->getSLang());
    
    if (opt->getSVocab()->getFileName().compare("") == 0) {
        if (opt->getFullVocab())
            sD->getVocabs()->getPtrSourceVoc()->initialize(sD->getSourceCorps()->getPtrInCorp(), sD->getSourceCorps()->getPtrOutCorp());
        else
            sD->getVocabs()->getPtrSourceVoc()->initialize(sD->getSourceCorps()->getPtrInCorp());
    }
    else
        sD->getVocabs()->getPtrSourceVoc()->initialize(opt->getSVocab());
    
    if (opt->getInSLM()->getFileName().compare("") == 0) {
        sD->getSourceLMs()->getPtrInLM()->initialize(sD->getSourceCorps()->getPtrInCorp(), sD->getVocabs()->getPtrSourceVoc());
        sD->getSourceLMs()->getPtrInLM()->createLM();
        sD->getSourceLMs()->getPtrInLM()->writeLM();
    }
    else {
        sD->getSourceLMs()->getPtrInLM()->initialize(opt->getInSLM(), sD->getVocabs()->getPtrSourceVoc());
        sD->getSourceLMs()->getPtrInLM()->loadLM();
    }
    
    if (!boost::filesystem::exists(sD->getSourceLMs()->getPtrInLM()->getFileName())) {
        std::cout << "Error: LM file " + sD->getSourceLMs()->getPtrInLM()->getFileName() + " does not exists!" << std::endl;
        return 1;
    }
    
    sD->getSourcePPLs()->getPtrInPPL()->initialize(sD->getSourceCorps()->getPtrOutCorp(), sD->getSourceLMs()->getPtrInLM());
    sD->getSourcePPLs()->getPtrInPPL()->calcPPLCorpus();
    
    if (opt->getWFile()->getFileName().compare("") != 0)
        sD->getWeightsFile()->initialize(opt->getWFile());
    
    for (unsigned int i = 0; i < sD->getSourcePPLs()->getPtrInPPL()->getSize(); i++) {
        double res = sD->getSourcePPLs()->getPtrInPPL()->getPPL(i);
        
        if (opt->getWFile()->getFileName().compare("") != 0)
            res = res * sD->getWeightsFile()->getWeight(i);
        
        sD->getScHold()->getPtrScores()->addScore(res);
    }
    
    sD->getScHold()->getPtrScores()->calibrate();
    if (opt->getInv()) { sD->getScHold()->getPtrScores()->inverse(); }
    
    if (opt->getTLang().compare("") == 0) {
        std::cout << "NB Scores: " + XenCommon::toString(sD->getScHold()->getPtrScores()->getSize()) + " NB Source corp (unclean): " + XenCommon::toString(sD->getSourceCorps()->getPtrOutCorp()->getSize()) << std::endl;
        XenIO::cleanCorpusMono(sD->getSourceCorps()->getPtrOutCorp(), sD->getScHold()->getPtrScores());
        std::cout << "NB Scores: " + XenCommon::toString(sD->getScHold()->getPtrScores()->getSize()) + " NB Source corp (clean): " + XenCommon::toString(sD->getSourceCorps()->getPtrOutCorp()->getSize()) << std::endl;
        XenIO::writeMonoOutput(sD->getSourceCorps()->getPtrOutCorp(), sD->getScHold()->getPtrScores());
    }
    else {
        boost::shared_ptr<Corpus> ptrOTCorp = boost::make_shared<Corpus>();
        ptrOTCorp->initialize(opt->getOutTData(), opt->getTLang());
        std::cout << "NB Scores: " + XenCommon::toString(sD->getScHold()->getPtrScores()->getSize()) + " NB Source corp (unclean): " + XenCommon::toString(sD->getSourceCorps()->getPtrOutCorp()->getSize()) + " NB Target corp (unclean): " + XenCommon::toString(ptrOTCorp->getSize()) << std::endl;
        XenIO::cleanCorpusBi(sD->getSourceCorps()->getPtrOutCorp(), ptrOTCorp, sD->getScHold()->getPtrScores());
        std::cout << "NB Scores: " + XenCommon::toString(sD->getScHold()->getPtrScores()->getSize()) + " NB Source corp (clean): " + XenCommon::toString(sD->getSourceCorps()->getPtrOutCorp()->getSize()) + " NB Target corp (clean): " + XenCommon::toString(ptrOTCorp->getSize()) << std::endl;
        XenIO::writeBiOutput(sD->getSourceCorps()->getPtrOutCorp(), ptrOTCorp, sD->getScHold()->getPtrScores());
    }
    
    return 0;
}
Ejemplo n.º 7
0
void XenIO::writeMonoOutput(boost::shared_ptr<Corpus> ptrCorp, boost::shared_ptr<Score> ptrScore) {
    XenOption* opt = XenOption::getInstance();
    
    std::string scoredName = opt->getOutName() + ".scored.gz";
    std::string sortedName = opt->getOutName() + ".sorted.gz";
    
    std::multimap<double, std::string> sortMap;
    
    for (unsigned int i = 0; i < ptrCorp->getSize(); i++)
        if (ptrCorp->getPrint(i) && ptrScore->getPrint(i)) {
            std::pair<double, std::string> p(ptrScore->getScore(i), ptrCorp->getLine(i));
            sortMap.insert(p);
        }
    
    try {
        if (!opt->getSortOnly()) {
            std::cout << "Writing scored output to " + scoredName << std::endl;
            
            
            boost::iostreams::filtering_ostream out;
            out.push(boost::iostreams::gzip_compressor());
            out.push(boost::iostreams::file_sink(scoredName.c_str(), std::ios_base::out | std::ios_base::binary));
            out.setf(std::ios::fixed | std::ios::showpoint);
            out.precision(15);
            
            if (!out.good())
                throw XenCommon::XenCEption("Something went wrong in output stream...");
            
            for (unsigned int i = 0; i < ptrCorp->getSize(); i++) {
                if (ptrCorp->getPrint(i) && ptrScore->getPrint(i))
                    out << XenCommon::toString(ptrScore->getScore(i)) << '\t' << ptrCorp->getLine(i) << std::endl;
            
                if (out.bad())
                    throw XenCommon::XenCEption("Something went wrong in output stream...");
            }
            
            out.flush();
            out.reset();
        }
        
        std::cout << "Writing sorted output to " + sortedName << std::endl;
        
        boost::iostreams::filtering_ostream out;
        out.push(boost::iostreams::gzip_compressor());
        out.push(boost::iostreams::file_sink(sortedName.c_str(), std::ios_base::out | std::ios_base::binary));
        out.setf(std::ios::fixed | std::ios::showpoint);
        out.precision(15);
        
        if (!out.good())
            throw XenCommon::XenCEption("Something went wrong in output stream...");
        
        if (opt->getRev()) {
            for (std::multimap<double, std::string>::reverse_iterator it = sortMap.rbegin(); it != sortMap.rend(); ++it) {
                out << XenCommon::toString(it->first) << '\t' << it->second << std::endl;
                
                if (out.bad())
                    throw XenCommon::XenCEption("Something went wrong in output stream...");
            }
        }
        else {
            for (std::multimap<double, std::string>::iterator it = sortMap.begin(); it != sortMap.end(); ++it) {
                out << XenCommon::toString(it->first) << '\t' << it->second << std::endl;
                
                if (out.bad())
                    throw XenCommon::XenCEption("Something went wrong in output stream...");
            }
        }

        out.flush();
        out.reset();
    } catch (XenCommon::XenCEption &e) {
        throw;
    }
}
Ejemplo n.º 8
0
int main(int argc, char* argv[]) {
    po::options_description desc("XenC options", 200);
    Options opt;
    
    try {
        desc.add_options()
        ("source,s", po::value<std::string>(&opt.sLang)->required(), "source language (fr, en, ...)")
        ("target,t", po::value<std::string>(&opt.tLang)->default_value(""), "target language (if relevant)")
        ("in-stext,i", po::value<std::string>(&opt.inSData)->required(), "in-domain source text filename (plain text or gzipped file)")
        ("out-stext,o", po::value<std::string>(&opt.outSData)->required(), "out-of-domain source text filename (plain text or gzipped file)")
        ("mode,m", po::value<int>(&opt.mode)->required()->default_value(2), "filtering mode (1, 2, 3 or 4). Default is 2 (monolingual cross-entropy)")
        ("eval,e", po::value<bool>(&opt.eval)->zero_tokens()->default_value(false), "add this switch to evaluate a filtered file after computation. Eval is always done on source language")
        ("best-point,b", po::value<bool>(&opt.bp)->zero_tokens()->default_value(false), "add this switch to determinate the best point of a filtered file (eval option is implicit)")
        ("dev,d", po::value<std::string>(&opt.dev)->default_value(""), "source language dev file for eval or best point (all modes), if different from in-domain text")
        ("in-ttext", po::value<std::string>(&opt.inTData)->default_value(""), "in-domain target text filename, if target language (plain text or gzipped file)")
        ("out-ttext", po::value<std::string>(&opt.outTData)->default_value(""), "out-of-domain target text filename, if target language (plain text or gzipped file)")
        ("mono", po::value<bool>(&opt.mono)->zero_tokens()->default_value(false), "switch to force monolingual mode (if no target language)")
        ("stem", po::value<bool>(&opt.stem)->zero_tokens()->default_value(false), "switch to activate stem models computation and scoring from stem files")
        ("in-sstem", po::value<std::string>(&opt.inSStem)->default_value(""), "in-domain source stem filename (plain text or gzipped file)")
        ("in-tstem", po::value<std::string>(&opt.inTStem)->default_value(""), "in-domain target stem filename (plain text or gzipped file)")
        ("out-sstem", po::value<std::string>(&opt.outSStem)->default_value(""), "out-of-domain source stem filename (plain text or gzipped file)")
        ("out-tstem", po::value<std::string>(&opt.outTStem)->default_value(""), "out-of-domain target stem filename (plain text or gzipped file)")
        ("in-ptable", po::value<std::string>(&opt.iPTable)->default_value(""), "in-domain phrase table filename used in mode 4 scoring")
        ("out-ptable", po::value<std::string>(&opt.oPTable)->default_value(""), "out-of-domain phrase table filename used in mode 4 scoring")
        ("local", po::value<bool>(&opt.local)->zero_tokens()->default_value(false), "add a 7th score (local cross-entropy regarding the source phrase)")
        ("mean", po::value<bool>(&opt.mean)->zero_tokens()->default_value(false), "mean score from 3 OOD sample LMs instead of 1 in mode 2 & 3 (3 times slower + EXPERIMENTAL)")
        ("sim", po::value<bool>(&opt.sim)->zero_tokens()->default_value(false), "add similarity measures to score computing (EXPERIMENTAL, mode 2 only)")
        ("sim-only", po::value<bool>(&opt.simOnly)->zero_tokens()->default_value(false), "use only similarity measures (no cross-entropy)")
        ("vector-size", po::value<int>(&opt.vecSize)->default_value(150), "size of vector for similarity scores, default is 150 (WARNING: the more the slower)")
        ("step", po::value<int>(&opt.step)->default_value(10), "percentage steps for evaluation. Default is 10 (100%, 90%, ...)")
        ("s-vocab", po::value<std::string>(&opt.sVocab)->default_value(""), "source language vocab filename for LMs estimation. Default is in-domain source text vocab")
        ("t-vocab", po::value<std::string>(&opt.tVocab)->default_value(""), "target language vocab filename for LMs estimation. Default is in-domain target text vocab")
        ("full-vocab", po::value<bool>(&opt.fullVoc)->zero_tokens()->default_value(false), "use in-domain + out-of-domain vocabularies instead of in-domain only")
        ("in-slm", po::value<std::string>(&opt.inSLM)->default_value(""), "in-domain source language model (LM). Will be estimated if not present")
        ("out-slm", po::value<std::string>(&opt.outSLM)->default_value(""), "out-of-domain source language model (LM). Will be estimated if not present")
        ("in-tlm", po::value<std::string>(&opt.inTLM)->default_value(""), "in-domain target language model (LM). Will be estimated if not present")
        ("out-tlm", po::value<std::string>(&opt.outTLM)->default_value(""), "out-of-domain target language model (LM). Will be estimated if not present")
        ("order", po::value<int>(&opt.order)->default_value(4), "order for LMs. Default is 4")
        ("discount", po::value<int>(&opt.discount)->default_value(0), "discounting method for LM estimation. Default is modified KneserNey (0). 1 is GoodTuring, 2 is WittenBell.")
        ("to-lower", po::value<bool>(&opt.toLower)->default_value(false), "maps vocabulary to lower case for LM estimation. Useful for ASR. Default is false.")
        ("no-unkisword", po::value<bool>(&opt.noUnkIsWord)->default_value(false), "DO NOT consider <unk> and its probability as a word. Default is false, with respect to common practice.")
        ("bin-lm", po::value<int>(&opt.binLM)->default_value(1), "whether you want to estimate arpa.gz (0) or binary (1) LMs. Default is 1 (binary)")
        ("w-file", po::value<std::string>(&opt.wFile)->default_value(""), "filename for weighting the final score (one value per line)")
        ("log", po::value<bool>(&opt.log)->zero_tokens()->default_value(false), "switch to consider weights in w-file as log values")
        ("rev", po::value<bool>(&opt.rev)->zero_tokens()->default_value(false), "switch to require descending order sorted output")
        ("inv", po::value<bool>(&opt.inv)->zero_tokens()->default_value(false), "switch to require inversed calibrated scores (1 - score)")
        ("threads", po::value<int>(&opt.threads)->default_value(2), "number of threads to run for various operations (eval, sim, ...). Default is 2")
        ("sorted-only", po::value<bool>(&opt.sortOnly)->zero_tokens()->default_value(false), "switch to save space & time by only outputing the sorted scores file")
        ("help,h", "displays this help message")
        ("version,v", "displays program version");
        
        po::variables_map vm;
        
        try {
            po::store(po::parse_command_line(argc, argv, desc), vm);
            
            if (vm.count("help") || argc == 1) {
                std::cout << "XenC version " + version + " PUBLIC RELEASE. Copyright 2013, Anthony Rousseau, LIUM, University of Le Mans, France." << std::endl << std::endl;
                
                std::cout << desc << std::endl;
                
                std::cout << "Filtering modes:" << std::endl << std::endl;
                std::cout << "For all modes (excepted 4), you must provide at least a source language, and in-domain and out-of-domain bitexts. Bitexts MUST NOT contain tabs." << std::endl;
                std::cout << "For every text file used, max words per line is 16384 and max chars per line is max words * 16." << std::endl << std::endl;
                std::cout << "Also, if no vocabularies and no language models are provided, they will be generated with the following parameters:" << std::endl;
                std::cout << "\t- vocabs:\tvocabularies will be created from words of in-domain bitexts." << std::endl;
                std::cout << "\t- LMs:\t\torder 4, modified kn-int smoothing, 0-0-0-0 cut-offs, sblm (binary) output format." << std::endl << std::endl;
                std::cout << "\t1:" << std::endl;
                std::cout << "\tSimple source language perplexity filtering. (Gao & al. 2002)" << std::endl;
                std::cout << "\tWill sort the out-of-domain bitext sentences (ascending order)" << std::endl;
                std::cout << "\tbased on perplexity scores given by a in-domain language model." << std::endl << std::endl;
                std::cout << "\t2:" << std::endl;
                std::cout << "\tSource language cross-entropy (Xen) difference filtering. (Moore & Lewis 2010)" << std::endl;
                std::cout << "\tWill sort the out-of-domain bitext sentences (ascending order)" << std::endl;
                std::cout << "\tbased on (in-source Xen - out-source Xen)." << std::endl << std::endl;
                std::cout << "\t3:" << std::endl;
                std::cout << "\tBilingual cross-entropy difference filtering. (Axelrod & al. 2011)" << std::endl;
                std::cout << "\tWill sort the out-of-domain bitext sentences (ascending order)" << std::endl;
                std::cout << "\tbased on (in-source Xen - out-source Xen) + (in-target Xen - out-target Xen)." << std::endl << std::endl;
                std::cout << "\t4:" << std::endl;
                std::cout << "\tPhrase-table scoring mode. (EXPERIMENTAL)" << std::endl;
                std::cout << "\tAdds the cross-entropy score of each phrase pair" << std::endl;
                std::cout << "\tin a phrase-table as a sixth feature of the table." << std::endl << std::endl;
                std::cout << "\tYou must provide:" << std::endl;
                std::cout << "\t\t- in-domain and out-of-domain phrase tables." << std::endl;
                std::cout << "\t\t- source and target vocabularies." << std::endl << std::endl;
                
                return 0;
            }
            
            if (vm.count("version")) {
                std::cout << "XenC version " + version + " PUBLIC RELEASE. Copyright 2013, Anthony Rousseau, LIUM, University of Le Mans, France." << std::endl;
                return 0;
            }
            
            po::notify(vm);
        } catch (po::error& e) {
            std::cout << desc << std::endl;
            std::cout << e.what() << std::endl;
        }
    } catch (std::exception& e) {
        std::cout << desc << std::endl;
        std::cout << e.what() << std::endl;
    }

    opt.pc = 0;
    opt.inToks = 0;
    opt.outToks = 0;
    
    if (opt.dev.compare("") == 0) {
        if (boost::filesystem::exists(opt.inSData)) {
            opt.dev = opt.inSData;
        }
        else {
            std::cerr << "You must at least specify a source in-domain corpus." << std::endl;
            return 1;
        }
    }
    
    // -----------------------------------------------------
    // Create singletons & mode
    XenOption* xOpt = XenOption::getInstance(&opt);
    StaticData* sD = StaticData::getInstance();
    
    boost::shared_ptr<Mode> mode;
    
    switch (xOpt->getMode()) {
        case 1:
            mode = boost::make_shared<SimplePPL>();
            break;
            
        case 2:
            mode = boost::make_shared<MonoXEntropy>();
            break;
            
        case 3:
            mode = boost::make_shared<BiXEntropy>();
            break;
            
        case 4:
            mode = boost::make_shared<PTScoring>();
            break;
            
        default:
            break;
    }
    // -----------------------------------------------------
    
    opt.outName = getOutName(xOpt);
    std::string sC = sanityCheck(xOpt);          // Check if all mandatory are here

    // -----------------------------------------------------
    // LAST CHECK BEFORE OPERATIONS
	if (sC.compare("0") == 0) {
		std::cout << "Source language: " << opt.sLang << std::endl;
		if (!opt.mono) { std::cout << "Target language: " << opt.tLang << std::endl; }
        if (opt.mode != 4) {
            std::cout << "In-domain source data: " << opt.inSData << std::endl;
            std::cout << "Out-of-domain source data: " << opt.outSData << std::endl;
            if (opt.stem) {
                std::cout << "In-domain source stem file: " << opt.inSStem << std::endl;
                std::cout << "Out-of-domain source stem file: " << opt.outSStem << std::endl;
            }
            if (!opt.mono) {
                std::cout << "In-domain target data: " << opt.inTData << std::endl;
                std::cout << "Out-of-domain target data: " << opt.outTData << std::endl;
                if (opt.stem) {
                    std::cout << "In-domain target stem file: " << opt.inTStem << std::endl;
                    std::cout << "Out-of-domain target stem file: " << opt.outTStem << std::endl;
                }
            }
        }
        else {
            std::cout << "Source vocabulary: " << opt.sVocab << std::endl;
            std::cout << "Target vocabulary: " << opt.tVocab << std::endl;
            std::cout << "In-domain phrase table: " << opt.iPTable << std::endl;
            std::cout << "Out-of-domain phrase table: " << opt.oPTable << std::endl;
            std::cout << "Output phrase table: " << opt.outName << std::endl;
        }
		std::cout << "Mode: " << opt.mode << std::endl;
	}
	else {
        std::cerr << std::endl << sC << std::endl;
        sD->deleteInstance();
        xOpt->deleteInstance();
		return 1;
	}
    // -----------------------------------------------------

    try {
        // Normal mode
        if (!xOpt->getEval()&& !xOpt->getBp()) {
            int ret = mode->launch();
            
            if (ret == 0) {
                xOpt->deleteInstance();
                sD->deleteInstance();
                return 0;
            }
            else {
                std::cerr << "Something went wrong." << std::endl;
                xOpt->deleteInstance();
                sD->deleteInstance();
                return 1;
            }
        }
        // Eval or BP
        else {
            std::string sortedName = xOpt->getOutName() + ".sorted.gz";
            std::string distName = xOpt->getOutName() + ".dist";
            std::string bpName = xOpt->getOutName() + ".bp";
            
            std::cout << "Sorted output used: " + sortedName << std::endl;
            
            // -----------------------------------------------------
            // Proceed to normal mode if not done before
            if (!boost::filesystem::exists(sortedName)) {
                int ret = mode->launch();
                
                if (ret != 0) {
                    std::cerr << "Something went wrong." << std::endl;
                    xOpt->deleteInstance();
                    sD->deleteInstance();
                    return 1;
                }
            }
            // -----------------------------------------------------
            
            boost::shared_ptr<XenFile> sorted = boost::make_shared<XenFile>();
            sorted->initialize(sortedName);
            
            sD->getXenResult()->initialize(sorted);
            sD->getDevCorp()->initialize(xOpt->getDev(), xOpt->getSLang());
            
            // Eval
            if (xOpt->getEval()) {
                boost::shared_ptr<Eval> ptrEval = boost::make_shared<Eval>();
                
                ptrEval->doEval(100, 0);
                int oldStep = xOpt->getStep();
                xOpt->setStep(2);
                ptrEval->doEval(8, 0);
                xOpt->setStep(oldStep);
                
                XenIO::writeEval(ptrEval->getDist(), distName);
            }
            // BP
            else if (xOpt->getBp()) {
                boost::shared_ptr<Eval> ptrEval;
                
                if (boost::filesystem::exists(distName.c_str()))
                    ptrEval = boost::make_shared<Eval>(distName);
                else {
                    ptrEval = boost::make_shared<Eval>();
                    ptrEval->doEval(100, 0);
                    int oldStep = xOpt->getStep();
                    xOpt->setStep(2);
                    ptrEval->doEval(8, 0);
                    xOpt->setStep(oldStep);
                    XenIO::writeEval(ptrEval->getDist(), distName);
                }
                
                ptrEval->doBP();
                XenIO::writeEval(ptrEval->getDist(), bpName);
            }
            else { return 1; }
        }
    } catch (XenCommon::XenCEption &e) {
        throw;
    }
    
    xOpt->deleteInstance();
    sD->deleteInstance();
	return 0;
}
Ejemplo n.º 9
0
int XenLMken::createLM() {
    XenOption* opt = XenOption::getInstance();

    if (boost::filesystem::exists(lmFile))
        std::cout << "LM file already here, reusing..." << std::endl;
    else {
        lm::builder::PipelineConfig pipeline;

        std::string text, intermediate, arpa;
        std::vector<std::string> discount_fallback;
        discount_fallback.push_back("0.5");
        discount_fallback.push_back("1");
        discount_fallback.push_back("1.5");
        bool verbose_header = true;

        pipeline.order = (size_t) opt->getOrder();
        pipeline.initial_probs.interpolate_unigrams = true;
        pipeline.sort.temp_prefix = temp;
        pipeline.sort.total_memory = opt->getMemPc();
        pipeline.minimum_block = opt->getMinBlk();
        pipeline.sort.buffer_size = opt->getSortBlk();
        pipeline.block_count = 2;
        pipeline.vocab_estimate = 1000000;
        pipeline.prune_vocab_file = ptrVoc->getXenFile()->getFullPath();
        pipeline.prune_vocab = true;
        pipeline.vocab_size_for_unk = 0;
        pipeline.disallowed_symbol_action = lm::THROW_UP;

        lm::builder::Discount dis;
        dis.amount[0] = 0.0;
        for (unsigned i = 0; i < 3; ++i) {
            float discount = boost::lexical_cast<float>(discount_fallback[i < discount_fallback.size() ? i : (discount_fallback.size() - 1)]);
            UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "].");
            dis.amount[i + 1] = discount;
        }

        pipeline.discount.fallback = dis;
        pipeline.discount.bad_action = lm::COMPLAIN;
        pipeline.prune_thresholds.resize(order, 0);

        util::NormalizeTempPrefix(pipeline.sort.temp_prefix);

        lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
        initial.adder_in.total_memory = 32768;
        initial.adder_in.block_count = 2;
        initial.adder_out.total_memory = 32768;
        initial.adder_out.block_count = 2;
        pipeline.read_backoffs = initial.adder_out;

        pipeline.renumber_vocabulary = false;
        pipeline.output_q = false;

        text = textFile;
        arpa = lmFile;

        util::scoped_fd in(util::OpenReadOrThrow(text.c_str()));
        util::scoped_fd out(util::CreateOrThrow(arpa.c_str()));

        try {
            lm::builder::Output output(pipeline.sort.temp_prefix, false, pipeline.output_q);
            output.Add(new lm::builder::PrintHook(out.release(), verbose_header));
            lm::builder::Pipeline(pipeline, in.release(), output);
        } catch (const util::MallocException &e) {
            std::cerr << e.what() << std::endl;
            std::cerr << "Try rerunning with a more conservative -S setting than " << XenCommon::toString(mempc) << std::endl;
        }

        std::cout << "LM estimation done." << std::endl;
    }
    
    return 0;
}