Esempio n. 1
0
/**
 * Default constructor.
 *
 * Reads the n-gram order, the memory setting and the temporary prefix from
 * the XenOption instance returned by XenOption::getInstance(). No files are
 * attached yet: textFile and lmFile start as NULL and pc starts at 0.
 */
XenLMken::XenLMken() {
    // Nothing to read from or write to until file names are assigned.
    textFile = NULL;
    lmFile = NULL;
    pc = 0;

    // Settings taken from the option object.
    XenOption* options = XenOption::getInstance();
    order = static_cast<unsigned int>(options->getOrder());
    mempc = options->getMemPc();
    temp = options->getTemp();
}
Esempio n. 2
0
int XenLMken::createLM() {
    XenOption* opt = XenOption::getInstance();

    if (boost::filesystem::exists(lmFile))
        std::cout << "LM file already here, reusing..." << std::endl;
    else {
        lm::builder::PipelineConfig pipeline;

        std::string text, intermediate, arpa;
        std::vector<std::string> discount_fallback;
        discount_fallback.push_back("0.5");
        discount_fallback.push_back("1");
        discount_fallback.push_back("1.5");
        bool verbose_header = true;

        pipeline.order = (size_t) opt->getOrder();
        pipeline.initial_probs.interpolate_unigrams = true;
        pipeline.sort.temp_prefix = temp;
        pipeline.sort.total_memory = opt->getMemPc();
        pipeline.minimum_block = opt->getMinBlk();
        pipeline.sort.buffer_size = opt->getSortBlk();
        pipeline.block_count = 2;
        pipeline.vocab_estimate = 1000000;
        pipeline.prune_vocab_file = ptrVoc->getXenFile()->getFullPath();
        pipeline.prune_vocab = true;
        pipeline.vocab_size_for_unk = 0;
        pipeline.disallowed_symbol_action = lm::THROW_UP;

        lm::builder::Discount dis;
        dis.amount[0] = 0.0;
        for (unsigned i = 0; i < 3; ++i) {
            float discount = boost::lexical_cast<float>(discount_fallback[i < discount_fallback.size() ? i : (discount_fallback.size() - 1)]);
            UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "].");
            dis.amount[i + 1] = discount;
        }

        pipeline.discount.fallback = dis;
        pipeline.discount.bad_action = lm::COMPLAIN;
        pipeline.prune_thresholds.resize(order, 0);

        util::NormalizeTempPrefix(pipeline.sort.temp_prefix);

        lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs;
        initial.adder_in.total_memory = 32768;
        initial.adder_in.block_count = 2;
        initial.adder_out.total_memory = 32768;
        initial.adder_out.block_count = 2;
        pipeline.read_backoffs = initial.adder_out;

        pipeline.renumber_vocabulary = false;
        pipeline.output_q = false;

        text = textFile;
        arpa = lmFile;

        util::scoped_fd in(util::OpenReadOrThrow(text.c_str()));
        util::scoped_fd out(util::CreateOrThrow(arpa.c_str()));

        try {
            lm::builder::Output output(pipeline.sort.temp_prefix, false, pipeline.output_q);
            output.Add(new lm::builder::PrintHook(out.release(), verbose_header));
            lm::builder::Pipeline(pipeline, in.release(), output);
        } catch (const util::MallocException &e) {
            std::cerr << e.what() << std::endl;
            std::cerr << "Try rerunning with a more conservative -S setting than " << XenCommon::toString(mempc) << std::endl;
        }

        std::cout << "LM estimation done." << std::endl;
    }
    
    return 0;
}