/**
 * Default constructor.
 *
 * Starts with no text file and no LM file attached (both pointers NULL) and
 * pc at zero, then copies the n-gram order, the memory setting and the
 * temporary-file prefix from the XenOption singleton.
 */
XenLMken::XenLMken() {
    // Nothing is attached to this object yet.
    textFile = NULL;
    lmFile = NULL;
    pc = 0;

    // Snapshot the settings this object needs from the global options.
    XenOption* options = XenOption::getInstance();
    order = static_cast<unsigned int>(options->getOrder());
    mempc = options->getMemPc();
    temp = options->getTemp();
}
int XenLMken::createLM() { XenOption* opt = XenOption::getInstance(); if (boost::filesystem::exists(lmFile)) std::cout << "LM file already here, reusing..." << std::endl; else { lm::builder::PipelineConfig pipeline; std::string text, intermediate, arpa; std::vector<std::string> discount_fallback; discount_fallback.push_back("0.5"); discount_fallback.push_back("1"); discount_fallback.push_back("1.5"); bool verbose_header = true; pipeline.order = (size_t) opt->getOrder(); pipeline.initial_probs.interpolate_unigrams = true; pipeline.sort.temp_prefix = temp; pipeline.sort.total_memory = opt->getMemPc(); pipeline.minimum_block = opt->getMinBlk(); pipeline.sort.buffer_size = opt->getSortBlk(); pipeline.block_count = 2; pipeline.vocab_estimate = 1000000; pipeline.prune_vocab_file = ptrVoc->getXenFile()->getFullPath(); pipeline.prune_vocab = true; pipeline.vocab_size_for_unk = 0; pipeline.disallowed_symbol_action = lm::THROW_UP; lm::builder::Discount dis; dis.amount[0] = 0.0; for (unsigned i = 0; i < 3; ++i) { float discount = boost::lexical_cast<float>(discount_fallback[i < discount_fallback.size() ? 
i : (discount_fallback.size() - 1)]); UTIL_THROW_IF(discount < 0.0 || discount > static_cast<float>(i+1), util::Exception, "The discount for count " << (i+1) << " was parsed as " << discount << " which is not in the range [0, " << (i+1) << "]."); dis.amount[i + 1] = discount; } pipeline.discount.fallback = dis; pipeline.discount.bad_action = lm::COMPLAIN; pipeline.prune_thresholds.resize(order, 0); util::NormalizeTempPrefix(pipeline.sort.temp_prefix); lm::builder::InitialProbabilitiesConfig &initial = pipeline.initial_probs; initial.adder_in.total_memory = 32768; initial.adder_in.block_count = 2; initial.adder_out.total_memory = 32768; initial.adder_out.block_count = 2; pipeline.read_backoffs = initial.adder_out; pipeline.renumber_vocabulary = false; pipeline.output_q = false; text = textFile; arpa = lmFile; util::scoped_fd in(util::OpenReadOrThrow(text.c_str())); util::scoped_fd out(util::CreateOrThrow(arpa.c_str())); try { lm::builder::Output output(pipeline.sort.temp_prefix, false, pipeline.output_q); output.Add(new lm::builder::PrintHook(out.release(), verbose_header)); lm::builder::Pipeline(pipeline, in.release(), output); } catch (const util::MallocException &e) { std::cerr << e.what() << std::endl; std::cerr << "Try rerunning with a more conservative -S setting than " << XenCommon::toString(mempc) << std::endl; } std::cout << "LM estimation done." << std::endl; } return 0; }