/// Prints the perplexity of a set of sentences under a language model.
///
/// The value written to std::cout is exp(-L / N), where L is the sum of
/// LanguageModel.WordSequenceLoglikelihood() over all sentences and N is the
/// summed sentence length after subtracting, per sentence, the word model
/// context length (WHPYLM order - 1).
///
/// Note: std::setprecision(2) and std::fixed remain set on std::cout after
/// this call.
///
/// @param Sentences      sentences given as sequences of ids
/// @param LanguageModel  model used to score the sentences
void DebugLib::PrintSentencesPerplexity(
    const std::vector<std::vector<int> > &Sentences,
    const NHPYLM &LanguageModel)
{
  const int ContextLength = LanguageModel.GetWHPYLMOrder() - 1;

  double TotalLoglikelihood = 0;
  int ScoredWordCount = 0;
  for (std::vector<std::vector<int> >::const_iterator SentenceIt = Sentences.begin();
       SentenceIt != Sentences.end(); ++SentenceIt) {
    TotalLoglikelihood += LanguageModel.WordSequenceLoglikelihood(*SentenceIt);
    // the leading context entries of a sentence are not counted
    ScoredWordCount += SentenceIt->size() - ContextLength;
  }

  const double Perplexity = exp(-TotalLoglikelihood / ScoredWordCount);
  std::cout << std::setprecision(2) << std::fixed
            << " Perplexity: " << Perplexity
            << std::endl << std::endl;
}
/// Constructs the FST wrapper around a language model.
///
/// All members are set up in the initializer list; the constructor body is
/// empty.
///
/// @param LanguageModel_  language model used to initialise the LanguageModel
///                        member and to query orders and context ids
/// @param SentEndWordId_  id of the sentence end word; the start context is
///                        looked up as (WHPYLM order - 1) repetitions of it
/// @param ActiveWords_    per-word flags used to initialise ActiveWords
NHPYLMFst::NHPYLMFst(const NHPYLM &LanguageModel_, int SentEndWordId_,
                     const vector< bool > &ActiveWords_) :
  LanguageModel(LanguageModel_),
  SentEndWordId(SentEndWordId_),
  CHPYLMOrder(LanguageModel_.GetCHPYLMOrder()),
  WHPYLMOrder(LanguageModel_.GetWHPYLMOrder()),
  // The context length is taken from the constructor argument rather than
  // from the WHPYLMOrder member: members are initialised in declaration
  // order, so reading the member here would only be valid if it is declared
  // before StartContextId.
  StartContextId(LanguageModel_.GetContextId(
      std::vector<int>(LanguageModel_.GetWHPYLMOrder() - 1, SentEndWordId_))),
  FinalContextId(LanguageModel_.GetFinalContextId()),
  FSTProperties(fst::kOEpsilons | fst::kILabelSorted | fst::kOLabelSorted),
  FSTType("vector"),
  ActiveWords(ActiveWords_),
  FallbackSymbolId(PHI_SYMBOLID),
  // one entry per context id in [0, FinalContextId]
  Arcs(LanguageModel_.GetFinalContextId() + 1)
{
}
void DebugLib::GenerateSentencesOfWordsFromWordLM( const NHPYLM &LanguageModel, int SentEndWordId) { // Id2 word sequence vector for printing std::vector<std::string> Id2WordSequenceVector(LanguageModel.GetId2CharacterSequenceVector()); // generate sentences of words from the word language model std::vector<std::vector<int> > GeneratedSentencesFromWHPYLM = LanguageModel.Generate("WHPYLM", 10000, SentEndWordId, NULL); for (auto &Sentence : GeneratedSentencesFromWHPYLM) { for (auto &Word : Sentence) { if (Word == SentEndWordId) { std::cout << std::endl; } else { // std::cout << Id2CharacterSequenceVector.at(Word) << "[" << Word << "] "; std::cout << Id2WordSequenceVector.at(Word) << " "; } } std::cout << std::endl; } }
void DebugLib::PrintLanguageModelStats(const NHPYLM &LanguageModel) { if(LanguageModel.GetCHPYLMOrder() > 0) { std::cout << std::setprecision(2) << " CHPYLM statistics:"; PrintVectorOfInts(LanguageModel.GetTotalCountPerLevelFor("CHPYLM", "Context"), 8, "\n Contexts: ", ""); PrintVectorOfInts(LanguageModel.GetTotalCountPerLevelFor("CHPYLM", "Table"), 8, "\n Tables: ", ""); PrintVectorOfInts(LanguageModel.GetTotalCountPerLevelFor("CHPYLM", "Word"), 8, "\n Characters: ", ""); PrintVectorOfDoubles(LanguageModel.GetNHPYLMParameters().CHPYLMConcentration, 8, "\n Concentration: ", ""); PrintVectorOfDoubles(LanguageModel.GetNHPYLMParameters().CHPYLMDiscount, 8, "\n Discount: ", ""); std::cout << "\n"; } if(LanguageModel.GetWHPYLMOrder() > 0) { std::cout << " WHPYLM statistics:"; PrintVectorOfInts(LanguageModel.GetTotalCountPerLevelFor("WHPYLM", "Context"), 8, "\n Contexts: ", ""); PrintVectorOfInts(LanguageModel.GetTotalCountPerLevelFor("WHPYLM", "Table"), 8, "\n Tables: ", ""); PrintVectorOfInts(LanguageModel.GetTotalCountPerLevelFor("WHPYLM", "Word"), 8, "\n Words: ", ""); PrintVectorOfDoubles(LanguageModel.GetNHPYLMParameters().WHPYLMConcentration, 8, "\n Concentration: ", ""); PrintVectorOfDoubles(LanguageModel.GetNHPYLMParameters().WHPYLMDiscount, 8, "\n Discount: ", ""); std::cout << "\n"; } std::cout << std::endl; }
void DebugLib::GenerateSentencesOfWordsFromCharLM( const NHPYLM &LanguageModel) { // Id2 Character sequence vector for printing std::vector<std::string> Id2CharacterSequenceVector( LanguageModel.GetId2CharacterSequenceVector()); // generate sentences of words from the Character language model std::vector<std::vector<int> > GeneratedSentencesFromCHPYLM = LanguageModel.Generate("CHPYLM", 100000, -1, NULL); for (auto &Sentence : GeneratedSentencesFromCHPYLM) { for (auto &Character : Sentence) { if (Character == UNKEND_SYMBOLID) { std::cout << " "; } else if (Character == SENTEND_SYMBOLID) { std::cout << std::endl; } else { // std::cout << Id2CharacterSequenceVector.at(Character) << "[" << Character << "] "; std::cout << Id2CharacterSequenceVector.at(Character); } } std::cout << std::endl; } }
void DebugLib::PrintTransitions(const ContextToContextTransitions &Transitions, unsigned int CurrentContextId, std::vector< bool > &VisitedContextIds, const NHPYLM &LanguageModel, int SentEndWordId) { if (VisitedContextIds.size() <= CurrentContextId) { VisitedContextIds.resize(CurrentContextId + 1, false); } VisitedContextIds[CurrentContextId] = true; std::cout << "Transition from " << CurrentContextId << ":"; for (unsigned int TransitionIndex = 0; TransitionIndex < Transitions.Words.size(); TransitionIndex++) { std::cout << " [" << Transitions.Words.at(TransitionIndex) << "," << Transitions.NextContextIds.at(TransitionIndex) << "," << Transitions.Probabilities.at(TransitionIndex) << "]"; } std::cout << "Has transition to WordEnd: " << Transitions.HasTransitionToSentEnd << std::endl; for (unsigned int TransitionIndex = 0; TransitionIndex < Transitions.Words.size(); TransitionIndex++) { if ((static_cast<int>(VisitedContextIds.size()) <= Transitions.NextContextIds.at(TransitionIndex)) || !VisitedContextIds[Transitions.NextContextIds.at(TransitionIndex)]) { PrintTransitions(LanguageModel.GetTransitions(Transitions.NextContextIds.at(TransitionIndex), SentEndWordId, std::vector<bool>(LanguageModel.GetMaxNumWords())), Transitions.NextContextIds.at(TransitionIndex), VisitedContextIds, LanguageModel, SentEndWordId); } } }
void LatticeWordSegmentation::SwitchLanguageModelOrders( int NewUnkN, int NewKnownN, int NewAddCharN ) { std::cout << " Switching to KnownN=" << NewKnownN << ", UnkN=" << NewUnkN; if (NewAddCharN > 0) { std::cout << ", AddCharN=" << NewAddCharN; } std::cout << std::endl; // save previous language model and delete additional character language model delete CharacterLanguageModel; CharacterLanguageModel = nullptr; NHPYLM *OldLanguageModel = LanguageModel; std::size_t OldWHPYLMContextLength = WHPYLMContextLength; // instantiate new language model and initialize InitializeLanguageModel(NewUnkN, NewKnownN, NewAddCharN); // parse words in sampled sentences and add to new dictionary // and language model. Also update sentences with new word ids for (auto& Sentence : SampledSentences) { std::size_t TempSampledSentenceSize = Sentence.size() - OldWHPYLMContextLength + WHPYLMContextLength; std::vector<int> TempSampledSentence; TempSampledSentence.reserve(TempSampledSentenceSize); TempSampledSentence.assign(WHPYLMContextLength, SentEndWordId); for (auto Id = Sentence.begin() + OldWHPYLMContextLength; Id != Sentence.end(); ++Id) { WordBeginLengthPair Word = OldLanguageModel->GetWordBeginLength(*Id); TempSampledSentence.push_back( LanguageModel->AddCharacterIdSequenceToDictionary( Word.first, Word.second).first ); } Sentence = TempSampledSentence; LanguageModel->AddWordSequenceToLm(Sentence); if (CharacterLanguageModel != nullptr) { ParseLib::AddWordSequenceToAddCharLM( Sentence.begin() + WHPYLMContextLength, Sentence.size() - WHPYLMContextLength, *LanguageModel, CharacterLanguageModel ); } } // retrain language model for some iterations TrainLanguageModel(SampledSentences, Params.NewLMNumIters); // calculate and update word length statistics WordLengthProbCalculator::UpdateWHPYLMBaseProbabilitiesScale( LanguageModel, Params.WordLengthModulation ); // resample hyperparameters of language model LanguageModel->ResampleHyperParameters(); // cleanup: delete old language model delete 
OldLanguageModel; }