Ejemplo n.º 1
0
// Prints the perplexity of a set of sentences under the given language model.
//
// The log-likelihoods returned by WordSequenceLoglikelihood() are summed over
// all sentences and divided by the total number of counted ids; the value
// exp(-sum / count) is written to std::cout with two fixed decimals.
//
// Sentences      sentences as sequences of word ids
// LanguageModel  model used to score the sentences
//
// Each sentence contributes (size - (word model order - 1)) ids to the count;
// presumably the subtracted part is the leading context padding -- TODO confirm.
void DebugLib::PrintSentencesPerplexity(const std::vector<std::vector<int> > &Sentences, const NHPYLM &LanguageModel)
{
  int ContextLength = LanguageModel.GetWHPYLMOrder() - 1;

  double TotalLoglikelihood = 0;
  int WordCount = 0;
  for (auto SentenceIt = Sentences.begin(); SentenceIt != Sentences.end(); ++SentenceIt) {
    TotalLoglikelihood += LanguageModel.WordSequenceLoglikelihood(*SentenceIt);
    WordCount += SentenceIt->size() - ContextLength;
  }

  // average negative log-likelihood per counted id, mapped back with exp()
  const double Perplexity = exp(-TotalLoglikelihood / WordCount);
  std::cout << std::setprecision(2) << std::fixed;
  std::cout << " Perplexity: " << Perplexity << std::endl << std::endl;
}
Ejemplo n.º 2
0
// Constructs the FST wrapper around a nested HPYLM language model.
//
// LanguageModel_   model whose orders and context ids are queried here
// SentEndWordId_   word id used as the sentence end symbol; the start context
//                  is looked up as (word model order - 1) repetitions of it
// ActiveWords_     stored in ActiveWords as given
//
// Arcs is constructed with (final context id + 1) as its argument.
NHPYLMFst::NHPYLMFst(const NHPYLM &LanguageModel_, int SentEndWordId_, const vector< bool > &ActiveWords_) :
  LanguageModel(LanguageModel_),
  SentEndWordId(SentEndWordId_),
  CHPYLMOrder(LanguageModel_.GetCHPYLMOrder()),
  WHPYLMOrder(LanguageModel_.GetWHPYLMOrder()),
  // The context length is taken from the constructor argument, not from the
  // WHPYLMOrder member: members are initialized in declaration order, so
  // reading the member here would only be valid if it is declared before
  // StartContextId in the class.
  StartContextId(LanguageModel_.GetContextId(std::vector<int>(LanguageModel_.GetWHPYLMOrder() - 1, SentEndWordId_))),
  FinalContextId(LanguageModel_.GetFinalContextId()),
  FSTProperties(fst::kOEpsilons | fst::kILabelSorted | fst::kOLabelSorted),
  FSTType("vector"),
  ActiveWords(ActiveWords_),
  FallbackSymbolId(PHI_SYMBOLID),
  Arcs(LanguageModel_.GetFinalContextId() + 1)
{
}
Ejemplo n.º 3
0
// Samples sentences from the word level model ("WHPYLM") and prints them to
// std::cout, one blank-separated string per generated word id.
//
// LanguageModel  model to sample from and to look up printable strings in
// SentEndWordId  id passed to Generate() as sentence end; when it shows up in
//                the output it is printed as a line break instead of a word
void DebugLib::GenerateSentencesOfWordsFromWordLM(
  const NHPYLM &LanguageModel, int SentEndWordId)
{
  // table from id to printable string
  const std::vector<std::string> IdToString = LanguageModel.GetId2CharacterSequenceVector();

  // draw the sample; 10000 is the count argument handed to Generate()
  const std::vector<std::vector<int> > Samples = LanguageModel.Generate("WHPYLM", 10000, SentEndWordId, NULL);

  for (const std::vector<int> &Sample : Samples) {
    for (const int WordId : Sample) {
      if (WordId != SentEndWordId) {
        std::cout << IdToString.at(WordId) << " ";
        continue;
      }
      // sentence end symbol: break the line
      std::cout << std::endl;
    }
    std::cout << std::endl;
  }
}
Ejemplo n.º 4
0
// Writes statistics of the language model to std::cout.
//
// For the character model ("CHPYLM") and the word model ("WHPYLM"), each only
// if its order is greater than zero, the following lines are printed:
// total counts per level for "Context", "Table" and "Word", followed by the
// concentration and discount vectors taken from GetNHPYLMParameters().
//
// LanguageModel  model whose statistics are printed
void DebugLib::PrintLanguageModelStats(const NHPYLM &LanguageModel)
{
  // prints one line of per-level totals for the given model and count kind
  const auto PrintCounts = [&](const char *Model, const char *Kind, const char *Label) {
    PrintVectorOfInts(LanguageModel.GetTotalCountPerLevelFor(Model, Kind), 8, Label, "");
  };

  const bool HasCharacterModel = LanguageModel.GetCHPYLMOrder() > 0;
  if (HasCharacterModel) {
    // precision is only set here, not in the word model branch
    std::cout << std::setprecision(2) << " CHPYLM statistics:";
    PrintCounts("CHPYLM", "Context", "\n  Contexts:      ");
    PrintCounts("CHPYLM", "Table",   "\n  Tables:        ");
    PrintCounts("CHPYLM", "Word",    "\n  Characters:    ");
    const auto &Parameters = LanguageModel.GetNHPYLMParameters();
    PrintVectorOfDoubles(Parameters.CHPYLMConcentration, 8, "\n  Concentration: ", "");
    PrintVectorOfDoubles(Parameters.CHPYLMDiscount,      8, "\n  Discount:      ", "");
    std::cout << "\n";
  }

  const bool HasWordModel = LanguageModel.GetWHPYLMOrder() > 0;
  if (HasWordModel) {
    std::cout << " WHPYLM statistics:";
    PrintCounts("WHPYLM", "Context", "\n  Contexts:      ");
    PrintCounts("WHPYLM", "Table",   "\n  Tables:        ");
    PrintCounts("WHPYLM", "Word",    "\n  Words:         ");
    const auto &Parameters = LanguageModel.GetNHPYLMParameters();
    PrintVectorOfDoubles(Parameters.WHPYLMConcentration, 8, "\n  Concentration: ", "");
    PrintVectorOfDoubles(Parameters.WHPYLMDiscount,      8, "\n  Discount:      ", "");
    std::cout << "\n";
  }

  std::cout << std::endl;
}
Ejemplo n.º 5
0
// Samples character ids from the character level model ("CHPYLM") and prints
// the result to std::cout.
//
// UNKEND_SYMBOLID is printed as a blank, SENTEND_SYMBOLID as a line break;
// every other id is printed as its string from the model's id-to-string
// table, without a separator.
//
// LanguageModel  model to sample from and to look up printable strings in
void DebugLib::GenerateSentencesOfWordsFromCharLM(
  const NHPYLM &LanguageModel)
{
  // table from id to printable string
  const std::vector<std::string> IdToString =
    LanguageModel.GetId2CharacterSequenceVector();

  // draw the sample; 100000 and -1 are handed to Generate() unchanged
  const std::vector<std::vector<int> > Samples = LanguageModel.Generate("CHPYLM", 100000, -1, NULL);

  for (const std::vector<int> &Sample : Samples) {
    for (const int CharacterId : Sample) {
      if (CharacterId == UNKEND_SYMBOLID) {
        std::cout << " ";
        continue;
      }
      if (CharacterId == SENTEND_SYMBOLID) {
        std::cout << std::endl;
        continue;
      }
      std::cout << IdToString.at(CharacterId);
    }
    std::cout << std::endl;
  }
}
Ejemplo n.º 6
0
// Prints the transitions leaving one context and then recurses into every
// target context that has not been printed yet.
//
// Transitions        transitions leaving CurrentContextId (Words,
//                    NextContextIds and Probabilities are read by index)
// CurrentContextId   id of the context the transitions belong to
// VisitedContextIds  in/out marker per context id; grown on demand and used to
//                    stop the recursion at contexts that were already printed
// LanguageModel      queried for the transitions of the target contexts
// SentEndWordId      forwarded to GetTransitions() and to the recursive calls
void DebugLib::PrintTransitions(const ContextToContextTransitions &Transitions, unsigned int CurrentContextId, std::vector< bool > &VisitedContextIds, const NHPYLM &LanguageModel, int SentEndWordId)
{
  // mark this context as visited, enlarging the marker vector if necessary
  if (CurrentContextId >= VisitedContextIds.size()) {
    VisitedContextIds.resize(CurrentContextId + 1, false);
  }
  VisitedContextIds[CurrentContextId] = true;

  // one "[word,next context,probability]" entry per transition
  std::cout << "Transition from " << CurrentContextId << ":";
  for (unsigned int Index = 0; Index < Transitions.Words.size(); ++Index) {
    std::cout << " [" << Transitions.Words.at(Index)
              << "," << Transitions.NextContextIds.at(Index)
              << "," << Transitions.Probabilities.at(Index) << "]";
  }
  std::cout << "Has transition to WordEnd: " << Transitions.HasTransitionToSentEnd << std::endl;

  // depth first descent into target contexts that were not printed so far
  for (unsigned int Index = 0; Index < Transitions.Words.size(); ++Index) {
    const auto NextContextId = Transitions.NextContextIds.at(Index);
    const bool AlreadyVisited =
      (NextContextId < static_cast<int>(VisitedContextIds.size())) && VisitedContextIds[NextContextId];
    if (AlreadyVisited) {
      continue;
    }
    PrintTransitions(
      LanguageModel.GetTransitions(NextContextId, SentEndWordId, std::vector<bool>(LanguageModel.GetMaxNumWords())),
      NextContextId, VisitedContextIds, LanguageModel, SentEndWordId);
  }
}
void LatticeWordSegmentation::SwitchLanguageModelOrders(
  int NewUnkN,
  int NewKnownN,
  int NewAddCharN
)
{
  std::cout << " Switching to KnownN=" << NewKnownN
            << ", UnkN=" << NewUnkN;
  if (NewAddCharN > 0) {
    std::cout << ", AddCharN=" << NewAddCharN;
  }
  std::cout << std::endl;

  // save previous language model and delete additional character language model
  delete CharacterLanguageModel;
  CharacterLanguageModel = nullptr;
  NHPYLM *OldLanguageModel = LanguageModel;
  std::size_t OldWHPYLMContextLength = WHPYLMContextLength;

  // instantiate new language model and initialize
  InitializeLanguageModel(NewUnkN, NewKnownN, NewAddCharN);

  // parse words in sampled sentences and add to new dictionary
  // and language model. Also update sentences with new word ids
  for (auto& Sentence : SampledSentences) {
    std::size_t TempSampledSentenceSize =
      Sentence.size() - OldWHPYLMContextLength + WHPYLMContextLength;
    std::vector<int> TempSampledSentence;
    TempSampledSentence.reserve(TempSampledSentenceSize);
    TempSampledSentence.assign(WHPYLMContextLength, SentEndWordId);
    for (auto Id = Sentence.begin() + OldWHPYLMContextLength;
         Id != Sentence.end(); ++Id) {
      WordBeginLengthPair Word = OldLanguageModel->GetWordBeginLength(*Id);
      TempSampledSentence.push_back(
        LanguageModel->AddCharacterIdSequenceToDictionary(
          Word.first, Word.second).first
      );
    }
    Sentence = TempSampledSentence;

    LanguageModel->AddWordSequenceToLm(Sentence);
    if (CharacterLanguageModel != nullptr) {
      ParseLib::AddWordSequenceToAddCharLM(
        Sentence.begin() + WHPYLMContextLength,
        Sentence.size() - WHPYLMContextLength,
        *LanguageModel,
        CharacterLanguageModel
      );
    }
  }

  // retrain language model for some iterations
  TrainLanguageModel(SampledSentences, Params.NewLMNumIters);

  // calculate and update word length statistics
  WordLengthProbCalculator::UpdateWHPYLMBaseProbabilitiesScale(
    LanguageModel,
    Params.WordLengthModulation
  );

  // resample hyperparameters of language model
  LanguageModel->ResampleHyperParameters();

  // cleanup: delete old language model
  delete OldLanguageModel;
}