void
NgramModel::_LoadTopicProbs2(vector<DoubleVector> &topicProbVectors,
                             ZFile &hmmldaFile, size_t maxSize) const {
    assert(maxSize <= size());
    vector<CountVector> countVectors(maxSize);
    topicProbVectors.resize(maxSize);
    for (size_t o = 0; o < maxSize; ++o) {
        countVectors[o].resize(sizes(o), 0);
        topicProbVectors[o].resize(sizes(o), 0);
    }

    // Accumulate counts for each n-gram in the HMM-LDA annotation file.
    size_t      numSentenceWords = 1;
    IndexVector hists(maxSize, 0);
    char        line[MAXLINE];
    char        wordStr[1024];
    VocabIndex  word;
    int         state, topic;
    while (hmmldaFile.getLine(line, MAXLINE)) {
        if (line[0] == '#') continue;  // Skip comment lines.
        int numItems = sscanf(line, "%s\t%d\t%d\n", wordStr, &state, &topic);
        if (numItems != 3)
            throw std::invalid_argument("Bad format");
        word = _vocab.Find(wordStr);
        numSentenceWords++;
        NgramIndex hist = 0, index;
        for (size_t j = 1; j <= std::min(numSentenceWords, maxSize - 1); j++) {
            index = _vectors[j].Find(hist, word);
            if (index == -1) {
                Logger::Warn(1, "Feature skipped.\n");
            } else {
                countVectors[j - 1][hist]++;
                if (state == 1)
                    topicProbVectors[j - 1][hist]++;
            }
            hist = hists[j];
            hists[j] = index;
        }
        if (word == Vocab::EndOfSentence)
            numSentenceWords = 1;
    }

    // Finalize probability computation.
    for (size_t o = 0; o < maxSize; o++) {
        for (size_t i = 0; i < countVectors[o].length(); i++) {
            if (countVectors[o][i] == 0)
                topicProbVectors[o][i] = 0.0;
            else
                topicProbVectors[o][i] /= countVectors[o][i];
        }
    }
}
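// Input format assumed by _LoadTopicProbs2, inferred from the sscanf pattern
// above: one "word<TAB>state<TAB>topic" triple per line, with '#' marking
// comment lines. A hypothetical example (the tool that emits these
// annotations is not shown here):
//
//   # word  state  topic
//   the     1      4
//   cat     2      0
//   </s>    1      7
//
// The resulting topicProbVectors[o][h] is the fraction of words that follow
// history h (among those forming n-grams known to the model) whose HMM state
// equals 1.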
void
NgramModel::LoadFeatures(vector<DoubleVector> &featureVectors,
                         ZFile &featureFile, size_t maxOrder) const {
    if (featureFile == NULL) throw std::invalid_argument("Invalid file");

    // Allocate space for feature vectors.
    if (maxOrder == 0 || maxOrder > size() - 1)
        maxOrder = size() - 1;
    featureVectors.resize(maxOrder + 1);
    for (size_t i = 0; i <= maxOrder; i++)
        featureVectors[i].reset(sizes(i), 0);  // Initialize to 0.

    // Load feature value for each n-gram in feature file.
    char line[MAXLINE];
    vector<VocabIndex> words(256);
    while (featureFile.getLine(line, MAXLINE)) {
        if (line[0] == '\0' || line[0] == '#') continue;
        words.clear();
        char *p = &line[0];
        while (true) {
            const char *token = p;
            while (*p != 0 && !isspace(*p)) ++p;
            if (*p == 0) {
                // Last token in line: Set feature value.
                NgramIndex index = 0;
                for (size_t i = 1; i <= words.size(); i++)
                    index = _vectors[i].Find(index, words[i - 1]);
                if (index == -1)
                    Logger::Warn(1, "Feature skipped.\n");
                else
                    featureVectors[words.size()][index] = atof(token);
                break;  // Move to next line.
            }
            // Not the last token: Lookup word index and add to words.
            size_t len = p - token;
            *p++ = 0;
            if (len > 0)
                words.push_back(_vocab.Find(token, len));
            if (words.size() > maxOrder)
                break;
        }
    }
}
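// Feature file format accepted by LoadFeatures, inferred from the parser
// above: each line lists the words of an n-gram followed by its numeric
// feature value; blank lines and lines starting with '#' are skipped.
// A hypothetical example for a trigram model:
//
//   # n-gram        value
//   the             0.35
//   the cat         0.12
//   the cat sat     0.07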
double
PerplexityOptimizer::ShortCorpusComputeEntropy(ZFile &corpusFile,
                                               const ParamVector &params) {
    if (corpusFile == NULL) throw std::invalid_argument("Invalid file");

    size_t size = _lm._pModel->size();
    // BitVector vocabMask(size, 1);

    // Accumulate counts of prob/bow for computing perplexity of corpusFilename.
    char   line[MAXLINE];
    size_t numOOV = 0;
    vector<VocabIndex> words(256);
    _totLogProb   = 0.0;
    _numZeroProbs = 0;
    _numWords     = 0;
    while (corpusFile.getLine(line, MAXLINE)) {
        if (strncmp(line, "<DOC ", 5) == 0 || strcmp(line, "</DOC>") == 0)
            continue;
        // Logger::Log(0, "Additional Input:%s\n", line);

        // Lookup vocabulary indices for each word in the line.
        words.clear();
        // words.push_back(Vocab::EndOfSentence);
        char *p = &line[0];
        while (*p != 0) {
            while (isspace(*p)) ++p;  // Skip consecutive spaces.
            const char *token = p;
            while (*p != 0 && !isspace(*p)) ++p;
            size_t len = p - token;
            if (*p != 0) *p++ = 0;
            words.push_back(_lm.vocab().Find(token, len));
        }
        // words.push_back(Vocab::EndOfSentence);

        // Add each top order n-gram.
        size_t ngramOrder = std::min((size_t)2, size - 1);
        for (size_t i = 1; i < words.size(); i++) {
            if (words[i] == Vocab::Invalid) {
                // OOV word encountered.  Reset order to unigrams.
                ngramOrder = 1;
                numOOV++;
            } else {
                NgramIndex index;
                size_t     boOrder = ngramOrder;
                while ((index = _lm._pModel->_Find(&words[i - boOrder + 1],
                                                   boOrder)) == -1) {
                    --boOrder;
                    NgramIndex hist = _lm._pModel->_Find(&words[i - boOrder],
                                                         boOrder);
                    if (hist != (NgramIndex)-1 &&
                        (_lm.bows(boOrder))[hist] != 0) {
                        // _bowCountVectors[boOrder][hist]++;
                        _totLogProb += log((_lm.bows(boOrder))[hist]);
                    }
                }
                ngramOrder = std::min(ngramOrder + 1, size - 1);
                // _probCountVectors[boOrder][index]++;
                if ((_lm.probs(boOrder))[index] == 0)
                    _numZeroProbs++;
                else
                    _totLogProb += log((_lm.probs(boOrder))[index]);
                _numWords++;
            }
        }
    }

    double entropy = -_totLogProb / (_numWords - _numZeroProbs);
    // std::cout << -_totLogProb << "\t" << _numWords << "\t"
    //           << _numZeroProbs << std::endl;
    if (Logger::GetVerbosity() > 2)
        std::cout << exp(entropy) << "\t" << params << std::endl;
    else
        Logger::Log(2, "%f\n", exp(entropy));
    return std::isnan(entropy) ? 70 : entropy;
}
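// A minimal, self-contained sketch (not part of the toolkit) of the relation
// finalized above: entropy is the negated mean log-probability over the
// scored words, and perplexity is exp(entropy). The probabilities below are
// hypothetical.
static double ExamplePerplexity() {
    const double probs[4] = { 0.25, 0.10, 0.05, 0.20 };
    double totLogProb = 0.0;
    for (int i = 0; i < 4; ++i)
        totLogProb += log(probs[i]);   // accumulate log p(w_i | h_i)
    double entropy = -totLogProb / 4;  // nats per word
    return exp(entropy);               // perplexity = exp(entropy)
}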
void
NgramModel::_LoadEntropy(vector<DoubleVector> &entropyVectors,
                         ZFile &corpusFile, size_t maxSize) const {
    if (corpusFile == NULL) throw std::invalid_argument("Invalid file");

    // Resize vectors and allocate counts.
    if (maxSize == 0 || maxSize > size())
        maxSize = size();
    int numDocs = 0;
    vector<CountVector> countVectors(maxSize);
    vector<CountVector> totCountVectors(maxSize);
    entropyVectors.resize(maxSize);
    for (size_t o = 0; o < maxSize; ++o) {
        countVectors[o].resize(sizes(o), 0);
        totCountVectors[o].resize(sizes(o), 0);
        entropyVectors[o].resize(sizes(o), 0);
    }

    // Accumulate counts for each n-gram in corpus file.
    char line[MAXLINE];
    vector<VocabIndex> words(256);
    vector<NgramIndex> hists(maxSize);
    while (corpusFile.getLine(line, MAXLINE)) {
        if (strcmp(line, "</DOC>") == 0) {
            // End of document: fold this document's counts into the totals.
            numDocs++;
            for (size_t o = 1; o < countVectors.size(); ++o) {
                for (size_t i = 0; i < countVectors[o].length(); ++i) {
                    int c = countVectors[o][i];
                    if (c > 0) {
                        totCountVectors[o][i] += c;
                        entropyVectors[o][i]  += c * log(c);
                        countVectors[o][i]     = 0;
                    }
                }
            }
            continue;
        } else if (strncmp(line, "<DOC ", 5) == 0)
            continue;

        // Lookup vocabulary indices for each word in the line.
        words.clear();
        words.push_back(Vocab::EndOfSentence);
        char *p = &line[0];
        while (*p != '\0') {
            while (isspace(*p)) ++p;  // Skip consecutive spaces.
            const char *token = p;
            while (*p != 0 && !isspace(*p)) ++p;
            size_t len = p - token;
            if (*p != 0) *p++ = 0;
            words.push_back(_vocab.Find(token, len));
        }
        words.push_back(Vocab::EndOfSentence);

        // Add each order n-gram.
        hists[1] = _vectors[1].Find(0, Vocab::EndOfSentence);
        for (size_t i = 1; i < words.size(); ++i) {
            VocabIndex word = words[i];
            NgramIndex hist = 0;
            for (size_t j = 1; j < std::min(i + 2, maxSize); ++j) {
                if (word != Vocab::Invalid) {
                    NgramIndex index = _vectors[j].Find(hist, word);
                    if (index >= 0)
                        countVectors[j][index]++;
                    else
                        Logger::Warn(1, "DocFreq feature skipped.\n");
                    hist = hists[j];
                    hists[j] = index;
                } else {
                    hist = hists[j];
                    hists[j] = NgramVector::Invalid;
                }
            }
        }
    }

    // Finalize entropy computation.
    double invLogNumDocs = 1.0 / log((double)numDocs);
    for (size_t o = 1; o < maxSize; o++)
        entropyVectors[o] = CondExpr(
            totCountVectors[o] == 0, 0.0,
            ((entropyVectors[o] / -totCountVectors[o]) +
             log(asDouble(totCountVectors[o]))) * invLogNumDocs);
}
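// Sketch of the algebra behind the finalization above: with per-document
// counts c_d of an n-gram, total count C = sum_d c_d, and D documents, the
// document-frequency entropy is
//   H = -sum_d (c_d / C) log(c_d / C) = -(1/C) sum_d c_d log c_d + log C,
// which is exactly (entropyVectors / -totCountVectors) + log(totCountVectors)
// given the accumulated sum_d c_d log c_d. Multiplying by invLogNumDocs
// (1 / log D) normalizes H to [0, 1], since an n-gram spread uniformly over
// all D documents attains the maximum H = log D.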
void
NgramModel::LoadCorpus(vector<CountVector> &countVectors,
                       ZFile &corpusFile, bool reset) {
    if (corpusFile == NULL)
        throw std::invalid_argument("Invalid file (corpusFile is NULL)");

    // Resize vectors and allocate counts.
    countVectors.resize(size());
    countVectors[0].resize(1, 0);
    for (size_t o = 1; o < size(); ++o) {
        size_t capacity = std::max(1ul << 16, nextPowerOf2(_vectors[o].size()));
        if (reset)
            countVectors[o].reset(capacity, 0);
        else
            countVectors[o].resize(capacity, 0);
    }

    // Accumulate counts for each n-gram in corpus file.
    char line[MAXLINE];
    vector<VocabIndex> words(256);
    vector<NgramIndex> hists(size(), -1);
    while (corpusFile.getLine(line, MAXLINE)) {
        // Lookup vocabulary indices for each word in the line.
        words.clear();
        words.push_back(Vocab::EndOfSentence);
        char *p = &line[0];
        while (*p != '\0') {
            while (isspace(*p)) ++p;  // Skip consecutive spaces.
            const char *token = p;
            while (*p != 0 && !isspace(*p)) ++p;
            size_t len = p - token;
            if (*p != 0) *p++ = 0;
            words.push_back(_vocab.Add(token, len));
        }
        words.push_back(Vocab::EndOfSentence);

        // Add each order n-gram.
        hists[1] = _vectors[1].Add(0, Vocab::EndOfSentence);
        for (size_t i = 1; i < words.size(); ++i) {
            VocabIndex word = words[i];
            NgramIndex hist = 0;
            for (size_t j = 1; j < std::min(i + 2, size()); ++j) {
                if (word != Vocab::Invalid && hist != NgramVector::Invalid) {
                    bool newNgram;
                    NgramIndex index = _vectors[j].Add(hist, word, &newNgram);
                    if (newNgram && (size_t)index >= countVectors[j].length())
                        countVectors[j].resize(countVectors[j].length() * 2, 0);
                    countVectors[j][index]++;
                    hist = hists[j];
                    hists[j] = index;
                } else {
                    hist = hists[j];
                    hists[j] = NgramVector::Invalid;
                }
            }
        }
    }

    // Add remaining vocabulary, if necessary.
    if (_vectors[1].size() != _vocab.size()) {
        for (VocabIndex i = 0; i < (VocabIndex)_vocab.size(); ++i)
            _vectors[1].Add(0, i);
        countVectors[1].resize(_vocab.size(), 0);
    }

    // Sort and resize counts to actual size.
    VocabVector vocabMap;
    IndexVector ngramMap(1, 0), boNgramMap;
    _vocab.Sort(vocabMap);
    for (size_t o = 0; o < size(); ++o) {
        boNgramMap.swap(ngramMap);
        if (_vectors[o].Sort(vocabMap, boNgramMap, ngramMap))
            NgramModel::ApplySort(ngramMap, countVectors[o]);
        else
            countVectors[o].resize(ngramMap.length());
    }
    _ComputeBackoffs();
}
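// Usage sketch (hypothetical file name; NgramModel/ZFile construction as
// assumed from how they are used elsewhere in this toolkit):
//
//   NgramModel          model(3);            // trigram model
//   ZFile               corpusFile("train.txt", "r");
//   vector<CountVector> counts;
//   model.LoadCorpus(counts, corpusFile);    // counts[o][i] = count of the
//                                            // i-th o-gram after sorting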
void
NgramModel::LoadEvalCorpus(vector<CountVector> &probCountVectors,
                           vector<CountVector> &bowCountVectors,
                           BitVector &vocabMask, ZFile &corpusFile,
                           size_t &outNumOOV, size_t &outNumWords) const {
    if (corpusFile == NULL) throw std::invalid_argument("Invalid file");

    // Allocate count vectors.
    probCountVectors.resize(size());
    bowCountVectors.resize(size() - 1);
    for (size_t i = 0; i < size(); i++)
        probCountVectors[i].reset(_vectors[i].size(), 0);
    for (size_t i = 0; i < size() - 1; i++)
        bowCountVectors[i].reset(_vectors[i].size(), 0);

    // Accumulate counts of prob/bow for computing perplexity of corpusFilename.
    char   line[MAXLINE];
    size_t numOOV   = 0;
    size_t numWords = 0;
    vector<VocabIndex> words(256);
    while (corpusFile.getLine(line, MAXLINE)) {
        if (strncmp(line, "<DOC ", 5) == 0 || strcmp(line, "</DOC>") == 0)
            continue;
        // Logger::Log(0, "Additional Input:%s\n", line);

        // Lookup vocabulary indices for each word in the line.
        words.clear();
        // words.push_back(Vocab::EndOfSentence);
        char *p = &line[0];
        while (*p != 0) {
            while (isspace(*p)) ++p;  // Skip consecutive spaces.
            const char *token = p;
            while (*p != 0 && !isspace(*p)) ++p;
            size_t len = p - token;
            if (*p != 0) *p++ = 0;
            words.push_back(_vocab.Find(token, len));
        }
        // words.push_back(Vocab::EndOfSentence);

        // Add each top order n-gram.
        size_t ngramOrder = std::min((size_t)2, size() - 1);
        for (size_t i = 1; i < words.size(); i++) {
            if (words[i] == Vocab::Invalid || !vocabMask[words[i]]) {
                // OOV word encountered.  Reset order to unigrams.
                ngramOrder = 1;
                numOOV++;
            } else {
                NgramIndex index;
                size_t     boOrder = ngramOrder;
                while ((index = _Find(&words[i - boOrder + 1], boOrder)) == -1) {
                    --boOrder;
                    NgramIndex hist = _Find(&words[i - boOrder], boOrder);
                    if (hist != (NgramIndex)-1)
                        bowCountVectors[boOrder][hist]++;
                }
                ngramOrder = std::min(ngramOrder + 1, size() - 1);
                probCountVectors[boOrder][index]++;
                numWords++;
            }
        }
    }
    outNumOOV   = numOOV;
    outNumWords = numWords;
}
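// Note on the backoff loop above: when the ngramOrder-gram ending at word i
// is unseen, the order is lowered one step at a time, and each existing
// shortened history has its backoff-weight usage counted in
// bowCountVectors[boOrder][hist]. A later perplexity computation can then be
// assembled as sum(probCounts * log(probs)) + sum(bowCounts * log(bows)),
// the same accumulation that ShortCorpusComputeEntropy performs inline.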
void
NgramModel::LoadLM(vector<ProbVector> &probVectors,
                   vector<ProbVector> &bowVectors, ZFile &lmFile) {
    if (lmFile == NULL) throw std::invalid_argument("Invalid file");

    // Read ARPA LM header.
    char line[MAXLINE];
    size_t o, len;
    vector<size_t> ngramLengths(1);
    while (lmFile.getLine(line, MAXLINE) && strcmp(line, "\\data\\") != 0)
        /* NOP */;
    while (lmFile.getLine(line, MAXLINE)) {
        unsigned int o, len;
        if (sscanf(line, "ngram %u=%u", &o, &len) != 2)
            break;
        assert(o == ngramLengths.size());
        ngramLengths.push_back(len);
    }

    // Allocate buffers and read counts.
    _vocab.Reserve(ngramLengths[1]);
    probVectors.resize(size());
    probVectors[0].resize(1, 0.0);
    bowVectors.resize(size() - 1);
    bowVectors[0].resize(1, 0.0);
    for (o = 1; o < size(); o++) {
        bool        hasBow = (o < size() - 1);
        ProbVector &probs  = probVectors[o];
        // Avoid binding a reference past the end of bowVectors at the top
        // order, where no backoff weights exist.
        ProbVector &bows   = bowVectors[hasBow ? o : 0];

        // Preallocate buffer for n-grams.
        _vectors[o].Reserve(ngramLengths[o]);
        probs.reset(ngramLengths[o]);
        if (hasBow) bows.reset(ngramLengths[o]);

        lmFile.getLine(line, MAXLINE);
        unsigned int i;
        if (sscanf(line, "\\%u-ngrams:", &i) != 1 || i != o)
            throw std::invalid_argument("Unexpected file format.");
        while (true) {
            lmFile.getLine(line, MAXLINE);
            size_t lineLen = strlen(line);
            if (line[0] == '\0') break;  // Empty line ends section.
            char *p = &line[0];

            // Read log probability.
            Prob prob = (Prob)pow(10.0, strtod(p, &p));
            p++;

            // Read the o words of the n-gram.
            NgramIndex  index = 0;
            const char *token = NULL;
            for (i = 1; i <= o; ++i) {
                token = p;
                while (*p != 0 && !isspace(*p)) ++p;
                len = p - token;
                *p++ = 0;
                VocabIndex vocabIndex = _vocab.Add(token, len);
                if (vocabIndex == Vocab::Invalid) {
                    index = NgramVector::Invalid;
                    break;
                }
                index = _vectors[i].Add(index, vocabIndex);
            }
            if (index == NgramVector::Invalid) break;

            // Set probability and backoff weight.
            if (index == Vocab::EndOfSentence && o == 1) {
                if (strcmp(token, "<s>") == 0) {
                    assert(prob <= pow(10, -99));
                    bows[index] = (p >= &line[lineLen]) ?
                        (Prob)1 : (Prob)pow(10.0, strtod(p, &p));
                } else {
                    probs[index] = prob;
                    assert(p >= &line[lineLen]);
                }
            } else {
                probs[index] = prob;
                if (hasBow) {
                    // Read optional backoff weight.
                    bows[index] = (p >= &line[lineLen]) ?
                        (Prob)1 : (Prob)pow(10.0, strtod(p, &p));
                }
            }
        }
    }

    // Read ARPA LM footer.
    while (lmFile.getLine(line, MAXLINE) && strcmp(line, "\\end\\") != 0)
        /* NOP */;

    // Sort and resize probs/bows to actual size.
    VocabVector vocabMap;
    IndexVector ngramMap(1, 0), boNgramMap;
    _vocab.Sort(vocabMap);
    for (size_t o = 0; o < size(); ++o) {
        boNgramMap.swap(ngramMap);
        if (_vectors[o].Sort(vocabMap, boNgramMap, ngramMap)) {
            NgramModel::ApplySort(ngramMap, probVectors[o]);
            if (o < bowVectors.size())
                NgramModel::ApplySort(ngramMap, bowVectors[o]);
        } else {
            probVectors[o].resize(ngramMap.length());
            if (o < bowVectors.size())
                bowVectors[o].resize(ngramMap.length());
        }
    }
    _ComputeBackoffs();
}
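// Layout of the ARPA file parsed above (standard back-off LM format,
// abbreviated hypothetical example):
//
//   \data\
//   ngram 1=4
//   ngram 2=2
//
//   \1-grams:
//   -99.0000  <s>   -0.2553
//   -1.2041   the   -0.3010
//   ...
//
//   \2-grams:
//   -0.6990   the cat
//
//   \end\
//
// Each n-gram line holds log10(prob), the n words, and an optional log10
// backoff weight; both are converted with pow(10, x) while reading, and a
// missing backoff field defaults to a weight of 1.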
void
NgramModel::LoadCounts(vector<CountVector> &countVectors,
                       ZFile &countsFile, bool reset) {
    if (countsFile == NULL) throw std::invalid_argument("Invalid file");

    // Resize vectors and allocate counts.
    countVectors.resize(size());
    countVectors[0].resize(1, 0);
    for (size_t o = 1; o < size(); ++o) {
        size_t capacity = std::max(1ul << 16, nextPowerOf2(_vectors[o].size()));
        if (reset)
            countVectors[o].reset(capacity, 0);
        else
            countVectors[o].resize(capacity, 0);
    }

    // Accumulate counts for each n-gram in counts file.
    char line[MAXLINE];
    vector<VocabIndex> words(256);
    while (countsFile.getLine(line, MAXLINE)) {
        if (line[0] == '\0' || line[0] == '#') continue;
        words.clear();
        char *p = &line[0];
        while (words.size() < size()) {
            const char *token = p;
            while (*p != 0 && !isspace(*p)) ++p;
            if (*p == 0) {
                // Last token in line: Add n-gram with count.
                bool       newNgram;
                size_t     order = words.size();
                NgramIndex index = 0;
                for (size_t i = 1; i < order; ++i)
                    index = _vectors[i].Add(index, words[i - 1]);
                index = _vectors[order].Add(index, words[order - 1], &newNgram);
                if (newNgram && (size_t)index >= countVectors[order].length())
                    countVectors[order].resize(
                        countVectors[order].length() * 2, 0);
                countVectors[order][index] += atoi(token);
                break;  // Move to next line.
            }
            // Not the last token: Lookup word index and add to words.
            size_t len = p - token;
            *p++ = 0;
            if (len > 0) {
                VocabIndex vocabIndex = _vocab.Add(token, len);
                if (vocabIndex == Vocab::Invalid) break;
                words.push_back(vocabIndex);
            }
        }
    }

    // Sort and resize counts to actual size.
    VocabVector vocabMap;
    IndexVector ngramMap(1, 0), boNgramMap;
    _vocab.Sort(vocabMap);
    for (size_t o = 0; o < size(); ++o) {
        boNgramMap.swap(ngramMap);
        if (_vectors[o].Sort(vocabMap, boNgramMap, ngramMap))
            NgramModel::ApplySort(ngramMap, countVectors[o]);
        else
            countVectors[o].resize(ngramMap.length());
    }
    _ComputeBackoffs();
}
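// Counts file format accepted by LoadCounts, inferred from the parser above
// (the same layout SRILM-style count files use): the n-gram words followed
// by an integer count, one n-gram per line; blank lines and '#' comments are
// skipped. A hypothetical example:
//
//   the           1500
//   the cat         45
//   the cat sat      9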