コード例 #1
0
void 
NgramModel::_LoadTopicProbs2(vector<DoubleVector> &topicProbVectors,
                            ZFile &hmmldaFile, size_t maxSize) const {
    assert(maxSize <= size());

    vector<CountVector> countVectors(maxSize);
    topicProbVectors.resize(maxSize);
    for (size_t o = 0; o < maxSize; ++o) {
        countVectors[o].resize(sizes(o), 0);
        topicProbVectors[o].resize(sizes(o), 0);
    }

    // Accumulate counts for each n-gram in words.
    size_t      numSentenceWords = 1;
    IndexVector hists(maxSize, 0);
    char        line[MAXLINE];
    char        wordStr[1024];
    VocabIndex  word;
    int         state, topic;
    while (hmmldaFile.getLine( line, MAXLINE)) {
        if (line[0] == '#') continue;  // Skip comment lines.
        int numItems = sscanf(line, "%s\t%d\t%d\n", wordStr, &state, &topic);
        if (numItems != 3) throw std::invalid_argument("Bad format");
        word = _vocab.Find(wordStr);
        numSentenceWords++;

        NgramIndex hist = 0, index;
        for (size_t j = 1; j <= std::min(numSentenceWords, maxSize - 1); j++) {
            index = _vectors[j].Find(hist, word);
            if (index == -1) {
                printf("Feature skipped\n");
            } else {
                countVectors[j-1][hist]++;
                if (state == 1)
                    topicProbVectors[j-1][hist]++;
            }
            hist     = hists[j];
            hists[j] = index;
        }
        if (word == Vocab::EndOfSentence)
            numSentenceWords = 1;
    }

    // Finalize probability computation.
    for (size_t o = 0; o < maxSize; o++) {
        for (size_t i = 0; i < countVectors[o].length(); i++) {
            if (countVectors[o][i] == 0)
                topicProbVectors[o][i] = 0.0;
            else
                topicProbVectors[o][i] /= countVectors[o][i];
        }
    }    
}
コード例 #2
0
void
NgramModel::LoadFeatures(vector<DoubleVector> &featureVectors,
                         ZFile &featureFile, size_t maxOrder) const {
    if (featureFile == NULL) throw std::invalid_argument("Invalid file");

    // Allocate space for feature vectors.
    if (maxOrder == 0 || maxOrder > size() - 1)
        maxOrder = size() - 1;
    featureVectors.resize(maxOrder + 1);
    for (size_t i = 0; i <= maxOrder; i++)
        featureVectors[i].reset(sizes(i), 0);  // Initialize to 0.

    // Load feature value for each n-gram in feature file.
    char                    line[MAXLINE];
    vector<VocabIndex> words(256);
    while (featureFile.getLine(line, MAXLINE)) {
        if (line[0] == '\0' || line[0] == '#') continue;
        words.clear();
        char *p = &line[0];
        while (true) {
            const char *token = p;
            while (*p != 0 && !isspace(*p))  ++p;
            if (*p == 0)
            {
                // Last token in line: Set feature value.
                NgramIndex index = 0;
                for (size_t i = 1; i <= words.size(); i++)
                    index = _vectors[i].Find(index, words[i - 1]);
                if (index == -1)
                    Logger::Warn(1, "Feature skipped.\n");
                else
                    featureVectors[words.size()][index] = atof(token);
                break;  // Move to next line.
            }

            // Not the last token: Lookup word index and add to words.
            size_t len = p - token;
            *p++ = 0;
            if (len > 0) words.push_back(_vocab.Find(token, len));
            if (words.size() > maxOrder) break;
        }
    }
}
コード例 #3
0
double
PerplexityOptimizer::ShortCorpusComputeEntropy(ZFile &corpusFile, const ParamVector &params) {
    if (corpusFile == NULL) throw std::invalid_argument("Invalid file");
    size_t size = _lm._pModel->size();
//     BitVector vocabMask(size, 1);
    // Accumulate counts of prob/bow for computing perplexity of corpusFilename.
    char                    line[MAXLINE];
    size_t                  numOOV = 0;
    vector<VocabIndex> words(256);
    _totLogProb = 0.0;
    _numZeroProbs = 0;
    _numWords = 0;
    while (corpusFile.getLine( line, MAXLINE)) {
        if (strncmp(line, "<DOC ", 5) == 0 || strcmp(line, "</DOC>") == 0)
            continue;
//      Logger::Log(0, "Additional Input:%s\n", line);
        // Lookup vocabulary indices for each word in the line.
        words.clear();
//         words.push_back(Vocab::EndOfSentence);
        char *p = &line[0];
        while (*p != 0) {
            while (isspace(*p)) ++p;  // Skip consecutive spaces.
            const char *token = p;
            while (*p != 0 && !isspace(*p))  ++p;
            size_t len = p - token;
            if (*p != 0) *p++ = 0;
            words.push_back(_lm.vocab().Find(token, len));
        }
//         words.push_back(Vocab::EndOfSentence);

        // Add each top order n-gram.
        size_t ngramOrder = std::min((size_t)2, size - 1);
        for (size_t i = 1; i < words.size(); i++) {
            if (words[i] == Vocab::Invalid) {
                // OOV word encountered.  Reset order to unigrams.
                ngramOrder = 1;
                numOOV++;
            } else {
                NgramIndex index;
                size_t     boOrder = ngramOrder;
                while ((index = _lm._pModel->_Find(&words[i-boOrder+1], boOrder)) == -1) {
                    --boOrder;
                    NgramIndex hist = _lm._pModel->_Find(&words[i - boOrder], boOrder);
                    if (hist != (NgramIndex)-1) {
                        if ((_lm.bows(boOrder))[hist] != 0) {
//                         _bowCountVectors[boOrder][hist]++;
                          _totLogProb += log((_lm.bows(boOrder))[hist]) * 1;
                        }
                    }
                }
                ngramOrder = std::min(ngramOrder + 1, size - 1);
//                 _probCountVectors[boOrder][index]++;
                if ((_lm.probs(boOrder))[index] == 0)
                    _numZeroProbs++;
                else
                    _totLogProb += log((_lm.probs(boOrder))[index]) * 1;
                _numWords++;
            }
        }
    }
    double entropy = -_totLogProb / (_numWords - _numZeroProbs);
//     std::cout << -_totLogProb << "\t" << _numWords << "\t" << _numZeroProbs << std::endl;
    if (Logger::GetVerbosity() > 2)
        std::cout << exp(entropy) << "\t" << params << std::endl;
    else
        Logger::Log(2, "%f\n", exp(entropy));
    return std::isnan(entropy) ? 70 : entropy;
}
コード例 #4
0
void
NgramModel::_LoadEntropy(vector<DoubleVector> &entropyVectors,
                         ZFile &corpusFile, size_t maxSize) const {
    if (corpusFile == NULL) throw std::invalid_argument("Invalid file");

    // Resize vectors and allocate counts.
    if (maxSize == 0 || maxSize > size())
        maxSize = size();
    int numDocs = 0;
    vector<CountVector> countVectors(maxSize);
    vector<CountVector> totCountVectors(maxSize);
    entropyVectors.resize(maxSize);
    for (size_t o = 0; o < maxSize; ++o) {
        countVectors[o].resize(sizes(o), 0);
        totCountVectors[o].resize(sizes(o), 0);
        entropyVectors[o].resize(sizes(o), 0);
    }

    // Accumulate counts for each n-gram in corpus file.
    char line[MAXLINE];
    vector<VocabIndex> words(256);
    vector<NgramIndex> hists(maxSize);
    while (corpusFile.getLine( line, MAXLINE)) {
        if (strcmp(line, "</DOC>") == 0) {
            // Accumulate frequency.
            numDocs++;
            for (size_t o = 1; o < countVectors.size(); ++o) {
                for (size_t i = 0; i < countVectors[o].length(); ++i) {
                    int c = countVectors[o][i];
                    if (c > 0) {
                        totCountVectors[o][i] += c;
                        entropyVectors[o][i] += c * log(c);
                        countVectors[o][i] = 0;
                    }
                }
            }
            continue;
        } else if (strncmp(line, "<DOC ", 5) == 0)
            continue;

        // Lookup vocabulary indices for each word in the line.
        words.clear();
        words.push_back(Vocab::EndOfSentence);
        char *p = &line[0];
        while (*p != '\0') {
            while (isspace(*p)) ++p;  // Skip consecutive spaces.
            const char *token = p;
            while (*p != 0 && !isspace(*p))  ++p;
            size_t len = p - token;
            if (*p != 0) *p++ = 0;
            words.push_back(_vocab.Find(token, len));
        }
        words.push_back(Vocab::EndOfSentence);

        // Add each order n-gram.
        hists[1] = _vectors[1].Find(0, Vocab::EndOfSentence);
        for (size_t i = 1; i < words.size(); ++i) {
            VocabIndex word = words[i];
            NgramIndex hist = 0;
            for (size_t j = 1; j < std::min(i + 2, maxSize); ++j) {
                if (word != Vocab::Invalid) {
                    NgramIndex index = _vectors[j].Find(hist, word);
                    if (index >= 0)
                        countVectors[j][index]++;
                    else
                        Logger::Warn(1, "DocFreq feature skipped.\n");
                    hist     = hists[j];
                    hists[j] = index;
                } else {
                    hist     = hists[j];
                    hists[j] = NgramVector::Invalid;
                }
            }
        }
    }

    // Finalize entropy computation.
    double invLogNumDocs = 1.0 / log((double)numDocs);
    for (size_t o = 1; o < maxSize; o++)
        entropyVectors[o] = CondExpr(
            totCountVectors[o] == 0, 0.0,
            ((entropyVectors[o] / -totCountVectors[o])
             + log(asDouble(totCountVectors[o]))) * invLogNumDocs);
}
コード例 #5
0
void
NgramModel::LoadCorpus(vector<CountVector> &countVectors,
                       ZFile &corpusFile, bool reset) {
    if (corpusFile == NULL) throw std::invalid_argument("Invalid file (corpusFile is NULL)");

    // Resize vectors and allocate counts.
    countVectors.resize(size());
    countVectors[0].resize(1, 0);
    for (size_t o = 1; o < size(); ++o) {
        size_t capacity = std::max(1ul<<16, nextPowerOf2(_vectors[o].size()));
        if (reset)
            countVectors[o].reset(capacity, 0);
        else
            countVectors[o].resize(capacity, 0);
    }

    // Accumulate counts for each n-gram in corpus file.
    char line[MAXLINE];
    vector<VocabIndex> words(256);
    vector<NgramIndex> hists(size(), -1);
    while (corpusFile.getLine( line, MAXLINE)) {
        // Lookup vocabulary indices for each word in the line.
        words.clear();
        words.push_back(Vocab::EndOfSentence);
        char *p = &line[0];
        while (*p != '\0') {
            while (isspace(*p)) ++p;  // Skip consecutive spaces.
            const char *token = p;
            while (*p != 0 && !isspace(*p))  ++p;
            size_t len = p - token;
            if (*p != 0) *p++ = 0;
            words.push_back(_vocab.Add(token, len));
        }
        words.push_back(Vocab::EndOfSentence);

        // Add each order n-gram.
        hists[1] = _vectors[1].Add(0, Vocab::EndOfSentence);
        for (size_t i = 1; i < words.size(); ++i) {
            VocabIndex word = words[i];
            NgramIndex hist = 0;
            for (size_t j = 1; j < std::min(i + 2, size()); ++j) {
                if (word != Vocab::Invalid && hist != NgramVector::Invalid) {
                    bool       newNgram;
                    NgramIndex index = _vectors[j].Add(hist, word, &newNgram);
                    if (newNgram && (size_t)index >= countVectors[j].length())
                        countVectors[j].resize(countVectors[j].length() * 2, 0);
                    countVectors[j][index]++;
                    hist     = hists[j];
                    hists[j] = index;
                } else {
                    hist     = hists[j];
                    hists[j] = NgramVector::Invalid;
                }
            }
        }
    }

    // Add remaining vocabulary, if necessary.
    if (_vectors[1].size() != _vocab.size()) {
        for (VocabIndex i = 0; i < (VocabIndex)_vocab.size(); ++i)
            _vectors[1].Add(0, i);
        countVectors[1].resize(_vocab.size(), 0);
    }

    // Sort and resize counts to actual size.
    VocabVector vocabMap;
    IndexVector ngramMap(1, 0), boNgramMap;
    _vocab.Sort(vocabMap);
    for (size_t o = 0; o < size(); ++o) {
        boNgramMap.swap(ngramMap);
        if (_vectors[o].Sort(vocabMap, boNgramMap, ngramMap))
            NgramModel::ApplySort(ngramMap, countVectors[o]);
        else
            countVectors[o].resize(ngramMap.length());
    }
    _ComputeBackoffs();
}
コード例 #6
0
void
NgramModel::LoadEvalCorpus(vector<CountVector> &probCountVectors,
                           vector<CountVector> &bowCountVectors,
                           BitVector &vocabMask,
                           ZFile &corpusFile,
                           size_t &outNumOOV,
                           size_t &outNumWords) const {
    if (corpusFile == NULL) throw std::invalid_argument("Invalid file");

    // Allocate count vectors.
    probCountVectors.resize(size());
    bowCountVectors.resize(size() - 1);
    for (size_t i = 0; i < size(); i++)
        probCountVectors[i].reset(_vectors[i].size(), 0);
    for (size_t i = 0; i < size() - 1; i++)
        bowCountVectors[i].reset(_vectors[i].size(), 0);

    // Accumulate counts of prob/bow for computing perplexity of corpusFilename.
    char                    line[MAXLINE];
    size_t                  numOOV = 0;
    size_t                  numWords = 0;
    vector<VocabIndex> words(256);
    while (corpusFile.getLine( line, MAXLINE)) {
        if (strncmp(line, "<DOC ", 5) == 0 || strcmp(line, "</DOC>") == 0)
            continue;
// 	Logger::Log(0, "Additional Input:%s\n", line);
        // Lookup vocabulary indices for each word in the line.
        words.clear();
//         words.push_back(Vocab::EndOfSentence);
        char *p = &line[0];
        while (*p != 0) {
            while (isspace(*p)) ++p;  // Skip consecutive spaces.
            const char *token = p;
            while (*p != 0 && !isspace(*p))  ++p;
            size_t len = p - token;
            if (*p != 0) *p++ = 0;
            words.push_back(_vocab.Find(token, len));
        }
//         words.push_back(Vocab::EndOfSentence);

        // Add each top order n-gram.
        size_t ngramOrder = std::min((size_t)2, size() - 1);
        for (size_t i = 1; i < words.size(); i++) {
            if (words[i] == Vocab::Invalid || !vocabMask[words[i]]) {
                // OOV word encountered.  Reset order to unigrams.
                ngramOrder = 1;
                numOOV++;
            } else {
                NgramIndex index;
                size_t     boOrder = ngramOrder;
                while ((index = _Find(&words[i-boOrder+1], boOrder)) == -1) {
                    --boOrder;
                    NgramIndex hist = _Find(&words[i - boOrder], boOrder);
                    if (hist != (NgramIndex)-1)
                        bowCountVectors[boOrder][hist]++;
                }
                ngramOrder = std::min(ngramOrder + 1, size() - 1);
                probCountVectors[boOrder][index]++;
                numWords++;
            }
        }
    }
    outNumOOV   = numOOV;
    outNumWords = numWords;
}
コード例 #7
0
void
NgramModel::LoadLM(vector<ProbVector> &probVectors,
                   vector<ProbVector> &bowVectors,
                   ZFile &lmFile) {
    if (lmFile == NULL) throw std::invalid_argument("Invalid file");

    // Read ARPA LM header.
    char           line[MAXLINE];
    size_t         o, len;
    vector<size_t> ngramLengths(1);
    while (lmFile.getLine( line, MAXLINE) && strcmp(line, "\\data\\") != 0)
        /* NOP */;
    while (lmFile.getLine( line, MAXLINE)) {
        unsigned int o, len;
        if (sscanf(line, "ngram %u=%u", &o, &len) != 2)
            break;
        assert(o == ngramLengths.size());
        ngramLengths.push_back(len);
    }

    // Allocate buffers and read counts.
    _vocab.Reserve(ngramLengths[1]);
    probVectors.resize(size());
    probVectors[0].resize(1, 0.0);
    bowVectors.resize(size() - 1);
    bowVectors[0].resize(1, 0.0);
    for (o = 1; o < size(); o++) {
        ProbVector &probs  = probVectors[o];
        ProbVector &bows   = bowVectors[o];
        bool        hasBow = (o < size() - 1);

        // Preallocate buffer for n-grams.
        _vectors[o].Reserve(ngramLengths[o]);
        probs.reset(ngramLengths[o]);
        if (hasBow) bows.reset(ngramLengths[o]);

        lmFile.getLine( line, MAXLINE);
        unsigned int i;
        if (sscanf(line, "\\%u-ngrams:", &i) != 1 || i != o) {
            throw std::invalid_argument("Unexpected file format.");
        }
        while (true) {
          lmFile.getLine( line, MAXLINE);
          size_t lineLen = strlen(line);
          if (line[0] == '\0') break;  // Empty line ends section.
          char *p = &line[0];
          
            // Read log probability.
            Prob prob = (Prob)pow(10.0, strtod(p, &p)); p++;

            // Read i words.
            NgramIndex index  = 0;
            const char *token = NULL;
            for (i = 1; i <= o; ++i) {
                token = p;
                while (*p != 0 && !isspace(*p)) ++p;
                len = p - token;
                *p++ = 0;
                VocabIndex vocabIndex = _vocab.Add(token, len);
                if (vocabIndex == Vocab::Invalid) {
                    index = NgramVector::Invalid;
                    break;
                }
                index = _vectors[i].Add(index, vocabIndex);
            }
            if (index == NgramVector::Invalid) break;

            // Set probability and backoff weight.
            if (index == Vocab::EndOfSentence && o == 1) {
                if (strcmp(token, "<s>") == 0) {
                    assert(prob <= pow(10, -99));
                    bows[index] = (p >= &line[lineLen]) ?
                        (Prob)1 : (Prob)pow(10.0, strtod(p, &p));
                } else {
                    probs[index] = prob;
                    assert(p >= &line[lineLen]);
                }
            } else {
                probs[index] = prob;
                if (hasBow) {
                    // Read optional backoff weight.
                    bows[index] = (p >= &line[lineLen]) ?
                        (Prob)1 : (Prob)pow(10.0, strtod(p, &p));
                }
            }
        }
    }

    // Read ARPA LM footer.
    while (lmFile.getLine( line, MAXLINE) &&
           strcmp(line, "\\end\\") != 0)  /* NOP */;

    // Sort and resize probs/bows to actual size.
    VocabVector vocabMap;
    IndexVector ngramMap(1, 0), boNgramMap;
    _vocab.Sort(vocabMap);
    for (size_t o = 0; o < size(); ++o) {
        boNgramMap.swap(ngramMap);
        if (_vectors[o].Sort(vocabMap, boNgramMap, ngramMap)) {
            NgramModel::ApplySort(ngramMap, probVectors[o]);
            if (o < bowVectors.size())
                NgramModel::ApplySort(ngramMap, bowVectors[o]);
        } else {
            probVectors[o].resize(ngramMap.length());
            if (o < bowVectors.size())
                bowVectors[o].resize(ngramMap.length());
        }
    }
    _ComputeBackoffs();
}
コード例 #8
0
void
NgramModel::LoadCounts(vector<CountVector> &countVectors,
                       ZFile &countsFile, bool reset) {
    if (countsFile == NULL) throw std::invalid_argument("Invalid file");

    // Resize vectors and allocate counts.
    countVectors.resize(size());
    countVectors[0].resize(1, 0);
    for (size_t o = 1; o < size(); ++o) {
        size_t capacity = std::max(1ul<<16, nextPowerOf2(_vectors[o].size()));
        if (reset)
            countVectors[o].reset(capacity, 0);
        else
            countVectors[o].resize(capacity, 0);
    }

    // Accumulate counts for each n-gram in counts file.
    char                    line[MAXLINE];
    vector<VocabIndex> words(256);
    while (countsFile.getLine( line, MAXLINE)) {
        if (line[0] == '\0' || line[0] == '#') continue;

        words.clear();
        char *p = &line[0];
        while (words.size() < size()) {
            const char *token = p;
            while (*p != 0 && !isspace(*p))  ++p;
            if (*p == 0) {
                // Last token in line: Add ngram with count
                bool       newNgram;
                size_t     order = words.size();
                NgramIndex index = 0;
                for (size_t i = 1; i < order; ++i)
                    index = _vectors[i].Add(index, words[i - 1]);
                index = _vectors[order].Add(index, words[order - 1], &newNgram);
                if (newNgram && (size_t)index >= countVectors[order].length())
                    countVectors[order].resize(countVectors[order].length() * 2,
                                               0);
                countVectors[order][index] += atoi(token);
                break;  // Move to next line.
            }

            // Not the last token: Lookup word index and add to words.
            size_t len = p - token;
            *p++ = 0;
            if (len > 0) {
                VocabIndex vocabIndex = _vocab.Add(token, len);
                if (vocabIndex == Vocab::Invalid) break;
                words.push_back(vocabIndex);
            }
        }
    }

    // Sort and resize counts to actual size.
    VocabVector vocabMap;
    IndexVector ngramMap(1, 0), boNgramMap;
    _vocab.Sort(vocabMap);
    for (size_t o = 0; o < size(); ++o) {
        boNgramMap.swap(ngramMap);
        if (_vectors[o].Sort(vocabMap, boNgramMap, ngramMap))
            NgramModel::ApplySort(ngramMap, countVectors[o]);
        else
            countVectors[o].resize(ngramMap.length());
    }
    _ComputeBackoffs();
}