int main(int argc, char** argv)
{
  try {
    if (getoptions(argc, argv) != 0)
      return 1;

    if (! temporary_dir.empty())
      ::setenv("TMPDIR_SPEC", temporary_dir.string().data(), 1);

    expgram::NGram ngram(ngram_file, shards, debug);

    utils::compress_ostream os(output_file);

    os << "ngram order: " << ngram.index.order() << '\n';
    for (int order = 1; order <= ngram.index.order(); ++ order)
      os << order << "-gram: " << std::setw(16) << ngram.index.ngram_size(order) << '\n';

    dump(os, "index",    ngram.stat_index())    << '\n';
    dump(os, "pointer",  ngram.stat_pointer())  << '\n';
    dump(os, "vocab",    ngram.stat_vocab())    << '\n';
    dump(os, "logprob",  ngram.stat_logprob())  << '\n';
    dump(os, "backoff",  ngram.stat_backoff())  << '\n';
    dump(os, "logbound", ngram.stat_logbound()) << '\n';
  }
  catch (std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    return 1;
  }
  return 0;
}
// Incoming references (refs) are stored as refs[file_id][sent_id] -> reference
// This data structure: m_refs[sent_id] -> ( vector<length>, ngrams )
void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs)
{
  m_refs.clear();
  FactorCollection& fc = FactorCollection::Instance();
  for (size_t file_id = 0; file_id < refs.size(); file_id++) {
    for (size_t sent_id = 0; sent_id < refs[file_id].size(); sent_id++) {
      const string& ref = refs[file_id][sent_id];
      vector<string> refTokens = Tokenize(ref);
      if (file_id == 0)
        m_refs[sent_id] = RefValue();
      pair<vector<size_t>,NGrams>& ref_pair = m_refs[sent_id];
      (ref_pair.first).push_back(refTokens.size());
      for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
        for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
          Phrase ngram(1);
          for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
            const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
            Word w;
            w.SetFactor(0, f);
            ngram.AddWord(w);
          }
          ref_pair.second[ngram] += 1;
        }
      }
    }
  }
  // cerr << "Number of ref files: " << refs.size() << endl;
  // for (size_t i = 0; i < m_refs.size(); ++i) {
  //   cerr << "Sent id " << i << ", number of references: " << (m_refs[i].first).size() << endl;
  // }
}
int main(int argc, char** argv)
{
  utils::mpi_world mpi_world(argc, argv);

  const int mpi_rank = MPI::COMM_WORLD.Get_rank();
  const int mpi_size = MPI::COMM_WORLD.Get_size();

  try {
    if (getoptions(argc, argv) != 0)
      return 1;

    if (! temporary_dir.empty())
      ::setenv("TMPDIR_SPEC", temporary_dir.string().data(), 1);

    if (ngram_file.empty() || ! boost::filesystem::exists(ngram_file))
      throw std::runtime_error("no ngram file?");
    if (output_file.empty())
      throw std::runtime_error("no output file?");
    if (ngram_file == output_file)
      throw std::runtime_error("dump to the same directory?");

    ngram_type ngram(debug);
    ngram.open_shard(ngram_file, mpi_rank);

    if (static_cast<int>(ngram.index.size()) != mpi_size)
      throw std::runtime_error("MPI universe size does not match the ngram shard size");

    utils::resource start;
    ngram_quantize(ngram);
    utils::resource end;

    if (debug && mpi_rank == 0)
      std::cerr << "quantize language model"
                << " cpu time: "  << end.cpu_time()  - start.cpu_time()
                << " user time: " << end.user_time() - start.user_time()
                << std::endl;

    if (mpi_rank == 0)
      ngram.write_prepare(output_file);

    MPI::COMM_WORLD.Barrier();
    ngram.write_shard(output_file, mpi_rank);
  }
  catch (std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    MPI::COMM_WORLD.Abort(1);
    return 1;
  }
  return 0;
}
NgramString ngramExtract(const std::string& string)
{
  NgramString result;
  for (size_t i = 3; i < string.length(); ++i) {
    char a = string[i - 3], b = string[i - 2], c = string[i - 1], d = string[i];
    unsigned int n = ngram(casefold(a), casefold(b), casefold(c), casefold(d));
    result.push_back(n);
  }
  return result;
}
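ngramExtract() slides a window of four characters over the input and maps each case-folded window to a single unsigned int key via the project's ngram() and casefold() helpers, which are not shown here. The following is only a plausible sketch of such helpers, assuming ASCII case folding and packing one 8-bit character per byte of the key; it is not the project's actual implementation.

#include <cctype>

// Assumed helpers for illustration only.
inline unsigned char casefold(char c)
{
    return static_cast<unsigned char>(std::tolower(static_cast<unsigned char>(c)));
}

inline unsigned int ngram(unsigned char a, unsigned char b, unsigned char c, unsigned char d)
{
    // pack four 8-bit characters into one 32-bit key
    return (static_cast<unsigned int>(a) << 24) |
           (static_cast<unsigned int>(b) << 16) |
           (static_cast<unsigned int>(c) << 8)  |
            static_cast<unsigned int>(d);
}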
std::vector<uint> BleuModel::calculateClippedCounts(const BleuModel::Tokens_ candidate_tokens,
                                                    const uint sent_no, const uint doc_no) const
{
  //LOG(logger_, debug, "BleuModel::calculateClippedCounts");
  TokenHash_ tmp_counts;
  TokenHash_ reference = refNgramCounts_[doc_no][sent_no];
  std::vector<uint> clipped_counts(4);

  // loop over values of n from 1 to 4 (these are the n-grams used in calculating the BLEU score)
  for (uint n = 0; n < 4; n++) {
    clipped_counts[n] = 0;
    if (candidate_tokens.size() > n) {
      for (uint ngram_no = 0; ngram_no < candidate_tokens.size() - n; ngram_no++) {
        Tokens_ ngram(n + 1);
        ngram[0] = candidate_tokens[ngram_no];
        for (uint j = 1; j <= n; j++) {
          ngram[j] = candidate_tokens[ngram_no + j];
        }
        // check if the n-gram is in the reference translation for this sentence
        if (reference.find(ngram) != reference.end()) {
          // if the n-gram has not been seen in this sentence yet, create an entry for it with count 1
          if (tmp_counts.find(ngram) == tmp_counts.end()) {
            tmp_counts.insert(std::make_pair(ngram, 1));
          }
          // otherwise add one to its count
          else {
            tmp_counts[ngram]++;
          }
          // add one to the clipped count for the sentence, as long as the n-gram has not been
          // seen more times than it appears in the reference
          if (!(tmp_counts[ngram] > reference[ngram])) {
            clipped_counts[n]++;
          }
        }
      }
    }
  }
  return clipped_counts;
}
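The four clipped counts returned above are the numerators of the modified n-gram precisions in the standard BLEU definition. The sketch below shows how they would typically be combined with the candidate and reference lengths into a sentence-level score (geometric mean of the precisions times a brevity penalty). It is an illustration of the standard formula, not part of BleuModel's interface, and the degenerate-case handling (returning 0 instead of smoothing) is an assumption.

#include <cmath>
#include <vector>

// Illustrative only: combine clipped counts (n = 1..4) into sentence-level BLEU.
double sentenceBleu(const std::vector<unsigned int>& clipped_counts, // from calculateClippedCounts()
                    unsigned int candidate_length,                   // tokens in the candidate sentence
                    unsigned int reference_length)                   // tokens in the reference sentence
{
    double log_precision_sum = 0.0;
    for (unsigned int n = 0; n < 4; ++n) {
        // total number of (n+1)-grams in the candidate
        const double total = (candidate_length > n) ? static_cast<double>(candidate_length - n) : 0.0;
        if (total == 0.0 || clipped_counts[n] == 0)
            return 0.0; // real implementations usually smooth instead of returning 0
        log_precision_sum += std::log(static_cast<double>(clipped_counts[n]) / total);
    }
    // brevity penalty: penalise candidates shorter than the reference
    const double bp = (candidate_length < reference_length)
        ? std::exp(1.0 - static_cast<double>(reference_length) / candidate_length)
        : 1.0;
    return bp * std::exp(log_precision_sum / 4.0);
}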
int main(int argc, const char *argv[])
{
    char testword[] = "Testi";
    struct gram *np = NULL;

    setlocale(LC_ALL, "");
    ngram(testword, sizeof(testword), 2);

    SLIST_FOREACH(np, &grams_head, grams)
        fprintf(stdout, "%s\n", np->buf);

    free_gramlist();
    return 0;
}
/** \brief Builds the required n-gram and returns its count.
 *
 * \param tokens     tokens[i] contains ContextTracker::getToken(i)
 * \param offset     entry point into tokens, must be a non-positive number
 * \param ngram_size size of the n-gram whose count is returned, must not be greater than tokens size
 * \return count of the n-gram built based on tokens, offset and ngram_size
 *
 * \verbatim
   Let tokens = [ "how", "are", "you", "today" ];

   count(tokens,  0, 3) returns the count associated with the 3-gram [ "are", "you", "today" ].
   count(tokens, -1, 2) returns the count associated with the 2-gram [ "are", "you" ].
   \endverbatim
 */
unsigned int SmoothedNgramPredictor::count(const std::vector<std::string>& tokens, int offset, int ngram_size) const
{
    unsigned int result = 0;

    assert(offset <= 0);      // TODO: handle this better
    assert(ngram_size >= 0);

    if (ngram_size > 0) {
        Ngram ngram(ngram_size);
        copy(tokens.end() - ngram_size + offset, tokens.end() + offset, ngram.begin());
        result = db->getNgramCount(ngram);
        logger << DEBUG << "count ngram: " << ngram_to_string(ngram) << " : " << result << endl;
    } else {
        result = db->getUnigramCountsSum();
        logger << DEBUG << "unigram counts sum: " << result << endl;
    }

    return result;
}
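To see why the offset/ngram_size convention documented above is useful, here is a minimal sketch of how such counts can be turned into an interpolated relative-frequency estimate for the newest token: the k-gram ending at the newest token is count(tokens, 0, k), and its (k-1)-gram history is count(tokens, -1, k-1), which for k == 1 falls back to the unigram counts sum. The interpolation weights and the function itself are assumptions for illustration, not the predictor's actual smoothing code.

#include <string>
#include <vector>

// Illustrative only. CountFn is any callable with the same signature and
// semantics as SmoothedNgramPredictor::count() documented above.
template <typename CountFn>
double interpolated_probability(CountFn count,
                                const std::vector<std::string>& tokens,
                                const std::vector<double>& weights)
{
    double prob = 0.0;
    const int n = static_cast<int>(weights.size());
    for (int k = 1; k <= n; ++k) {
        const double numerator   = count(tokens,  0, k);     // k-gram ending at the newest token
        const double denominator = count(tokens, -1, k - 1); // its (k-1)-gram history
        if (denominator > 0.0)
            prob += weights[k - 1] * (numerator / denominator);
    }
    return prob;
}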
int main(int argc, char** argv)
{
  try {
    if (getoptions(argc, argv) != 0)
      return 1;

    expgram::NGramCounts ngram(ngram_file, shards, debug);

    std::string line;
    tokens_type tokens;

    const int order = ngram.index.order();

    while (std::getline(std::cin, line)) {
      tokenizer_type tokenizer(line);

      // Here, we store the tokens in a vector<string>.
      tokens.clear();
      tokens.insert(tokens.end(), tokenizer.begin(), tokenizer.end());

      // Alternatively, you can try: (Remark: sentence is simply vector<word_type>)
      //
      //   sentence.clear();
      //   sentence.insert(sentence.end(), tokenizer.begin(), tokenizer.end());
      //
      // and iterate over the sentence type, not tokens.
      //
      // The above example still automatically converts word_type into word_type::id_type on the fly.
      // An alternative, faster approach is: (Remark: we assume id_set is vector<word_type::id_type>)
      //
      //   id_set.clear();
      //   for (tokenizer_type::iterator titer = tokenizer.begin(); titer != tokenizer.end(); ++ titer)
      //     id_set.push_back(ngram.index.vocab()[*titer]);
      //
      // then you can iterate over id_set, not tokens.
      //
      // Note that word_type will automatically assign an id which may not match the word-id
      // assigned by the indexed ngram language model. This means that even a word that is OOV
      // for the ngram language model may be assigned a word-id. If you want to avoid this,
      // here is a solution:
      //
      //   const word_type::id_type unk_id = ngram.index.vocab()[vocab_type::UNK];
      //
      //   id_set.clear();
      //   for (tokenizer_type::iterator titer = tokenizer.begin(); titer != tokenizer.end(); ++ titer)
      //     id_set.push_back(ngram.index.vocab().exists(*titer) ? ngram.index.vocab()[*titer] : unk_id);
      //
      // ngram access must use a container that supports the forward-iterator concept.
      // If not sure, use vector!

      std::copy(tokens.begin(), tokens.end(), std::ostream_iterator<std::string>(std::cout, " "));
      std::cout << ngram(tokens.begin(), tokens.end()) << std::endl;
    }
  }
  catch (std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    return 1;
  }
  return 0;
}
// constructor
BleuModel::BleuModel(const Parameters &params)
  : logger_("BleuModel")
{
  LOG(logger_, debug, "BleuModel::BleuModel");

  typedef boost::shared_ptr<NistXmlDocument> value_type;
  typedef std::vector<value_type>::iterator iterator;
  typedef std::vector<Word>::iterator word_iterator;

  std::string fileName = params.get<std::string>("reference-file", "");
  LOG(logger_, debug, "Reading file: " << fileName);

  NistXmlRefset Refset = NistXmlRefset(fileName);
  LOG(logger_, debug, "Number of documents in Refset = " << Refset.size() << "\n");

  // Loop over the documents in the refset
  for (iterator it = Refset.begin(); it != Refset.end(); ++it) {
    uint doc_length = 0;                   // number of words in the current ref doc
    std::vector<uint> sent_lengths;        // number of words in each sentence of the current ref doc
    std::vector<TokenHash_> sent_counts;   // n-gram counts for each sentence of the current ref doc

    uint number_sents = (*it)->asPlainTextDocument().getNumberOfSentences();
    //LOG(logger_, debug, "Number of sentences: " << number_sents);
    PlainTextDocument plain_doc = (*it)->asPlainTextDocument();

    // Loop over the sentences in the current document
    for (uint sentno = 0; sentno < number_sents; sentno++) {
      Tokens_ tokens;
      uint sentence_length = 0;
      //LOG(logger_, debug, "Sentence " << sentno);
      for (word_iterator word_it = plain_doc.sentence_begin(sentno);
           word_it != plain_doc.sentence_end(sentno); ++word_it) {
        tokens.push_back(*word_it);
        sentence_length++;
        doc_length++;
        //LOG(logger_, debug, "Word: " << *word_it);
      }
      sent_lengths.push_back(sentence_length);
      //LOG(logger_, debug, "Finished reading sentence: " << sentence);
      LOG(logger_, debug, "Size of tokens vector = " << tokens.size());

      TokenHash_ sentence_counts;
      // loop over values of n from 1 to 4 (these are the n-grams used in calculating the BLEU score)
      for (uint n = 0; n < 4; n++) {
        //LOG(logger_, debug, "n = " << n);
        if (tokens.size() > n) {
          // loop over the n-grams in the current sentence
          for (uint i = 0; i < tokens.size() - n; i++) {
            //LOG(logger_, debug, "i = " << i);
            // create a blank n-gram of the correct size and add the first token
            Tokens_ ngram(n + 1);
            ngram[0] = tokens[i];
            //std::string ngram = tokens[i];
            for (uint j = 1; j <= n; j++) {
              //LOG(logger_, debug, "j = " << j);
              ngram[j] = tokens[i + j];
            }
            // if the n-gram has not been seen yet, create an entry for it with count 1
            if (sentence_counts.find(ngram) == sentence_counts.end()) {
              sentence_counts.insert(std::make_pair(ngram, 1));
            }
            // otherwise simply add one to its count
            else {
              sentence_counts[ngram]++;
            }
          }
        }
      }
      // add the current sentence counts to the vector for the current document
      sent_counts.push_back(sentence_counts);
    }
    refLength_.push_back(doc_length);
    refSentLengths_.push_back(sent_lengths);
    refNgramCounts_.push_back(sent_counts);
  }
}
void SmoothedNgramPredictor::learn(const std::vector<std::string>& change)
{
    logger << INFO << "learn(\"" << ngram_to_string(change) << "\")" << endl;

    if (learn_mode) {
        // learning is turned on
        std::map<std::list<std::string>, int> ngramMap;

        // build up ngram map for all cardinalities,
        // i.e. learn all ngrams and counts in memory
        for (size_t curr_cardinality = 1; curr_cardinality < cardinality + 1; curr_cardinality++) {
            int change_idx = 0;
            int change_size = change.size();

            std::list<std::string> ngram_list;

            // take care of first N-1 tokens
            for (int i = 0; (i < curr_cardinality - 1 && change_idx < change_size); i++) {
                ngram_list.push_back(change[change_idx]);
                change_idx++;
            }

            while (change_idx < change_size) {
                ngram_list.push_back(change[change_idx++]);
                ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
                ngram_list.pop_front();
            }
        }

        // Use (past stream - change) to learn tokens at the change boundary.
        //
        // If change is "bar foobar", then "bar" will only occur in a 1-gram,
        // since there is no token before it. By dipping into the past stream,
        // we get additional context to learn a 2-gram from extra tokens
        // (assuming the past stream ends with token "foo"):
        //
        //   <"foo", "bar"> will be learnt
        //
        // We do this till we build up to n equal to cardinality.
        //
        // First check that change is not empty (nothing to learn) and that
        // change and past stream match, by sampling the first and last token
        // in change and comparing them with the corresponding tokens from the
        // past stream.
        //
        if (change.size() > 0
            && change.back() == contextTracker->getToken(1)
            && change.front() == contextTracker->getToken(change.size())) {
            // create ngram list with first (oldest) token from change
            std::list<std::string> ngram_list(change.begin(), change.begin() + 1);

            // Prepend tokens to the ngram list by grabbing extra tokens from
            // the past stream (if there are any) till we have built up to
            // n == cardinality ngrams, and commit them to ngramMap.
            //
            for (int tk_idx = 1; ngram_list.size() < cardinality; tk_idx++) {
                // getExtraTokenToLearn returns tokens from the past stream
                // that come before and are not in the change vector
                std::string extra_token = contextTracker->getExtraTokenToLearn(tk_idx, change);
                logger << DEBUG << "Adding extra token: " << extra_token << endl;

                if (extra_token.empty()) {
                    break;
                }
                ngram_list.push_front(extra_token);

                ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
            }
        }

        // then write out to language model database
        try {
            db->beginTransaction();

            std::map<std::list<std::string>, int>::const_iterator it;
            for (it = ngramMap.begin(); it != ngramMap.end(); it++) {
                // convert ngram from list to vector based Ngram
                Ngram ngram((it->first).begin(), (it->first).end());

                // update the counts
                int count = db->getNgramCount(ngram);
                if (count > 0) {
                    // ngram already in database, update count
                    db->updateNgram(ngram, count + it->second);
                    check_learn_consistency(ngram);
                } else {
                    // ngram not in database, insert it
                    db->insertNgram(ngram, it->second);
                }
            }

            db->endTransaction();
            logger << INFO << "Committed learning update to database" << endl;
        } catch (SqliteDatabaseConnector::SqliteDatabaseConnectorException& ex) {
            db->rollbackTransaction();
            logger << ERROR << "Rolling back learning update : " << ex.what() << endl;
            throw;
        }
    }

    logger << DEBUG << "end learn()" << endl;
}
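The first loop above is a sliding-window count over the change vector, one pass per n-gram order. The standalone sketch below reproduces just that idea outside the predictor (function name and structure are mine, for illustration only): it counts every 1..cardinality-gram in a small token vector and prints the resulting map.

#include <iostream>
#include <list>
#include <map>
#include <string>
#include <vector>

// Illustrative only: count all n-grams of orders 1..cardinality in `change`.
std::map<std::list<std::string>, int>
build_ngram_map(const std::vector<std::string>& change, size_t cardinality)
{
    std::map<std::list<std::string>, int> ngram_map;
    for (size_t n = 1; n <= cardinality; ++n) {
        std::list<std::string> window;
        for (size_t i = 0; i < change.size(); ++i) {
            window.push_back(change[i]);
            if (window.size() > n)
                window.pop_front();
            if (window.size() == n)
                ++ngram_map[window];   // same effect as ngramMap[ngram_list] + 1 above
        }
    }
    return ngram_map;
}

int main()
{
    std::vector<std::string> change;
    change.push_back("the");
    change.push_back("quick");
    change.push_back("brown");

    // prints the 1-grams "the", "quick", "brown" and the 2-grams
    // "the quick", "quick brown", each with count 1
    std::map<std::list<std::string>, int> m = build_ngram_map(change, 2);
    for (std::map<std::list<std::string>, int>::const_iterator it = m.begin(); it != m.end(); ++it) {
        for (std::list<std::string>::const_iterator w = it->first.begin(); w != it->first.end(); ++w)
            std::cout << *w << ' ';
        std::cout << ": " << it->second << '\n';
    }
    return 0;
}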
int main(int argc, char *argv[])
{
    char *word = NULL;
    char *ptr = NULL;
    char line[MAXLINE];
    char delim[] = ".,:;'/\"+-_(){}[]<>*&^%$#@!?|=~/|\\=1234567890 \t\n";
    FILE *fp = NULL;
    int opt;
    int slice = 2;

    if (argc == 1) {
        fprintf(stderr, "%s: Error: no options given\n", PACKAGE);
        return 1;
    }

    /* TODO: add a welcome function, i.e. a help function. */
    while ((opt = getopt(argc, argv, "wn:ucil")) != -1) {
        switch (opt) {
        case 'w':
            welcome_message();
            return 0;
        case 'n':
            if (strchr(optarg, '-') == NULL) {
                slice = atoi(optarg);
                if (slice < 1 || slice > 10) {
                    fprintf(stderr, "%s: Error: Value too low or too high. Select n in the range 1 to 10\n", PACKAGE);
                    return 1;
                } else {
                    /* We can use this in case a range is given instead of a number */
                }
            }
            break;
        case 'u':
            u_flag = 1;
            break;
        case 'c':
            c_flag = 1;
            break;
        case 'i':
            i_flag = 1;
            break;
        case 'l':
            l_flag = 1;
            break;
        case ':':
        case '?':
            fprintf(stderr, "%s: Error: Option requires an argument\n", PACKAGE);
            welcome_message();
            break;
        }
    }

    while ((fgets(line, MAXLINE, stdin)) != NULL) {
        if (strlen(line) > 1) {
            if (l_flag == 1)
                for (ptr = line; *ptr; ptr++)
                    if (isupper((unsigned char)*ptr))
                        *ptr = tolower((unsigned char)*ptr);

            word = strtok(line, delim);
            while (word != NULL) {
                if (i_flag == 1 && strlen(word) < (size_t)slice) {
                    word = strtok(NULL, delim);
                    continue;
                }
                if (r_flag == 0)
                    ngram(word, slice);
                word = strtok(NULL, delim);
            }
        }
    }

    if (fp == NULL)
        printTree(root, stdout);
    else
        printTree(root, fp), pclose(fp);

    return 0;
}
int main(int argc, char** argv)
{
  if (argc < 2) {
    std::cout << argv[0] << " ngram-pyp-file" << std::endl;
    return 1;
  }

  cicada::NGramNN lm(argv[1], true);

  sentence_type sentence;
  sentence_type ngram(1, vocab_type::BOS);

  double logprob_total = 0.0;
  double logprob = 0.0;
  size_t num_word = 0;
  size_t num_oov = 0;
  size_t num_sentence = 0;

  while (std::cin >> sentence) {
    ngram.resize(1);

    double logprob_sentence = 0.0;

    sentence_type::const_iterator siter_end = sentence.end();
    for (sentence_type::const_iterator siter = sentence.begin(); siter != siter_end; ++ siter) {
      ngram.push_back(*siter);

      const bool is_oov = ! lm.vocab().exists(*siter);
      const double lp = lm(std::max(ngram.begin(), ngram.end() - lm.order()), ngram.end());

      std::cerr << "word logprob: " << lp << " oov: " << is_oov << std::endl;

      if (! is_oov)
        logprob_total += lp;
      logprob += lp;
      logprob_sentence += lp;

      num_oov += is_oov;
    }

    ngram.push_back(vocab_type::EOS);
    const double lp = lm(std::max(ngram.begin(), ngram.end() - lm.order()), ngram.end());

    logprob_total += lp;
    logprob += lp;
    logprob_sentence += lp;

    std::cerr << "logprob: " << logprob_sentence << std::endl;

    num_word += sentence.size();
    ++ num_sentence;
  }

  std::cout << "# of sentences: " << num_sentence
            << " # of words: " << num_word
            << " # of OOV: " << num_oov
            << " order: " << lm.order()
            << std::endl;

  std::cout << "logprob = " << logprob_total << std::endl;
  std::cerr << "logprob(+oov) = " << logprob << std::endl;
  std::cout << "ppl = "  << utils::mathop::exp(- logprob_total / (num_word - num_oov + num_sentence)) << std::endl;
  std::cout << "ppl1 = " << utils::mathop::exp(- logprob_total / (num_word - num_oov)) << std::endl;
  std::cerr << "ppl(+oov) = "  << utils::mathop::exp(- logprob / (num_word + num_sentence)) << std::endl;
  std::cerr << "ppl1(+oov) = " << utils::mathop::exp(- logprob / (num_word)) << std::endl;

  return 0;
}
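For reference, the perplexities printed at the end correspond to the following quantities, with W the number of words, O the number of OOV words, S the number of sentences (each contributing one EOS event), logprob_total the accumulated log-probability over in-vocabulary words plus EOS, and logprob additionally including OOV words (assuming utils::mathop::exp and the language-model log-probabilities use matching bases):

  ppl        = exp( -logprob_total / (W - O + S) )
  ppl1       = exp( -logprob_total / (W - O) )
  ppl(+oov)  = exp( -logprob       / (W + S) )
  ppl1(+oov) = exp( -logprob       /  W )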