int main(int argc, char** argv)
{
  try {
    if (getoptions(argc, argv) != 0) 
      return 1;

    if (! temporary_dir.empty())
      ::setenv("TMPDIR_SPEC", temporary_dir.string().data(), 1);
    
    expgram::NGram ngram(ngram_file, shards, debug);
    
    utils::compress_ostream os(output_file);
    os << "ngram order: " << ngram.index.order() << '\n';
    for (int order = 1; order <= ngram.index.order(); ++ order)
      os << order << "-gram: " << std::setw(16) << ngram.index.ngram_size(order) << '\n';
    
    dump(os, "index",    ngram.stat_index()) << '\n';
    dump(os, "pointer",  ngram.stat_pointer()) << '\n';
    dump(os, "vocab",    ngram.stat_vocab()) << '\n';
    dump(os, "logprob",  ngram.stat_logprob()) << '\n';
    dump(os, "backoff",  ngram.stat_backoff()) << '\n';
    dump(os, "logbound", ngram.stat_logbound()) << '\n';
  }
  catch (std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    return 1;
  }
  return 0;
}
// Incoming references (refs) are stored as refs[file_id][sent_id] -> reference string.
// This function converts them into m_refs[sent_id] -> (vector of reference lengths, n-gram counts).
void BleuScoreFeature::LoadReferences(const std::vector< std::vector< std::string > >& refs)
{
    m_refs.clear();
    FactorCollection& fc = FactorCollection::Instance();
    for (size_t file_id = 0; file_id < refs.size(); file_id++) {
        for (size_t sent_id = 0; sent_id < refs[file_id].size(); sent_id++) {
            const string& ref = refs[file_id][sent_id];
            vector<string> refTokens  = Tokenize(ref);
            if (file_id == 0)
                m_refs[sent_id] = RefValue();
            pair<vector<size_t>,NGrams>& ref_pair = m_refs[sent_id];
            (ref_pair.first).push_back(refTokens.size());
            for (size_t order = 1; order <= BleuScoreState::bleu_order; order++) {
                for (size_t end_idx = order; end_idx <= refTokens.size(); end_idx++) {
                    Phrase ngram(1);
                    for (size_t s_idx = end_idx - order; s_idx < end_idx; s_idx++) {
                        const Factor* f = fc.AddFactor(Output, 0, refTokens[s_idx]);
                        Word w;
                        w.SetFactor(0, f);
                        ngram.AddWord(w);
                    }
                    ref_pair.second[ngram] += 1;
                }
            }
        }
    }

//	cerr << "Number of ref files: " << refs.size() << endl;
//	for (size_t i = 0; i < m_refs.size(); ++i) {
//		cerr << "Sent id " << i << ", number of references: " << (m_refs[i].first).size() << endl;
//	}
}
int main(int argc, char** argv)
{
  utils::mpi_world  mpi_world(argc, argv);
  
  const int mpi_rank = MPI::COMM_WORLD.Get_rank();
  const int mpi_size = MPI::COMM_WORLD.Get_size();  
  
  try {
    if (getoptions(argc, argv) != 0) 
      return 1;

    if (! temporary_dir.empty())
      ::setenv("TMPDIR_SPEC", temporary_dir.string().data(), 1);
    
    if (ngram_file.empty() || ! boost::filesystem::exists(ngram_file))
      throw std::runtime_error("no ngram file?");
    if (output_file.empty())
      throw std::runtime_error("no output file?");
    if (ngram_file == output_file)
      throw std::runtime_error("dump to the same directory?");
    
    ngram_type ngram(debug);
    ngram.open_shard(ngram_file, mpi_rank);

    if (static_cast<int>(ngram.index.size()) != mpi_size)
      throw std::runtime_error("MPI universe size do not match with ngram shard size");
    
    utils::resource start;

    ngram_quantize(ngram);

    utils::resource end;
    
    if (debug && mpi_rank == 0)
      std::cerr << "quantize language model"
		<< " cpu time:  " << end.cpu_time() - start.cpu_time() 
		<< " user time: " << end.user_time() - start.user_time()
		<< std::endl;
    
    if (mpi_rank == 0)
      ngram.write_prepare(output_file);
    
    MPI::COMM_WORLD.Barrier();
    ngram.write_shard(output_file, mpi_rank);
  }
  catch (std::exception& err) {
    std::cerr << "error: "  << err.what() << std::endl;
    MPI::COMM_WORLD.Abort(1);
    return 1;
  }
  return 0;
}
Example #4
NgramString ngramExtract(const std::string& string)
{
	NgramString result;

	for (size_t i = 3; i < string.length(); ++i)
	{
		char a = string[i - 3], b = string[i - 2], c = string[i - 1], d = string[i];
		unsigned int n = ngram(casefold(a), casefold(b), casefold(c), casefold(d));
		result.push_back(n);
	}

	return result;
}
Example #5
std::vector<uint> BleuModel::calculateClippedCounts(const BleuModel::Tokens_ candidate_tokens, const uint sent_no, const uint doc_no) const {
	
	//LOG(logger_, debug, "BleuModel::calculateClippedCounts");

	TokenHash_ tmp_counts;
	TokenHash_ reference = refNgramCounts_[doc_no][sent_no];

	std::vector<uint> clipped_counts(4);

	// loop over values of n from 1 to 4 (these are the n-grams used in calculating the BLEU score)
	for(uint n=0; n<4; n++){
			
		clipped_counts[n] = 0;
			
		if(candidate_tokens.size()>n){

			for(uint ngram_no=0; ngram_no<candidate_tokens.size()-n; ngram_no++){

				Tokens_ ngram(n+1);

				ngram[0] = candidate_tokens[ngram_no];
				for(uint j=1; j<=n; j++){		
					ngram[j] = candidate_tokens[ngram_no+j];
				}

				// check if the n-gram is in the reference translation for this sentence
				if (reference.find(ngram) != reference.end()) {

					// test if the n-gram has already been seen in this sentence, and if not create an entry for it with count 1				
					if (tmp_counts.find(ngram) == tmp_counts.end()) {
						tmp_counts.insert(std::make_pair(ngram,1));
					}
					// otherwise add one to its count
					else {
						tmp_counts[ngram]++;
					}

					// add one to the clipped count for the sentence, as long as the ngram hasn't been seen more times than it appears in the reference
					if(!(tmp_counts[ngram]>reference[ngram])){
						clipped_counts[n]++;
					}

				}
				
			}
		}
	}
	return clipped_counts;
}
Example #6
int
main(int argc, const char *argv[])
{
	char		 testword[] = "Testi";
	struct gram	*np = NULL;

	setlocale(LC_ALL, "");

	ngram(testword, sizeof(testword), 2);
	SLIST_FOREACH(np, &grams_head, grams)
		fprintf(stdout, "%s\n", np->buf);

	free_gramlist();

	return 0;
}
/** \brief Builds the required n-gram and returns its count.
 *
 * \param tokens tokens[i] contains ContextTracker::getToken(i)
 * \param offset entry point into tokens, must be a non-positive number
 * \param ngram_size size of the ngram whose count is returned, must not be greater than tokens size
 * \return count of the ngram built based on tokens, offset and ngram_size
 *
 * \verbatim
 Let tokens = [ "how", "are", "you", "today" ];

 count(tokens,  0, 3) returns the count associated with 3-gram [ "are", "you", "today" ].
 count(tokens, -1, 2) returns the count associated with 2-gram [ "are", "you" ];
 * \endverbatim
 *
 */
unsigned int SmoothedNgramPredictor::count(const std::vector<std::string>& tokens, int offset, int ngram_size) const
{
    unsigned int result = 0;

    assert(offset <= 0);      // TODO: handle this better
    assert(ngram_size >= 0);

    if (ngram_size > 0) {
	Ngram ngram(ngram_size);
	copy(tokens.end() - ngram_size + offset , tokens.end() + offset, ngram.begin());
	result = db->getNgramCount(ngram);
	logger << DEBUG << "count ngram: " << ngram_to_string (ngram) << " : " << result << endl;
    } else {
	result = db->getUnigramCountsSum();
	logger << DEBUG << "unigram counts sum: " << result << endl;
    }

    return result;
}
int main(int argc, char** argv)
{
  try {
    if (getoptions(argc, argv) != 0) 
      return 1;
    
    expgram::NGramCounts ngram(ngram_file, shards, debug);
    
    std::string  line;
    tokens_type  tokens;

    const int order = ngram.index.order();
    
    while (std::getline(std::cin, line)) {
      tokenizer_type tokenizer(line);
      
      // Here, we store the tokens in a vector<string>.
      
      tokens.clear();
      tokens.insert(tokens.end(), tokenizer.begin(), tokenizer.end());
      
      
      //
      // Alternatively, you can try: (Remark: sentence is simply vector<word_type>)
      // 
      // sentence.clear();
      // sentence.insert(sentence.end(), tokenizer.begin(), tokenizer.end());
      //
      // and iterate over sentence type, not tokens.
      //
      
      //
      // The above example still automatically converts word_type into word_type::id_type on the fly.
      // An alternative faster approach is: (Remark: we assume id_set is vector<word_type::id_type> )
      //
      // id_set.clear();
      // for (tokenizer_type::iterator titer = tokenizer.begin(); titer != tokenizer.end(); ++ titer)
      //   id_set.push_back(ngram.index.vocab()[*titer]);
      //
      // then, you can iterate over id_set, not tokens.
      //
      
      // 
      // Note that word_type will automatically assign an id which may not
      // match the word-id assigned by the indexed ngram language model.
      // This means that even words which are OOV to the ngram language model may be assigned a word-id.
      // If you want to avoid this, here is a solution:
      //
      // const word_type::id_type unk_id = ngram.index.vocab()[vocab_type::UNK]
      //
      // id_set.clear();
      // for (tokenizer_type::iterator titer = tokenizer.begin(); titer != tokenizer.end(); ++ titer)
      //   id_set.push_back(ngram.index.vocab().exists(*titer) ? ngram.index.vocab()[*titer] : unk_id);
      //

      // ngram access must use a container that supports the forward-iterator concept.
      // If not sure, use vector!

      std::copy(tokens.begin(), tokens.end(), std::ostream_iterator<std::string>(std::cout, " "));
      std::cout << ngram(tokens.begin(), tokens.end()) << std::endl;
    }
  }
  catch (std::exception& err) {
    std::cerr << "error: " << err.what() << std::endl;
    return 1;
  }
  return 0;
}
Example #9
// constructor
BleuModel::BleuModel(const Parameters &params) : logger_("BleuModel"){

	LOG(logger_, debug, "BleuModel::BleuModel");

	typedef boost::shared_ptr<NistXmlDocument> value_type;
        typedef std::vector<value_type>::iterator iterator;
        typedef std::vector<Word>::iterator word_iterator;

	std::string fileName = params.get<std::string>("reference-file", "");
	LOG(logger_, debug, "Reading file: " << fileName);
	NistXmlRefset Refset = NistXmlRefset(fileName);

	LOG(logger_, debug, "Number of documents in Refset = " << Refset.size() << "\n");

	// Loop over the documents in the refset
        for(iterator it = Refset.begin(); it != Refset.end(); ++it){
		
		uint doc_length = 0; // Number of words in the current ref doc
		std::vector<uint> sent_lengths; // Number of words in each sentence of the current ref doc
		std::vector<TokenHash_> sent_counts; // Ngram counts for each sentence of the current ref doc	

		uint number_sents = (*it)->asPlainTextDocument().getNumberOfSentences();
		//LOG(logger_, debug, "Number of sentences: " << number_sents);
		PlainTextDocument plain_doc = (*it)->asPlainTextDocument();

		// Loop over the sentences in the current document
		for(uint sentno=0;sentno<number_sents;sentno++){
			Tokens_ tokens;
			uint sentence_length = 0;
			//LOG(logger_, debug, "Sentence " << sentno);		
			for(word_iterator word_it = plain_doc.sentence_begin(sentno); word_it != plain_doc.sentence_end(sentno); ++word_it){
				tokens.push_back(*word_it);
				sentence_length++;
				doc_length++;		
				//LOG(logger_, debug, "Word: " << *word_it);
			}
			sent_lengths.push_back(sentence_length);
			//LOG(logger_, debug, "Finished reading sentence: " << sentence);
			LOG(logger_, debug, "Size of tokens vector = " << tokens.size());
			
			TokenHash_ sentence_counts;		

			// loop over values of n from 1 to 4 (these are the n-grams used in calculating the BLEU score)
			for(uint n=0; n<4; n++){
			//LOG(logger_, debug, "n = " << n);				
				if(tokens.size()>n){		
					// loop over the n-grams in the current sentence.		
					for(uint i = 0; i < tokens.size()-n; i++) {
						//LOG(logger_, debug, "i = " << i);				
				
						// create a blank n-gram of the correct size and add the first token
						Tokens_ ngram(n+1);
						ngram[0] = tokens[i];					
						//std::string ngram = tokens[i];
						for(uint j=1; j<=n; j++){
							//LOG(logger_, debug, "j = " << j);					
							ngram[j] = tokens[i+j];				
						}
						// test if the n-gram has already been seen, and if not create an entry for it with count 1				
						if (sentence_counts.find(ngram) == sentence_counts.end()) {
							sentence_counts.insert(std::make_pair(ngram,1));
						}
						// otherwise simply add one to its count
						else {
							sentence_counts[ngram]++;
						}
					}
				}
			}

			// add the current sentence counts to the vector for the current document
			sent_counts.push_back(sentence_counts);

		}
		refLength_.push_back(doc_length);		
		refSentLengths_.push_back(sent_lengths);	
		refNgramCounts_.push_back(sent_counts);
	}

}
void SmoothedNgramPredictor::learn(const std::vector<std::string>& change)
{
    logger << INFO << "learn(\"" << ngram_to_string(change) << "\")" << endl;

    if (learn_mode) {
	// learning is turned on

	std::map<std::list<std::string>, int> ngramMap;

	// build up ngram map for all cardinalities
	// i.e. learn all ngrams and counts in memory
	for (size_t curr_cardinality = 1;
	     curr_cardinality < cardinality + 1;
	     curr_cardinality++)
	{
	    int change_idx = 0;
	    int change_size = change.size();

	    std::list<std::string> ngram_list;

	    // take care of first N-1 tokens
	    for (int i = 0;
		 (i < curr_cardinality - 1 && change_idx < change_size);
		 i++)
	    {
		ngram_list.push_back(change[change_idx]);
		change_idx++;
	    }

	    while (change_idx < change_size)
	    {
		ngram_list.push_back(change[change_idx++]);
		ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
		ngram_list.pop_front();
	    }
	}

	// use (past stream - change) to learn tokens at the boundary of
	// change, i.e.
	//

	// if change is "bar foobar", then "bar" will only occur in a
	// 1-gram, since there are no tokens before it. By dipping into
	// the past stream, we gain additional context to learn a 2-gram
	// from the extra tokens (assuming the past stream ends with the
	// token "foo"):
	//
	// <"foo", "bar"> will be learnt
	//
	// We do this till we build up to n equal to cardinality.
	//
	// First check that change is not empty (nothing to learn) and
	// that change and past stream match by sampling first and
	// last token in change and comparing them with corresponding
	// tokens from past stream
	//
	if (change.size() > 0 &&
	    change.back() == contextTracker->getToken(1) &&
	    change.front() == contextTracker->getToken(change.size()))
	{
	    // create ngram list with first (oldest) token from change
	    std::list<std::string> ngram_list(change.begin(), change.begin() + 1);

	    // prepend token to ngram list by grabbing extra tokens
	    // from past stream (if there are any) till we have built
	    // up to n==cardinality ngrams, and commit them to
	    // ngramMap
	    //
	    for (int tk_idx = 1;
		 ngram_list.size() < cardinality;
		 tk_idx++)
	    {
		// getExtraTokenToLearn returns tokens from
		// past stream that come before and are not in
		// change vector
		//
		std::string extra_token = contextTracker->getExtraTokenToLearn(tk_idx, change);
		logger << DEBUG << "Adding extra token: " << extra_token << endl;

		if (extra_token.empty())
		{
		    break;
		}
		ngram_list.push_front(extra_token);

		ngramMap[ngram_list] = ngramMap[ngram_list] + 1;
	    }
	}

	// then write out to language model database
	try
	{
	    db->beginTransaction();

	    std::map<std::list<std::string>, int>::const_iterator it;
	    for (it = ngramMap.begin(); it != ngramMap.end(); it++)
	    {
		// convert ngram from list to vector based Ngram
		Ngram ngram((it->first).begin(), (it->first).end());

		// update the counts
		int count = db->getNgramCount(ngram);
		if (count > 0)
		{
		    // ngram already in database, update count
		    db->updateNgram(ngram, count + it->second);
		    check_learn_consistency(ngram);
		}
		else
		{
		    // ngram not in database, insert it
		    db->insertNgram(ngram, it->second);
		}
	    }

	    db->endTransaction();
	    logger << INFO << "Committed learning update to database" << endl;
	}
	catch (SqliteDatabaseConnector::SqliteDatabaseConnectorException& ex)
	{
	    db->rollbackTransaction();
	    logger << ERROR << "Rolling back learning update : " << ex.what() << endl;
	    throw;
	}
    }

    logger << DEBUG << "end learn()" << endl;
}
Example #11
int main(int argc, char *argv[]) {
	char *word = NULL;
	char *ptr = NULL;
	char line[MAXLINE];
	char *delim = ".,:;'/\"+-_(){}[]<>*&^%$#@!?|=~/|\\=1234567890 \t\n";
	FILE *fp = NULL;
	
	int opt;
	int slice = 2;
		
	if(argc == 1) {
		printf("Incorrect!!");
		return 1;
	}
	//Add a welcome function i.e., help function. 
	// I have no idea what ngram function does. 
	while((opt = getopt(argc,argv,"wn:ucil:")) != -1) {
		switch(opt) {
			case 'w':
				welcome_message();
				return 0;
			case 'n':
				if(strchr(optarg,'-') == NULL) {
					slice = atoi(optarg);
					if(slice < 1 || slice > 10) {
						fprintf(stderr, "%s: Error: Value too low or too high. Select n in range of 1 to 10 \n",PACKAGE);
						return 1;
					}
					else {
						/* We can use this in case a range is given instead of a number */
					}
				}
				break;
			case 'u':
				u_flag = 1;
				break;
			case 'c':
				c_flag = 1;
				break;
			case 'i':
				i_flag = 1;
				break;
			case 'l':
				l_flag = 1;
				break;
			case ':':
				fprintf(stderr,"%s: Error: Option needs to be an argument ",PACKAGE);
				welcome_message();
				break;
			case '?':
				fprintf(stderr,"%s: Error: Option needs to be an argument ",PACKAGE);
				welcome_message();
				break;
		}
	}
	
	while ((fgets(line, MAXLINE, stdin)) != NULL) {
		if(strlen(line)>1) {
			if(l_flag == 1)
				for(ptr = line; *ptr;ptr++)
					if(isupper(*ptr)) *ptr = tolower(*ptr);
			word = strtok(line,delim);
			while(word != NULL) {
				if(i_flag == 1 && strlen(word) < slice) {
					word = strtok(NULL,delim);
					continue;
				}
				if(r_flag == 0) 
					ngram(word,slice);
				
				word = strtok(NULL,delim);
			}
		}
	}
	if(fp == NULL) 
		printTree(root,stdout);
	else
		printTree(root,fp), pclose(fp);
	return 0;
}
Example #12
int main(int argc, char** argv)
{
  if (argc < 2) {
    std::cout << argv[0] << " ngram-pyp-file" << std::endl;
    return 1;
  }
  
  cicada::NGramNN lm(argv[1], true);
  
  sentence_type sentence;
  sentence_type ngram(1, vocab_type::BOS);
  
  double logprob_total = 0.0;
  double logprob = 0.0;
  size_t num_word = 0;
  size_t num_oov = 0;
  size_t num_sentence = 0;
  
  while (std::cin >> sentence) {
    ngram.resize(1);

    double logprob_sentence = 0.0;
	  
    sentence_type::const_iterator siter_end = sentence.end();
    for (sentence_type::const_iterator siter = sentence.begin(); siter != siter_end; ++ siter) {
      ngram.push_back(*siter);
      
      const bool is_oov = ! lm.vocab().exists(*siter);
      const double lp = lm(std::max(ngram.begin(), ngram.end() - lm.order()), ngram.end());
      
      std::cerr << "word logprob: " << lp << " oov: " << is_oov << std::endl;
      
      if (! is_oov)
	logprob_total += lp;
      logprob += lp;
      logprob_sentence += lp;
      
      num_oov += is_oov;
    }
    
    ngram.push_back(vocab_type::EOS);
    
    const double lp = lm(std::max(ngram.begin(), ngram.end() - lm.order()), ngram.end());
    
    logprob_total += lp;
    logprob += lp;
    logprob_sentence += lp;
    
    std::cerr << "logprob: " << logprob_sentence << std::endl;
    
    num_word += sentence.size();
    ++ num_sentence;
  }
  
  std::cout << "# of sentences: " << num_sentence
	    << " # of words: " << num_word
	    << " # of OOV: " << num_oov
	    << " order: " << lm.order()
	    << std::endl;
  
  std::cout << "logprob       = " << logprob_total << std::endl;
  std::cerr << "logprob(+oov) = " << logprob << std::endl;
  std::cout << "ppl           = " << utils::mathop::exp(- logprob_total / (num_word - num_oov + num_sentence)) << std::endl;
  std::cout << "ppl1          = " << utils::mathop::exp(- logprob_total / (num_word - num_oov)) << std::endl;
  std::cerr << "ppl(+oov)     = " << utils::mathop::exp(- logprob / (num_word + num_sentence)) << std::endl;
  std::cerr << "ppl1(+oov)    = " << utils::mathop::exp(- logprob / (num_word)) << std::endl;
}