int PhraseAlignment::Compare(const PhraseAlignment &other) const
	{
		if (this == &other) // comparing with itself
			return 0;

		if (GetTarget() != other.GetTarget()) 
			return ( GetTarget() < other.GetTarget() ) ? -1 : +1;

		if (GetSource() != other.GetSource())
			return ( GetSource() < other.GetSource() ) ? -1 : +1;

		if (!hierarchicalFlag) 
			return 0;

		// loop over all words (note: 0 = left hand side of rule)
		for(size_t i=0; i<phraseT.size()-1; i++) {
			if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
				size_t thisAlign = *(alignedToT[i].begin());
				size_t otherAlign = *(other.alignedToT[i].begin());

				if (alignedToT[i].size() != 1 ||
					other.alignedToT[i].size() != 1 ||
					thisAlign != otherAlign)
				{
					int ret = (thisAlign < otherAlign) ? -1 : +1;
					return ret;
				}
			}
		}
		return 0;

	}
	// check if two word alignments between a phrase pair are the same
	bool PhraseAlignment::equals( const PhraseAlignment& other )
	{
		if (this == &other) return true;
		if (other.GetTarget() != GetTarget()) return false;
		if (other.GetSource() != GetSource()) return false;
		if (other.alignedToT != alignedToT) return false;
		if (other.alignedToS != alignedToS) return false;
		return true;
	}
Beispiel #3
0
    int PhraseAlignment::Compare(const PhraseAlignment &other) const
    {
        if (this == &other) // comparing with itself
            return 0;

        if (GetTarget() != other.GetTarget()) //先比的是目标端
            return ( GetTarget() < other.GetTarget() ) ? -1 : +1;

        if (GetSource() != other.GetSource())
            return ( GetSource() < other.GetSource() ) ? -1 : +1;

        return 0;

    }
Beispiel #4
0
 // check if two word alignments between a phrase pair are the same
 bool PhraseAlignment::equals( const PhraseAlignment& other )
 {
     if (this == &other) return true;
     if (other.GetRuleId() != this->GetRuleId() ) return false;
     if (other.alignedToT != alignedToT) return false;
     if (other.alignedToS != alignedToS) return false;
     return true;
 }
	// check if two word alignments between a phrase pairs "match"
	// i.e. they do not differ in the alignment of non-termimals
	bool PhraseAlignment::match( const PhraseAlignment& other )
	{
		if (this == &other) return true;
		if (other.GetTarget() != GetTarget()) return false;
		if (other.GetSource() != GetSource()) return false;
		if (!hierarchicalFlag) return true;

		assert(phraseT.size() == alignedToT.size() + 1);
		assert(alignedToT.size() == other.alignedToT.size());

		// loop over all words (note: 0 = left hand side of rule)
		for(size_t i=0; i<phraseT.size()-1; i++) {
			if (isNonTerminal( vcbT.getWord( phraseT[i] ) )) {
				if (alignedToT[i].size() != 1 ||
					other.alignedToT[i].size() != 1 ||
					*(alignedToT[i].begin()) != *(other.alignedToT[i].begin()))
					return false;
			}
		}
		return true;
	}
int main(int argc, char* argv[])
{
  cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n"
       << "modifying PhraseScore v1.4 written by Philipp Koehn\n"
       << "It computes statistics for extracted phrase pairs\n"
       << "if (direct):\n"
       << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n"
       << "if (inverse)\n"
       << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n";

  if (argc != 4 && argc != 5) {
    cerr << "syntax: statistics extract lex phrase-table [inverse]\n";
    exit(1);
  }
  char* &fileNameExtract = argv[1];
  char* &fileNameLex = argv[2];
  char* &fileNamePhraseTable = argv[3];
  inverseFlag = false;
  if (argc > 4) {
    inverseFlag = true;
    cerr << "using inverse mode\n";
  }

  // lexical translation table
  lexTable.load( fileNameLex );

  // sorted phrase extraction file
  Moses::InputFileStream extractFile(fileNameExtract);

  if (extractFile.fail()) {
    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
    exit(1);
  }
  istream &extractFileP = extractFile;

  // output file: phrase translation table
  phraseTableFile.open(fileNamePhraseTable);
  if (phraseTableFile.fail()) {
    cerr << "ERROR: could not open file phrase table file "
         << fileNamePhraseTable << endl;
    exit(1);
  }

  // loop through all extracted phrase translations
  int lastForeign = -1;
  vector< PhraseAlignment > phrasePairsWithSameF;
  int i=0;

  string line;
  while(getline(extractFileP, line)) {
    if (extractFileP.eof()) break;
    if (++i % 100000 == 0) cerr << "." << flush;

    PhraseAlignment phrasePair;
    bool isPhrasePair = phrasePair.create( line.c_str(), i );
    if (lastForeign >= 0 && lastForeign != phrasePair.foreign) {
      processPhrasePairs( phrasePairsWithSameF );
      for(size_t j=0; j<phrasePairsWithSameF.size(); j++)
        phrasePairsWithSameF[j].clear();
      phrasePairsWithSameF.clear();
      phraseTableE.clear();
      phraseTableF.clear();
      phrasePair.clear(); // process line again, since phrase tables flushed
      phrasePair.create( line.c_str(), i );
      phrasePairBase = 0;
    }
    lastForeign = phrasePair.foreign;
    if (isPhrasePair)
      phrasePairsWithSameF.push_back( phrasePair );
    else
      phrasePairBase++;
  }
  processPhrasePairs( phrasePairsWithSameF );
  phraseTableFile.close();
}
Beispiel #7
0
int main(int argc, char* argv[])
{
  cerr << "Score v2.0 written by Philipp Koehn\n"
       << "scoring methods for extracted rules\n";

  if (argc < 4) {
    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--SourceSyntax] [--TargetSyntax] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring coc-file] [--KneserNey coc-file] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] \n";
    exit(1);
  }
  char* fileNameExtract = argv[1];
  char* fileNameLex = argv[2];
  char* fileNamePhraseTable = argv[3];
  char* fileNameCountOfCounts;
  char* fileNameFunctionWords;

  for(int i=4; i<argc; i++) {
    if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
      inverseFlag = true;
      cerr << "using inverse mode\n";
    } else if (strcmp(argv[i],"--Hierarchical") == 0) {
      hierarchicalFlag = true;
      cerr << "processing hierarchical rules\n";
    } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      sourceSyntaxFlag = true;
      cerr << "using source syntax\n";
    } else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      targetSyntaxFlag = true;
      cerr << "using target syntax\n";
    } else if (strcmp(argv[i],"--PCFG") == 0) {
      pcfgFlag = true;
      cerr << "including PCFG scores\n";
    } else if (strcmp(argv[i],"--WordAlignment") == 0) {
      wordAlignmentFlag = true;
      cerr << "outputing word alignment" << endl;
    } else if (strcmp(argv[i],"--NoLex") == 0) {
      lexFlag = false;
      cerr << "not computing lexical translation score\n";
    } else if (strcmp(argv[i],"--GoodTuring") == 0) {
      goodTuringFlag = true;
      if (i+1==argc) { 
        cerr << "ERROR: specify count of count files for Good Turing discounting!\n";
        exit(1);
      }
      fileNameCountOfCounts = argv[++i];
      cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
    } else if (strcmp(argv[i],"--KneserNey") == 0) {
      kneserNeyFlag = true;
      if (i+1==argc) { 
        cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
        exit(1);
      }
      fileNameCountOfCounts = argv[++i];
      cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
    } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
      unalignedFlag = true;
      cerr << "using unaligned word penalty\n";
    } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
      unalignedFWFlag = true;
      if (i+1==argc) { 
        cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
        exit(1);
      }
      fileNameFunctionWords = argv[++i];
      cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl;
    } else if (strcmp(argv[i],"--LogProb") == 0) {
      logProbFlag = true;
      cerr << "using log-probabilities\n";
    } else if (strcmp(argv[i],"--NegLogProb") == 0) {
      logProbFlag = true;
      negLogProb = -1;
      cerr << "using negative log-probabilities\n";
    } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
      minCountHierarchical = atof(argv[++i]);
      cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
      minCountHierarchical -= 0.00001; // account for rounding
    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
      outputNTLengths = true;
    } else {
      cerr << "ERROR: unknown option " << argv[i] << endl;
      exit(1);
    }
  }

  // is model string-to-tree?  perhaps confusingly, this always refers to the
  // forward direction.  same goes for sourceSyntaxFlag and targetSyntaxFlag.
  stringToTreeFlag = (!sourceSyntaxFlag && targetSyntaxFlag);

  // lexical translation table
  if (lexFlag)
    lexTable.load( fileNameLex );

  // function word list
  if (unalignedFWFlag)
    loadFunctionWords( fileNameFunctionWords );

  // compute count of counts for Good Turing discounting
  if (goodTuringFlag || kneserNeyFlag) {
    for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
  }

  // sorted phrase extraction file
  Moses::InputFileStream extractFile(fileNameExtract);

  if (extractFile.fail()) {
    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
    exit(1);
  }
  istream &extractFileP = extractFile;

  // output file: phrase translation table
	ostream *phraseTableFile;

	if (strcmp(fileNamePhraseTable, "-") == 0) {
		phraseTableFile = &cout;
	}
	else {
		ofstream *outputFile = new ofstream();
		outputFile->open(fileNamePhraseTable);
		if (outputFile->fail()) {
			cerr << "ERROR: could not open file phrase table file "
					 << fileNamePhraseTable << endl;
			exit(1);
		}
		phraseTableFile = outputFile;
	}
	
  // loop through all extracted phrase translations
  float lastCount = 0.0f;
  float lastPcfgSum = 0.0f;
  vector< PhraseAlignment > phrasePairsWithSameF;
  int i=0;
  char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
  lastLine[0] = '\0';
  PhraseAlignment *lastPhrasePair = NULL;
  while(true) {
    if (extractFileP.eof()) break;
    if (++i % 100000 == 0) cerr << "." << flush;
    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
    if (extractFileP.eof())	break;

    // identical to last line? just add count
    if (strcmp(line,lastLine) == 0) {
      lastPhrasePair->count += lastCount;
      lastPhrasePair->pcfgSum += lastPcfgSum;
      continue;
    }
    strcpy( lastLine, line );

    // create new phrase pair
    PhraseAlignment phrasePair;
    phrasePair.create( line, i );
    lastCount = phrasePair.count;
    lastPcfgSum = phrasePair.pcfgSum;

    // only differs in count? just add count
    if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) {
      lastPhrasePair->count += phrasePair.count;
      lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
      continue;
    }

    // if new source phrase, process last batch
    if (lastPhrasePair != NULL &&
        lastPhrasePair->GetSource() != phrasePair.GetSource()) {
      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
      phrasePairsWithSameF.clear();
      lastPhrasePair = NULL;
    }

    // add phrase pairs to list, it's now the last one
    phrasePairsWithSameF.push_back( phrasePair );
    lastPhrasePair = &phrasePairsWithSameF.back();
  }
  processPhrasePairs( phrasePairsWithSameF, *phraseTableFile );
	
	phraseTableFile->flush();
	if (phraseTableFile != &cout) {
		(dynamic_cast<ofstream*>(phraseTableFile))->close();
		delete phraseTableFile;
	}

  // output count of count statistics
  if (goodTuringFlag || kneserNeyFlag) {
    writeCountOfCounts( fileNameCountOfCounts );
  }
}
Beispiel #8
0
 // check if two word alignments between a phrase pairs "match"
 // i.e. they do not differ in the alignment of non-termimals
 bool PhraseAlignment::match( const PhraseAlignment& other )
 {
     if (this == &other) return true;
     if(other.GetRuleId() != this->GetRuleId()) return false;
     return true;
 }
Beispiel #9
0
int main(int argc, char* argv[]) 
{
	cerr << "Score v2.0 written by Philipp Koehn\n"
	     << "scoring methods for extracted rules\n";

	if (argc < 4) {
		cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--OnlyDirect] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment file]\n";
		exit(1);
	}
	char* fileNameExtract = argv[1];
	char* fileNameLex = argv[2];
	char* fileNamePhraseTable = argv[3];

	for(int i=4;i<argc;i++) {
		if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
			inverseFlag = true;
			cerr << "using inverse mode\n";
		}
		else if (strcmp(argv[i],"--Hierarchical") == 0) {
			hierarchicalFlag = true;
			cerr << "processing hierarchical rules\n";
		}
		else if (strcmp(argv[i],"--OnlyDirect") == 0) {
			onlyDirectFlag = true;
			cerr << "outputing in correct phrase table format (no merging with inverse)\n";
		}
		else if (strcmp(argv[i],"--WordAlignment") == 0) {
			wordAlignmentFlag = true;
			cerr << "outputing word alignment" << endl;
		}
		else if (strcmp(argv[i],"--NoLex") == 0) {
			lexFlag = false;
			cerr << "not computing lexical translation score\n";
		}
		else if (strcmp(argv[i],"--GoodTuring") == 0) {
			goodTuringFlag = true;
			cerr << "using Good Turing discounting\n";
		}
		else if (strcmp(argv[i],"--LogProb") == 0) {
			logProbFlag = true;
			cerr << "using log-probabilities\n";
		}
		else if (strcmp(argv[i],"--NegLogProb") == 0) {
			logProbFlag = true;
			negLogProb = -1;
			cerr << "using negative log-probabilities\n";
		}
		else {
			cerr << "ERROR: unknown option " << argv[i] << endl;
			exit(1);
		}
	}

	// lexical translation table
	if (lexFlag)
		lexTable.load( fileNameLex );
  
	// compute count of counts for Good Turing discounting
	if (goodTuringFlag)
		computeCountOfCounts( fileNameExtract );

	// sorted phrase extraction file
	ifstream extractFile;
	extractFile.open(fileNameExtract);
	if (extractFile.fail()) {
		cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
		exit(1);
	}
	istream &extractFileP = extractFile;

	// output file: phrase translation table
	phraseTableFile.open(fileNamePhraseTable);
	if (phraseTableFile.fail()) 
	{
		cerr << "ERROR: could not open file phrase table file " 
		     << fileNamePhraseTable << endl;
		exit(1);
	}
  
  // loop through all extracted phrase translations
  int lastSource = -1;
  vector< PhraseAlignment > phrasePairsWithSameF;
  int i=0;
	char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
	lastLine[0] = '\0';
	PhraseAlignment *lastPhrasePair = NULL;
  while(true) {
    if (extractFileP.eof()) break;
    if (++i % 100000 == 0) cerr << "." << flush;
    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
    if (extractFileP.eof())	break;
				
		// identical to last line? just add count
		if (lastSource > 0 && strcmp(line,lastLine) == 0)
		{
			lastPhrasePair->addToCount( line );
			continue;			
		}
		strcpy( lastLine, line );

		// create new phrase pair
		PhraseAlignment phrasePair;
		phrasePair.create( line, i );
		
		// only differs in count? just add count
		if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair ))
		{
			lastPhrasePair->count += phrasePair.count;
			phrasePair.clear();
			continue;
		}
		
		// if new source phrase, process last batch
		if (lastSource >= 0 && lastSource != phrasePair.GetSource()) {
			processPhrasePairs( phrasePairsWithSameF );
			for(int j=0;j<phrasePairsWithSameF.size();j++)
				phrasePairsWithSameF[j].clear();
			phrasePairsWithSameF.clear();
			phraseTableT.clear();
			phraseTableS.clear();
			// process line again, since phrase tables flushed
			phrasePair.clear();
			phrasePair.create( line, i ); 
		}
		
		// add phrase pairs to list, it's now the last one
		lastSource = phrasePair.GetSource();
		phrasePairsWithSameF.push_back( phrasePair );
		lastPhrasePair = &phrasePairsWithSameF[phrasePairsWithSameF.size()-1];
	}
	processPhrasePairs( phrasePairsWithSameF );
	phraseTableFile.close();
}
Beispiel #10
0
void computeCountOfCounts( char* fileNameExtract )
{
	cerr << "computing counts of counts";
	for(int i=1;i<=GT_MAX;i++) countOfCounts[i] = 0;

	ifstream extractFile;
	extractFile.open( fileNameExtract );
	if (extractFile.fail()) {
		cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
		exit(1);
	}
	istream &extractFileP = extractFile;

	// loop through all extracted phrase translations
	int i=0;
	char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
	lastLine[0] = '\0';
	PhraseAlignment *lastPhrasePair = NULL;
	while(true) {
		if (extractFileP.eof()) break;
		if (++i % 100000 == 0) cerr << "." << flush;
		SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
		if (extractFileP.eof())	break;
		
		// identical to last line? just add count
		if (strcmp(line,lastLine) == 0)
		{
			lastPhrasePair->addToCount( line );
			continue;			
		}
		strcpy( lastLine, line );

		// create new phrase pair
		PhraseAlignment *phrasePair = new PhraseAlignment();
		phrasePair->create( line, i );
		
		if (i == 1)
		{
			lastPhrasePair = phrasePair;
			continue;
		}

		// only differs in count? just add count
		if (lastPhrasePair->match( *phrasePair ))
		{
			lastPhrasePair->count += phrasePair->count;
			phrasePair->clear();
			delete(phrasePair);
			continue;
		}

		// periodically house cleaning
		if (phrasePair->GetSource() != lastPhrasePair->GetSource())
		{
			phraseTableT.clear(); // these would get too big
			phraseTableS.clear(); // these would get too big
			// process line again, since phrase tables flushed
			phrasePair->clear();
			phrasePair->create( line, i ); 
		}

		int count = lastPhrasePair->count + 0.99999;
		if(count <= GT_MAX)
			countOfCounts[ count ]++;
		lastPhrasePair->clear();
		delete( lastPhrasePair );
		lastPhrasePair = phrasePair;
	}
	
	delete lastPhrasePair;

	discountFactor[0] = 0.01; // floor
	cerr << "\n";
	for(int i=1;i<GT_MAX;i++)
	{
		discountFactor[i] = ((float)i+1)/(float)i*(((float)countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1));
		cerr << "count " << i << ": " << countOfCounts[ i ] << ", discount factor: " << discountFactor[i];
		// some smoothing...
		if (discountFactor[i]>1) 
			discountFactor[i] = 1;
		if (discountFactor[i]<discountFactor[i-1])
			discountFactor[i] = discountFactor[i-1];
		cerr << " -> " << discountFactor[i]*i << endl;
	}
}
Beispiel #11
0
int main(int argc, char* argv[])
{
  cerr << "Score v2.0 written by Philipp Koehn\n"
       << "scoring methods for extracted rules\n";

  if (argc < 4) {
    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]] [--Singleton] [--CrossedNonTerm] \n";
    exit(1);
  }
  string fileNameExtract = argv[1];
  string fileNameLex = argv[2];
  string fileNamePhraseTable = argv[3];
  string fileNameCountOfCounts;
  char* fileNameFunctionWords = NULL;
  char* fileNameDomain = NULL;

  for(int i=4; i<argc; i++) {
    if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
      inverseFlag = true;
      cerr << "using inverse mode\n";
    } else if (strcmp(argv[i],"--Hierarchical") == 0) {
      hierarchicalFlag = true;
      cerr << "processing hierarchical rules\n";
    } else if (strcmp(argv[i],"--PCFG") == 0) {
      pcfgFlag = true;
      cerr << "including PCFG scores\n";
    } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
      unpairedExtractFormatFlag = true;
      cerr << "processing unpaired extract format\n";
    } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
      conditionOnTargetLhsFlag = true;
      cerr << "processing unpaired extract format\n";
    } else if (strcmp(argv[i],"--WordAlignment") == 0) {
      wordAlignmentFlag = true;
      cerr << "outputing word alignment" << endl;
    } else if (strcmp(argv[i],"--NoLex") == 0) {
      lexFlag = false;
      cerr << "not computing lexical translation score\n";
    } else if (strcmp(argv[i],"--GoodTuring") == 0) {
      goodTuringFlag = true;
			fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
      cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
    } else if (strcmp(argv[i],"--KneserNey") == 0) {
      kneserNeyFlag = true;
			fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
      cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
    } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
      unalignedFlag = true;
      cerr << "using unaligned word penalty\n";
    } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
      unalignedFWFlag = true;
      if (i+1==argc) { 
        cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n";
        exit(1);
      }
      fileNameFunctionWords = argv[++i];
      cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl;
    } else if (strcmp(argv[i],"--SparseDomainIndicator") == 0 ||
               strcmp(argv[i],"--SparseDomainRatio") == 0 ||
               strcmp(argv[i],"--SparseDomainSubset") == 0 ||
               strcmp(argv[i],"--DomainIndicator") == 0 ||
               strcmp(argv[i],"--DomainRatio") == 0 ||
               strcmp(argv[i],"--DomainSubset") == 0) {
      includeSentenceIdFlag = true;
      domainFlag = true;
      domainSparseFlag = strstr( argv[i], "Sparse" );
      domainRatioFlag = strstr( argv[i], "Ratio" );
      domainSubsetFlag = strstr( argv[i], "Subset" );
      if (i+1==argc) {
        cerr << "ERROR: specify domain info file with " << argv[i] << endl;
        exit(1);
      }
      fileNameDomain = argv[++i];
    } else if (strcmp(argv[i],"--LogProb") == 0) {
      logProbFlag = true;
      cerr << "using log-probabilities\n";
    } else if (strcmp(argv[i],"--NegLogProb") == 0) {
      logProbFlag = true;
      negLogProb = -1;
      cerr << "using negative log-probabilities\n";
    } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
      minCountHierarchical = atof(argv[++i]);
      cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
      minCountHierarchical -= 0.00001; // account for rounding
    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
      outputNTLengths = true;
    } else if (strcmp(argv[i],"--Singleton") == 0) {
      singletonFeature = true;
      cerr << "binary singleton feature\n";
    } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
      crossedNonTerm = true;
      cerr << "crossed non-term reordering feature\n";
    } else {
      cerr << "ERROR: unknown option " << argv[i] << endl;
      exit(1);
    }
  }

  // lexical translation table
  if (lexFlag)
    lexTable.load( fileNameLex );

  // function word list
  if (unalignedFWFlag)
    loadFunctionWords( fileNameFunctionWords );

  // load domain information
  if (domainFlag) {
    if (inverseFlag) {
      domainFlag = false;
      includeSentenceIdFlag = false;
    }
    else {
      domain = new Domain;
      domain->load( fileNameDomain );
    }
  }

  // compute count of counts for Good Turing discounting
  if (goodTuringFlag || kneserNeyFlag) {
    for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
  }

  // sorted phrase extraction file
  Moses::InputFileStream extractFile(fileNameExtract);

  if (extractFile.fail()) {
    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
    exit(1);
  }
  istream &extractFileP = extractFile;

  // output file: phrase translation table
	ostream *phraseTableFile;

	if (fileNamePhraseTable == "-") {
		phraseTableFile = &cout;
	}
	else {
		Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
		bool success = outputFile->Open(fileNamePhraseTable);
		if (!success) {
			cerr << "ERROR: could not open file phrase table file "
					 << fileNamePhraseTable << endl;
			exit(1);
		}
		phraseTableFile = outputFile;
	}
	
  // loop through all extracted phrase translations
  float lastCount = 0.0f;
  float lastPcfgSum = 0.0f;
  vector< PhraseAlignment > phrasePairsWithSameF;
  bool isSingleton = true;
  int i=0;
  char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
  lastLine[0] = '\0';
  PhraseAlignment *lastPhrasePair = NULL;
  while(true) {
    if (extractFileP.eof()) break;
    if (++i % 100000 == 0) cerr << "." << flush;
    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
    if (extractFileP.eof())	break;

    // identical to last line? just add count
    if (strcmp(line,lastLine) == 0) {
      lastPhrasePair->count += lastCount;
      lastPhrasePair->pcfgSum += lastPcfgSum;
      continue;
    }
    strcpy( lastLine, line );

    // create new phrase pair
    PhraseAlignment phrasePair;
    phrasePair.create( line, i, includeSentenceIdFlag );
    lastCount = phrasePair.count;
    lastPcfgSum = phrasePair.pcfgSum;

    // only differs in count? just add count
    if (lastPhrasePair != NULL 
	&& lastPhrasePair->equals( phrasePair )
	&& (!domainFlag
	    || domain->getDomainOfSentence( lastPhrasePair->sentenceId )
	    == domain->getDomainOfSentence( phrasePair.sentenceId ) )) {
      lastPhrasePair->count += phrasePair.count;
      lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
      continue;
    }
    
    // if new source phrase, process last batch
    if (lastPhrasePair != NULL &&
        lastPhrasePair->GetSource() != phrasePair.GetSource()) {
      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
      
      phrasePairsWithSameF.clear();
      isSingleton = false;
      lastPhrasePair = NULL;
    }
    else
    {
      isSingleton = true;
    }

    // add phrase pairs to list, it's now the last one
    phrasePairsWithSameF.push_back( phrasePair );
    lastPhrasePair = &phrasePairsWithSameF.back();
  }
  processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
	
	phraseTableFile->flush();
	if (phraseTableFile != &cout) {
		delete phraseTableFile;
	}

  // output count of count statistics
  if (goodTuringFlag || kneserNeyFlag) {
    writeCountOfCounts( fileNameCountOfCounts );
  }
}
int main(int argc, char* argv[]) {
	cerr	<< "Score v2.5 written by Philipp Koehn" << endl
				<< "Modified by Ventsislav Zhechev, Autodesk Development Sàrl" << endl
				<< "scoring methods for extracted rules" << endl
	;

	if (argc < 4) {
		cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--OnlyDirect] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment file]\n";
		exit(1);
	}
	char* fileNameExtract = argv[1];
	char* fileNameLex = argv[2];
	char* fileNamePhraseTable = argv[3];
	char* fileNameWordAlignment;

	for(int i=4; i<argc; ++i) {
		if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
			inverseFlag = true;
			cerr << "using inverse mode\n";
		}
		else if (strcmp(argv[i],"--Hierarchical") == 0) {
			hierarchicalFlag = true;
			cerr << "processing hierarchical rules\n";
		}
		else if (strcmp(argv[i],"--OnlyDirect") == 0) {
			onlyDirectFlag = true;
			cerr << "outputing in correct phrase table format (no merging with inverse)\n";
		}
		else if (strcmp(argv[i],"--WordAlignment") == 0) {
			wordAlignmentFlag = true;
			fileNameWordAlignment = argv[++i];
			cerr << "outputing word alignment in file " << fileNameWordAlignment << endl;
		}
		else if (strcmp(argv[i],"--NoLex") == 0) {
			lexFlag = false;
			cerr << "not computing lexical translation score\n";
		}
		else if (strcmp(argv[i],"--GoodTuring") == 0) {
			goodTuringFlag = true;
			cerr << "using Good Turing discounting\n";
		}
		else if (strcmp(argv[i],"--LogProb") == 0) {
			logProbFlag = true;
			cerr << "using log-probabilities\n";
		}
		else if (strcmp(argv[i],"--NegLogProb") == 0) {
			logProbFlag = true;
			negLogProb = -1;
			cerr << "using negative log-probabilities\n";
		}
		else {
			cerr << "ERROR: unknown option " << argv[i] << endl;
			exit(1);
		}
	}

	// lexical translation table
	if (lexFlag)
		lexTable.load(fileNameLex);
  
	// compute count of counts for Good Turing discounting
	if (goodTuringFlag)
		computeCountOfCounts(fileNameExtract);

	// sorted phrase extraction file
	Bz2LineReader extractFile(fileNameExtract);

	// output file: phrase translation table
	Bz2LineWriter phraseTableFile(fileNamePhraseTable);

	// output word alignment file
	if (!inverseFlag && wordAlignmentFlag) {
		wordAlignmentFile.open(fileNameWordAlignment);
		if (wordAlignmentFile.fail()) {
			cerr << "ERROR: could not open word alignment file "
			     << fileNameWordAlignment << endl;
			exit(1);
		}
	}
  
  // loop through all extracted phrase translations
  int lastSource = -1;
  vector< PhraseAlignment > phrasePairsWithSameF;
  int i=0;
	string lastLine = "";
	PhraseAlignment *lastPhrasePair = NULL;
	for (string line = extractFile.readLine(); !line.empty(); line = extractFile.readLine()) {
		if (line.empty()) break;
		if ((++i)%10000000 == 0) cerr << "[p. score:" << i << "]" << flush;
    else if (i % 100000 == 0) cerr << "." << flush;
		
		// identical to last line? just add count
		if (lastSource >= 0 && line == lastLine) {
			lastPhrasePair->addToCount(line);
			continue;
		}
		lastLine = line;

		// create new phrase pair
		PhraseAlignment phrasePair;
		vector<string> lineVector = tokenize(line.c_str());
		phrasePair.create(lineVector, i);
		
		// only differs in count? just add count
		if (lastPhrasePair != NULL && lastPhrasePair->equals(phrasePair)) {
			lastPhrasePair->count += phrasePair.count;
			phrasePair.clear();
			continue;
		}
		
		// if new source phrase, process last batch
		if (lastSource >= 0 && lastSource != phrasePair.source) {
			processPhrasePairs(phrasePairsWithSameF, phraseTableFile);
			for (size_t j=0; j<phrasePairsWithSameF.size(); phrasePairsWithSameF[j++].clear());
			phrasePairsWithSameF.clear();
			phraseTableT.clear();
			phraseTableS.clear();
			// process line again, since phrase tables flushed
			phrasePair.clear();
			phrasePair.create(lineVector, i);
		}
		
		// add phrase pairs to list, it's now the last one
		lastSource = phrasePair.source;
		phrasePairsWithSameF.push_back(phrasePair);
		lastPhrasePair = &phrasePairsWithSameF[phrasePairsWithSameF.size()-1];
	}
	processPhrasePairs(phrasePairsWithSameF, phraseTableFile);
	if (!inverseFlag && wordAlignmentFlag)
		wordAlignmentFile.close();
}
void computeCountOfCounts(const string& fileNameExtract) {
	if (fileNameExtract == "-") {
		cerr << "The ‘GoodTuring Discounting’ option may not be used with piped input!" << endl;
		exit(9);
	}

	cerr << "computing counts of counts";
	for (size_t i=1; i<=GT_MAX; countOfCounts[i++] = 0);

	Bz2LineReader extractFile(fileNameExtract);

	// loop through all extracted phrase translations
	int i=0;
	string lastLine;
	PhraseAlignment *lastPhrasePair = NULL;

	for (string line = extractFile.readLine(); !line.empty(); line = extractFile.readLine()) {
		if (line.empty()) break;
		if ((++i)%10000000 == 0) cerr << "[" << i << "]" << endl;
    else if (i % 100000 == 0) cerr << "," << flush;
		
		// identical to last line? just add count
		if (line == lastLine) {
			lastPhrasePair->addToCount(line);
			continue;			
		}
		lastLine = line;

		// create new phrase pair
		PhraseAlignment *phrasePair = new PhraseAlignment();
		vector<string> lineVector = tokenize(line.c_str());
		phrasePair->create(lineVector, i);
		
		if (i == 1) {
			lastPhrasePair = phrasePair;
			continue;
		}

		// only differs in count? just add count
		if (lastPhrasePair->match( *phrasePair )) {
			lastPhrasePair->count += phrasePair->count;
			phrasePair->clear();
			delete(phrasePair);
			continue;
		}

		// periodically house cleaning
		if (phrasePair->source != lastPhrasePair->source) {
			phraseTableT.clear(); // these would get too big
			phraseTableS.clear(); // these would get too big
			// process line again, since phrase tables flushed
			phrasePair->clear();
			phrasePair->create(lineVector, i); 
		}

		int count = lastPhrasePair->count + 0.99999;
		if(count <= GT_MAX)
			++countOfCounts[ count ];
		lastPhrasePair->clear();
		delete( lastPhrasePair );
		lastPhrasePair = phrasePair;
	}

	discountFactor[0] = 0.01; // floor
	cerr << "\n";
	for(int i=1;i<GT_MAX; ++i) {
		discountFactor[i] = ((float)i+1)/(float)i*(((float)countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1));
		cerr << "count " << i << ": " << countOfCounts[ i ] << ", discount factor: " << discountFactor[i];
		// some smoothing...
		if (discountFactor[i]>1) 
			discountFactor[i] = 1;
		if (discountFactor[i]<discountFactor[i-1])
			discountFactor[i] = discountFactor[i-1];
		cerr << " -> " << discountFactor[i]*i << endl;
	}
}