int main(int argc, char* argv[]) { cerr << "PhraseStatistics v1.1 written by Nicola Bertoldi\n" << "modifying PhraseScore v1.4 written by Philipp Koehn\n" << "It computes statistics for extracted phrase pairs\n" << "if (direct):\n" << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(src_phrase) length(src_phrase) length(trg_phrase)\n" << "if (inverse)\n" << "src_phrase ||| trg_phrase || freq(src_phrase, trg_phrase) freq(trg_phrase) length(src_phrase) length(trg_phrase)\n"; if (argc != 4 && argc != 5) { cerr << "syntax: statistics extract lex phrase-table [inverse]\n"; exit(1); } char* &fileNameExtract = argv[1]; char* &fileNameLex = argv[2]; char* &fileNamePhraseTable = argv[3]; inverseFlag = false; if (argc > 4) { inverseFlag = true; cerr << "using inverse mode\n"; } // lexical translation table lexTable.load( fileNameLex ); // sorted phrase extraction file Moses::InputFileStream extractFile(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table phraseTableFile.open(fileNamePhraseTable); if (phraseTableFile.fail()) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } // loop through all extracted phrase translations int lastForeign = -1; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; string line; while(getline(extractFileP, line)) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; PhraseAlignment phrasePair; bool isPhrasePair = phrasePair.create( line.c_str(), i ); if (lastForeign >= 0 && lastForeign != phrasePair.foreign) { processPhrasePairs( phrasePairsWithSameF ); for(size_t j=0; j<phrasePairsWithSameF.size(); j++) phrasePairsWithSameF[j].clear(); phrasePairsWithSameF.clear(); phraseTableE.clear(); phraseTableF.clear(); phrasePair.clear(); // process line again, since phrase tables flushed phrasePair.create( line.c_str(), i ); phrasePairBase = 0; } lastForeign = phrasePair.foreign; if (isPhrasePair) phrasePairsWithSameF.push_back( phrasePair ); else phrasePairBase++; } processPhrasePairs( phrasePairsWithSameF ); phraseTableFile.close(); }
int main(int argc, char* argv[]) { cerr << "Score v2.0 written by Philipp Koehn\n" << "scoring methods for extracted rules\n"; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--OnlyDirect] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment file]\n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; for(int i=4;i<argc;i++) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--OnlyDirect") == 0) { onlyDirectFlag = true; cerr << "outputing in correct phrase table format (no merging with inverse)\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; cerr << "using Good Turing discounting\n"; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // lexical translation table if (lexFlag) lexTable.load( fileNameLex ); // compute count of counts for Good Turing discounting if (goodTuringFlag) computeCountOfCounts( fileNameExtract ); // sorted phrase extraction file ifstream extractFile; extractFile.open(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table phraseTableFile.open(fileNamePhraseTable); if (phraseTableFile.fail()) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } // loop through all extracted phrase translations int lastSource = -1; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (lastSource > 0 && strcmp(line,lastLine) == 0) { lastPhrasePair->addToCount( line ); continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment phrasePair; phrasePair.create( line, i ); // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair )) { lastPhrasePair->count += phrasePair.count; phrasePair.clear(); continue; } // if new source phrase, process last batch if (lastSource >= 0 && lastSource != phrasePair.GetSource()) { processPhrasePairs( phrasePairsWithSameF ); for(int j=0;j<phrasePairsWithSameF.size();j++) phrasePairsWithSameF[j].clear(); phrasePairsWithSameF.clear(); phraseTableT.clear(); phraseTableS.clear(); // process line again, since phrase tables flushed phrasePair.clear(); phrasePair.create( line, i ); } // add phrase pairs to list, it's now the last one lastSource = phrasePair.GetSource(); phrasePairsWithSameF.push_back( phrasePair ); lastPhrasePair = &phrasePairsWithSameF[phrasePairsWithSameF.size()-1]; } processPhrasePairs( phrasePairsWithSameF ); phraseTableFile.close(); }
int main(int argc, char* argv[]) { cerr << "Score v2.5 written by Philipp Koehn" << endl << "Modified by Ventsislav Zhechev, Autodesk Development Sàrl" << endl << "scoring methods for extracted rules" << endl ; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--OnlyDirect] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--WordAlignment file]\n"; exit(1); } char* fileNameExtract = argv[1]; char* fileNameLex = argv[2]; char* fileNamePhraseTable = argv[3]; char* fileNameWordAlignment; for(int i=4; i<argc; ++i) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--OnlyDirect") == 0) { onlyDirectFlag = true; cerr << "outputing in correct phrase table format (no merging with inverse)\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; fileNameWordAlignment = argv[++i]; cerr << "outputing word alignment in file " << fileNameWordAlignment << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; cerr << "using Good Turing discounting\n"; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // lexical translation table if (lexFlag) lexTable.load(fileNameLex); // compute count of counts for Good Turing discounting if (goodTuringFlag) computeCountOfCounts(fileNameExtract); // sorted phrase extraction file Bz2LineReader extractFile(fileNameExtract); // output file: phrase translation table Bz2LineWriter phraseTableFile(fileNamePhraseTable); // output word alignment file if (!inverseFlag && wordAlignmentFlag) { wordAlignmentFile.open(fileNameWordAlignment); if (wordAlignmentFile.fail()) { cerr << "ERROR: could not open word alignment file " << fileNameWordAlignment << endl; exit(1); } } // loop through all extracted phrase translations int lastSource = -1; vector< PhraseAlignment > phrasePairsWithSameF; int i=0; string lastLine = ""; PhraseAlignment *lastPhrasePair = NULL; for (string line = extractFile.readLine(); !line.empty(); line = extractFile.readLine()) { if (line.empty()) break; if ((++i)%10000000 == 0) cerr << "[p. score:" << i << "]" << flush; else if (i % 100000 == 0) cerr << "." << flush; // identical to last line? just add count if (lastSource >= 0 && line == lastLine) { lastPhrasePair->addToCount(line); continue; } lastLine = line; // create new phrase pair PhraseAlignment phrasePair; vector<string> lineVector = tokenize(line.c_str()); phrasePair.create(lineVector, i); // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals(phrasePair)) { lastPhrasePair->count += phrasePair.count; phrasePair.clear(); continue; } // if new source phrase, process last batch if (lastSource >= 0 && lastSource != phrasePair.source) { processPhrasePairs(phrasePairsWithSameF, phraseTableFile); for (size_t j=0; j<phrasePairsWithSameF.size(); phrasePairsWithSameF[j++].clear()); phrasePairsWithSameF.clear(); phraseTableT.clear(); phraseTableS.clear(); // process line again, since phrase tables flushed phrasePair.clear(); phrasePair.create(lineVector, i); } // add phrase pairs to list, it's now the last one lastSource = phrasePair.source; phrasePairsWithSameF.push_back(phrasePair); lastPhrasePair = &phrasePairsWithSameF[phrasePairsWithSameF.size()-1]; } processPhrasePairs(phrasePairsWithSameF, phraseTableFile); if (!inverseFlag && wordAlignmentFlag) wordAlignmentFile.close(); }
void computeCountOfCounts( char* fileNameExtract ) { cerr << "computing counts of counts"; for(int i=1;i<=GT_MAX;i++) countOfCounts[i] = 0; ifstream extractFile; extractFile.open( fileNameExtract ); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // loop through all extracted phrase translations int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->addToCount( line ); continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment *phrasePair = new PhraseAlignment(); phrasePair->create( line, i ); if (i == 1) { lastPhrasePair = phrasePair; continue; } // only differs in count? just add count if (lastPhrasePair->match( *phrasePair )) { lastPhrasePair->count += phrasePair->count; phrasePair->clear(); delete(phrasePair); continue; } // periodically house cleaning if (phrasePair->GetSource() != lastPhrasePair->GetSource()) { phraseTableT.clear(); // these would get too big phraseTableS.clear(); // these would get too big // process line again, since phrase tables flushed phrasePair->clear(); phrasePair->create( line, i ); } int count = lastPhrasePair->count + 0.99999; if(count <= GT_MAX) countOfCounts[ count ]++; lastPhrasePair->clear(); delete( lastPhrasePair ); lastPhrasePair = phrasePair; } delete lastPhrasePair; discountFactor[0] = 0.01; // floor cerr << "\n"; for(int i=1;i<GT_MAX;i++) { discountFactor[i] = ((float)i+1)/(float)i*(((float)countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)); cerr << "count " << i << ": " << countOfCounts[ i ] << ", discount factor: " << discountFactor[i]; // some smoothing... if (discountFactor[i]>1) discountFactor[i] = 1; if (discountFactor[i]<discountFactor[i-1]) discountFactor[i] = discountFactor[i-1]; cerr << " -> " << discountFactor[i]*i << endl; } }
void computeCountOfCounts(const string& fileNameExtract) { if (fileNameExtract == "-") { cerr << "The ‘GoodTuring Discounting’ option may not be used with piped input!" << endl; exit(9); } cerr << "computing counts of counts"; for (size_t i=1; i<=GT_MAX; countOfCounts[i++] = 0); Bz2LineReader extractFile(fileNameExtract); // loop through all extracted phrase translations int i=0; string lastLine; PhraseAlignment *lastPhrasePair = NULL; for (string line = extractFile.readLine(); !line.empty(); line = extractFile.readLine()) { if (line.empty()) break; if ((++i)%10000000 == 0) cerr << "[" << i << "]" << endl; else if (i % 100000 == 0) cerr << "," << flush; // identical to last line? just add count if (line == lastLine) { lastPhrasePair->addToCount(line); continue; } lastLine = line; // create new phrase pair PhraseAlignment *phrasePair = new PhraseAlignment(); vector<string> lineVector = tokenize(line.c_str()); phrasePair->create(lineVector, i); if (i == 1) { lastPhrasePair = phrasePair; continue; } // only differs in count? just add count if (lastPhrasePair->match( *phrasePair )) { lastPhrasePair->count += phrasePair->count; phrasePair->clear(); delete(phrasePair); continue; } // periodically house cleaning if (phrasePair->source != lastPhrasePair->source) { phraseTableT.clear(); // these would get too big phraseTableS.clear(); // these would get too big // process line again, since phrase tables flushed phrasePair->clear(); phrasePair->create(lineVector, i); } int count = lastPhrasePair->count + 0.99999; if(count <= GT_MAX) ++countOfCounts[ count ]; lastPhrasePair->clear(); delete( lastPhrasePair ); lastPhrasePair = phrasePair; } discountFactor[0] = 0.01; // floor cerr << "\n"; for(int i=1;i<GT_MAX; ++i) { discountFactor[i] = ((float)i+1)/(float)i*(((float)countOfCounts[i+1]+0.1) / ((float)countOfCounts[i]+0.1)); cerr << "count " << i << ": " << countOfCounts[ i ] << ", discount factor: " << discountFactor[i]; // some smoothing... if (discountFactor[i]>1) discountFactor[i] = 1; if (discountFactor[i]<discountFactor[i-1]) discountFactor[i] = discountFactor[i-1]; cerr << " -> " << discountFactor[i]*i << endl; } }