void writeCountOfCounts( const string &fileNameCountOfCounts ) { // open file Moses::OutputFileStream countOfCountsFile; bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str()); if (!success) { cerr << "ERROR: could not open count-of-counts file " << fileNameCountOfCounts << endl; return; } // Kneser-Ney needs the total number of phrase pairs countOfCountsFile << totalDistinct << endl; // write out counts for(int i=1; i<=COC_MAX; i++) { countOfCountsFile << countOfCounts[ i ] << endl; } countOfCountsFile.Close(); }
// Consolidate the two half phrase tables — "direct" (target|source, with
// alignments) and "indirect" (source|target) — into a single phrase table,
// applying optional Good-Turing / Kneser-Ney count smoothing and emitting
// extra features/properties controlled by the file-level flag globals.
// Assumes both input tables list phrase pairs in the same order; this is
// only enforced by the per-line consistency checks below.
void processFiles( const std::string& fileNameDirect,
                   const std::string& fileNameIndirect,
                   const std::string& fileNameConsolidated,
                   const std::string& fileNameCountOfCounts,
                   const std::string& fileNameSourceLabelSet,
                   const std::string& fileNamePartsOfSpeechVocabulary )
{
  if (goodTuringFlag || kneserNeyFlag)
    loadCountOfCounts( fileNameCountOfCounts );

  // open input files
  Moses::InputFileStream fileDirect(fileNameDirect);
  UTIL_THROW_IF2(fileDirect.fail(), "could not open phrase table file " << fileNameDirect);
  Moses::InputFileStream fileIndirect(fileNameIndirect);
  UTIL_THROW_IF2(fileIndirect.fail(), "could not open phrase table file " << fileNameIndirect);

  // open output file: consolidated phrase table
  Moses::OutputFileStream fileConsolidated;
  bool success = fileConsolidated.Open(fileNameConsolidated);
  UTIL_THROW_IF2(!success, "could not open output file " << fileNameConsolidated);

  // create properties consolidator
  // (in case any additional phrase property requires further processing)
  MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
  if (sourceLabelsFlag) {
    propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
  }
  if (partsOfSpeechFlag) {
    propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary);
  }

  // loop through all extracted phrase translations
  int i=0;
  while(true) {
    i++;
    if (i%100000 == 0) std::cerr << "." << std::flush;  // progress tick

    std::vector< std::string > itemDirect, itemIndirect;
    // stop as soon as either table is exhausted
    if (! getLine(fileIndirect, itemIndirect) ||
        ! getLine(fileDirect, itemDirect))
      break;

    // direct: target source alignment probabilities
    // indirect: source target probabilities

    // consistency checks: the two tables must present the same pair on the same line
    UTIL_THROW_IF2(itemDirect[0].compare( itemIndirect[0] ) != 0,
                   "target phrase does not match in line " << i << ": '"
                   << itemDirect[0] << "' != '" << itemIndirect[0] << "'");
    UTIL_THROW_IF2(itemDirect[1].compare( itemIndirect[1] ) != 0,
                   "source phrase does not match in line " << i << ": '"
                   << itemDirect[1] << "' != '" << itemIndirect[1] << "'");

    // SCORES ... split field 3 into core (dense) scores and sparse-feature text
    std::string directScores, directSparseScores, indirectScores, indirectSparseScores;
    breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
    breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );

    // field 4 carries the raw counts: count(source), count(target), count(pair)
    std::vector<std::string> directCounts;
    Moses::Tokenize( directCounts, itemDirect[4] );
    std::vector<std::string> indirectCounts;
    Moses::Tokenize( indirectCounts, itemIndirect[4] );
    float countF = Moses::Scan<float>(directCounts[0]);
    float countE = Moses::Scan<float>(indirectCounts[0]);
    float countEF = Moses::Scan<float>(indirectCounts[1]);
    // n1_*: number-of-distinct-translations column, only present (and only
    // read) when the tables were built for Kneser-Ney; left uninitialized
    // otherwise — never used in that case
    float n1_F, n1_E;
    if (kneserNeyFlag) {
      n1_F = Moses::Scan<float>(directCounts[2]);
      n1_E = Moses::Scan<float>(indirectCounts[2]);
    }

    // Good Turing discounting: scale small pair counts by the precomputed
    // discount table (the +0.99999/+0.99998 nudges round a float count up
    // to its integer bucket while avoiding exact-boundary issues)
    float adjustedCountEF = countEF;
    if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
      adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
    float adjustedCountEF_indirect = adjustedCountEF;

    // Kneser Ney discounting [Foster et al, 2006]
    if (kneserNeyFlag) {
      // absolute discount D depends on the pair count bucket (1, 2, 3+)
      float D = kneserNey_D3;
      if (countEF < 2) D = kneserNey_D1;
      else if (countEF < 3) D = kneserNey_D2;
      if (D > countEF) D = countEF - 0.01; // sanity constraint

      float p_b_E = n1_E / totalCount;   // target phrase prob based on distinct
      float alpha_F = D * n1_F / countF; // available mass
      adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;

      // for indirect
      float p_b_F = n1_F / totalCount;   // target phrase prob based on distinct
      float alpha_E = D * n1_E / countE; // available mass
      adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
    }

    // drop due to MinScore thresholding
    if ((minScore0 > 0 && adjustedCountEF_indirect/countE < minScore0) ||
        (minScore2 > 0 && adjustedCountEF /countF < minScore2)) {
      continue;
    }

    // output phrase pair: source field first
    fileConsolidated << itemDirect[0] << " ||| ";
    if (partsOfSpeechFlag) {
      // write POS factor from property: annotate each target terminal as word|POS
      std::vector<std::string> targetTokens;
      Moses::Tokenize( targetTokens, itemDirect[1] );
      std::vector<std::string> propertyValuePOS;
      propertiesConsolidator.GetPOSPropertyValueFromPropertiesString(itemDirect[5], propertyValuePOS);
      size_t targetTerminalIndex = 0;
      for (std::vector<std::string>::const_iterator targetTokensIt=targetTokens.begin();
           targetTokensIt!=targetTokens.end(); ++targetTokensIt) {
        fileConsolidated << *targetTokensIt;
        if (!isNonTerminal(*targetTokensIt)) {
          // only terminals consume a POS entry; non-terminals pass through unchanged
          assert(propertyValuePOS.size() > targetTerminalIndex);
          fileConsolidated << "|" << propertyValuePOS[targetTerminalIndex];
          ++targetTerminalIndex;
        }
        fileConsolidated << " ";
      }
      fileConsolidated << "|||";
    } else {
      fileConsolidated << itemDirect[1] << " |||";
    }

    // prob indirect
    if (!onlyDirectFlag) {
      fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
      fileConsolidated << " " << indirectScores;
    }

    // prob direct
    fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
    fileConsolidated << " " << directScores;

    // phrase count feature (constant e, so its log is exactly 1)
    if (phraseCountFlag) {
      fileConsolidated << " " << maybeLogProb(2.718);
    }

    // low count feature: penalizes rare pairs, -> 1 as countEF grows
    if (lowCountFlag) {
      fileConsolidated << " " << maybeLogProb(std::exp(-1.0/countEF));
    }

    // count bin feature (as a core feature): one indicator slot per bin,
    // plus a final slot for "above the largest bin"
    if (countBin.size()>0 && !sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " " << maybeLogProb(2.718);
          foundBin = true;
        } else {
          fileConsolidated << " " << maybeLogProb(1);
        }
      }
      fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 );
    }

    // alignment (may legitimately be empty)
    fileConsolidated << " |||";
    if (!itemDirect[2].empty()) {
      fileConsolidated << " " << itemDirect[2];; // (stray extra ';' is a no-op)
    }

    // counts, for debugging
    fileConsolidated << " ||| " << countE << " " << countF << " " << countEF;

    // sparse features
    fileConsolidated << " |||";
    if (directSparseScores.compare("") != 0)
      fileConsolidated << " " << directSparseScores;
    if (indirectSparseScores.compare("") != 0)
      fileConsolidated << " " << indirectSparseScores;

    // count bin feature (as a sparse feature): emit a named bin indicator,
    // with the name encoding the bin's count range
    if (sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " cb_";
          if (i == 0 && countBin[i] > 1)
            fileConsolidated << "1_";
          else if (i > 0 && countBin[i-1]+1 < countBin[i])
            fileConsolidated << (countBin[i-1]+1) << "_";
          fileConsolidated << countBin[i] << " 1";
          foundBin = true;
        }
      }
      if (!foundBin) {
        fileConsolidated << " cb_max 1";
      }
    }

    // arbitrary key-value pairs (field 5 properties, if present)
    fileConsolidated << " |||";
    if (itemDirect.size() >= 6) {
      propertiesConsolidator.ProcessPropertiesString(itemDirect[5], fileConsolidated);
    }
    if (countsProperty) {
      fileConsolidated << " {{Counts " << countE << " " << countF << " " << countEF << "}}";
    }

    fileConsolidated << std::endl;
  }

  fileDirect.Close();
  fileIndirect.Close();
  fileConsolidated.Close();
}
// Older char*-based variant of processFiles: consolidates the direct
// (target|source + alignment) and indirect (source|target) half phrase
// tables into one table, with optional Good-Turing / Kneser-Ney smoothing.
// Error handling is cerr + exit(1) rather than exceptions.
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
{
  if (goodTuringFlag || kneserNeyFlag)
    loadCountOfCounts( fileNameCountOfCounts );

  // open input files
  Moses::InputFileStream fileDirect(fileNameDirect);
  Moses::InputFileStream fileIndirect(fileNameIndirect);

  if (fileDirect.fail()) {
    cerr << "ERROR: could not open phrase table file " << fileNameDirect << endl;
    exit(1);
  }
  istream &fileDirectP = fileDirect;

  if (fileIndirect.fail()) {
    cerr << "ERROR: could not open phrase table file " << fileNameIndirect << endl;
    exit(1);
  }
  istream &fileIndirectP = fileIndirect;

  // open output file: consolidated phrase table
  Moses::OutputFileStream fileConsolidated;
  bool success = fileConsolidated.Open(fileNameConsolidated);
  if (!success) {
    cerr << "ERROR: could not open output file " << fileNameConsolidated << endl;
    exit(1);
  }

  // loop through all extracted phrase translations
  int i=0;
  while(true) {
    i++;
    if (i%100000 == 0) cerr << "." << flush;  // progress tick

    vector< string > itemDirect, itemIndirect;
    // stop as soon as either table is exhausted
    if (! getLine(fileIndirectP,itemIndirect) ||
        ! getLine(fileDirectP, itemDirect ))
      break;

    // direct: target source alignment probabilities
    // indirect: source target probabilities

    // consistency checks: both tables must present the same pair per line
    if (itemDirect[0].compare( itemIndirect[0] ) != 0) {
      cerr << "ERROR: target phrase does not match in line " << i << ": '"
           << itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl;
      exit(1);
    }
    if (itemDirect[1].compare( itemIndirect[1] ) != 0) {
      cerr << "ERROR: source phrase does not match in line " << i << ": '"
           << itemDirect[1] << "' != '" << itemIndirect[1] << "'" << endl;
      exit(1);
    }

    // output hierarchical phrase pair (with separated labels)
    fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1] << " |||";

    // SCORES ... split field 3 into core (dense) scores and sparse-feature text
    string directScores, directSparseScores, indirectScores, indirectSparseScores;
    breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
    breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );

    // field 4 carries the raw counts: count(source), count(target), count(pair)
    vector<string> directCounts = tokenize(itemDirect[4].c_str());
    vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
    float countF = atof(directCounts[0].c_str());
    float countE = atof(indirectCounts[0].c_str());
    float countEF = atof(indirectCounts[1].c_str());
    // n1_*: distinct-translation counts; only present (and only read) when
    // kneserNeyFlag is set — deliberately left uninitialized otherwise
    float n1_F, n1_E;
    if (kneserNeyFlag) {
      n1_F = atof(directCounts[2].c_str());
      n1_E = atof(indirectCounts[2].c_str());
    }

    // Good Turing discounting: scale small pair counts via the precomputed
    // discount table (+0.99999/+0.99998 nudges a float count to its bucket)
    float adjustedCountEF = countEF;
    if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
      adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
    float adjustedCountEF_indirect = adjustedCountEF;

    // Kneser Ney discounting [Foster et al, 2006]
    if (kneserNeyFlag) {
      // absolute discount D depends on the pair count bucket (1, 2, 3+)
      float D = kneserNey_D3;
      if (countEF < 2) D = kneserNey_D1;
      else if (countEF < 3) D = kneserNey_D2;
      if (D > countEF) D = countEF - 0.01; // sanity constraint

      float p_b_E = n1_E / totalCount;   // target phrase prob based on distinct
      float alpha_F = D * n1_F / countF; // available mass
      adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;

      // for indirect
      float p_b_F = n1_F / totalCount;   // target phrase prob based on distinct
      float alpha_E = D * n1_E / countE; // available mass
      adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
    }

    // prob indirect
    if (!onlyDirectFlag) {
      fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
      fileConsolidated << " " << indirectScores;
    }

    // prob direct
    fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
    fileConsolidated << " " << directScores;

    // phrase count feature (constant e, so its log is exactly 1)
    if (phraseCountFlag) {
      fileConsolidated << " " << maybeLogProb(2.718);
    }

    // low count feature: penalizes rare pairs, -> 1 as countEF grows
    if (lowCountFlag) {
      fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
    }

    // count bin feature (as a core feature): one indicator slot per bin,
    // plus a final slot for "above the largest bin"
    if (countBin.size()>0 && !sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " " << maybeLogProb(2.718);
          foundBin = true;
        } else {
          fileConsolidated << " " << maybeLogProb(1);
        }
      }
      fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 );
    }

    // alignment
    fileConsolidated << " ||| " << itemDirect[2];

    // counts, for debugging
    // NOTE(review): no space between the alignment field and this "|||"
    // separator — verify downstream tools tolerate "<align>||| counts"
    fileConsolidated << "||| " << countE << " " << countF << " " << countEF;

    // count bin feature (as a sparse feature)
    fileConsolidated << " |||";
    if (directSparseScores.compare("") != 0)
      fileConsolidated << " " << directSparseScores;
    if (indirectSparseScores.compare("") != 0)
      fileConsolidated << " " << indirectSparseScores;

    if (sparseCountBinFeatureFlag) {
      // emit a named bin indicator; the name encodes the bin's count range
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " cb_";
          if (i == 0 && countBin[i] > 1)
            fileConsolidated << "1_";
          else if (i > 0 && countBin[i-1]+1 < countBin[i])
            fileConsolidated << (countBin[i-1]+1) << "_";
          fileConsolidated << countBin[i] << " 1";
          foundBin = true;
        }
      }
      if (!foundBin) {
        fileConsolidated << " cb_max 1";
      }
    }

    // arbitrary key-value pairs (field 5 properties, if present)
    if (itemDirect.size() >= 6) {
      fileConsolidated << " ||| " << itemDirect[5];
    }

    fileConsolidated << endl;
  }

  fileDirect.Close();
  fileIndirect.Close();
  fileConsolidated.Close();
}
// extract-rules driver: parses command-line options into a
// RuleExtractionOptions, then streams the target/source/alignment corpora
// line-by-line, extracting hierarchical rules per sentence pair and
// optionally writing a glue grammar and unknown-word labels at the end.
int main(int argc, char* argv[])
{
  cerr << "extract-rules, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";

  RuleExtractionOptions options;
  int sentenceOffset = 0;
#ifdef WITH_THREADS
  int thread_count = 1;
#endif
  if (argc < 5) {
    // usage message shows each numeric option's compiled-in default in [..]
    cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
         << " --GlueGrammar FILE"
         << " | --UnknownWordLabel FILE"
         << " | --OnlyDirect"
         << " | --MaxSpan[" << options.maxSpan << "]"
         << " | --MinHoleTarget[" << options.minHoleTarget << "]"
         << " | --MinHoleSource[" << options.minHoleSource << "]"
         << " | --MinWords[" << options.minWords << "]"
         << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]"
         << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]"
         << " | --MaxNonTerm[" << options.maxNonTerm << "]"
         << " | --MaxScope[" << options.maxScope << "]"
         << " | --SourceSyntax | --TargetSyntax"
         << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting"
         << " | --UnpairedExtractFormat"
         << " | --ConditionOnTargetLHS ]"
         << " | --BoundaryRules[" << options.boundaryRules << "]"
         << " | --FlexibilityScore\n";
    exit(1);
  }
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
  string fileNameGlueGrammar;
  string fileNameUnknownWordLabel;
  string fileNameExtract = string(argv[4]);

  int optionInd = 5;

  // option parsing: value-taking options consume their argument via ++i
  for(int i=optionInd; i<argc; i++) {
    // maximum span length
    if (strcmp(argv[i],"--MaxSpan") == 0) {
      options.maxSpan = atoi(argv[++i]);
      if (options.maxSpan < 1) {
        cerr << "extract error: --maxSpan should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleTarget") == 0) {
      options.minHoleTarget = atoi(argv[++i]);
      if (options.minHoleTarget < 1) {
        cerr << "extract error: --minHoleTarget should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleSource") == 0) {
      options.minHoleSource = atoi(argv[++i]);
      if (options.minHoleSource < 1) {
        cerr << "extract error: --minHoleSource should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) {
      options.maxSymbolsTarget = atoi(argv[++i]);
      if (options.maxSymbolsTarget < 1) {
        cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
      options.maxSymbolsSource = atoi(argv[++i]);
      if (options.maxSymbolsSource < 1) {
        cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
        exit(1);
      }
    }
    // minimum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MinWords") == 0) {
      options.minWords = atoi(argv[++i]);
      if (options.minWords < 0) {
        cerr << "extract error: --MinWords should be at least 0" << endl;
        exit(1);
      }
    }
    // maximum number of non-terminals
    else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
      options.maxNonTerm = atoi(argv[++i]);
      if (options.maxNonTerm < 1) {
        cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum scope (see Hopkins and Langmead (2010))
    else if (strcmp(argv[i],"--MaxScope") == 0) {
      options.maxScope = atoi(argv[++i]);
      if (options.maxScope < 0) {
        cerr << "extract error: --MaxScope should be at least 0" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.gzOutput = true;
    }
    // allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      options.targetSyntax = true;
    } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      options.sourceSyntax = true;
    } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
      options.requireAlignedWord = false;
    } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) {
      options.nonTermConsecTarget = false;
    } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) {
      options.nonTermConsecSource = true;
    } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) {
      options.nonTermFirstWord = false;
    } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      options.onlyOutputSpanInfo = true;
    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
      options.onlyDirectFlag = true;
    } else if (strcmp(argv[i],"--GlueGrammar") == 0) {
      options.glueGrammarFlag = true;
      if (++i >= argc) {
        // NOTE(review): exits with status 0 on a usage error — verify
        // whether exit(1) was intended here
        cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
        exit(0);
      }
      fileNameGlueGrammar = string(argv[i]);
      cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
      options.unknownWordLabelFlag = true;
      if (++i >= argc) {
        // NOTE(review): same exit(0)-on-error pattern as --GlueGrammar
        cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
        exit(0);
      }
      fileNameUnknownWordLabel = string(argv[i]);
      cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
    }
    // TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    //  zipFiles = true;
    //}
    // if an source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
      options.fractionalCounting = false;
    } else if (strcmp(argv[i],"--PCFG") == 0) {
      options.pcfgScore = true;
    } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
      options.unpairedExtractFormat = true;
    } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
      options.conditionOnTargetLhs = true;
    } else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
      options.flexScoreFlag = true;
    } else if (strcmp(argv[i],"-threads") == 0 ||
               strcmp(argv[i],"--threads") == 0 ||
               strcmp(argv[i],"--Threads") == 0) {
#ifdef WITH_THREADS
      thread_count = atoi(argv[++i]);
#else
      cerr << "thread support not compiled in." << '\n';
      exit(1);
#endif
    } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
      // requires a numeric argument (checked by first character only)
      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
        exit(1);
      }
      sentenceOffset = atoi(argv[++i]);
    } else if (strcmp(argv[i],"--BoundaryRules") == 0) {
      options.boundaryRules = true;
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  cerr << "extracting hierarchical rules" << endl;

  // open input files
  Moses::InputFileStream tFile(fileNameT);
  Moses::InputFileStream sFile(fileNameS);
  Moses::InputFileStream aFile(fileNameA);

  istream *tFileP = &tFile;
  istream *sFileP = &sFile;
  istream *aFileP = &aFile;

  // open output files (".inv" holds the inverse-direction rules;
  // context files are only needed for the flexibility score)
  string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":"");
  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  Moses::OutputFileStream extractFileContext;
  Moses::OutputFileStream extractFileContextInv;
  extractFile.Open((fileNameExtract + (options.gzOutput?".gz":"")).c_str());
  if (!options.onlyDirectFlag)
    extractFileInv.Open(fileNameExtractInv.c_str());
  if (options.flexScoreFlag) {
    string fileNameExtractContext = fileNameExtract + ".context" + (options.gzOutput?".gz":"");
    extractFileContext.Open(fileNameExtractContext.c_str());
    if (!options.onlyDirectFlag) {
      string fileNameExtractContextInv = fileNameExtract + ".context.inv" + (options.gzOutput?".gz":"");
      extractFileContextInv.Open(fileNameExtractContextInv.c_str());
    }
  }

  // stats on labels for glue grammar and unknown word label probabilities
  set< string > targetLabelCollection, sourceLabelCollection;
  map< string, int > targetTopLabelCollection, sourceTopLabelCollection;

  // loop through all sentence pairs; the three files are read in lockstep,
  // one line per sentence (target drives the loop)
  size_t i=sentenceOffset;
  string targetString, sourceString, alignmentString;
  while(getline(*tFileP, targetString)) {
    i++;
    getline(*sFileP, sourceString);
    getline(*aFileP, alignmentString);
    if (i%1000 == 0) cerr << i << " " << flush;  // progress counter

    SentenceAlignmentWithSyntax sentence
    (targetLabelCollection, sourceLabelCollection,
     targetTopLabelCollection, sourceTopLabelCollection, options);
    //az: output src, tgt, and alingment line
    if (options.onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << sourceString << endl;
      cout << "LOG: TGT: " << targetString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

    if (sentence.create(targetString.c_str(), sourceString.c_str(), alignmentString.c_str(),"", i, options.boundaryRules)) {
      if (options.unknownWordLabelFlag) {
        collectWordLabelCounts(sentence);
      }
      // extraction runs synchronously per sentence in this variant
      ExtractTask *task = new ExtractTask(sentence, options, extractFile, extractFileInv, extractFileContext, extractFileContextInv);
      task->Run();
      delete task;
    }
    if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

  tFile.Close();
  sFile.Close();
  aFile.Close();

  // only close if we actually opened it
  if (!options.onlyOutputSpanInfo) {
    extractFile.Close();
    if (!options.onlyDirectFlag) extractFileInv.Close();
  }
  if (options.flexScoreFlag) {
    extractFileContext.Close();
    if (!options.onlyDirectFlag) extractFileContextInv.Close();
  }

  if (options.glueGrammarFlag)
    writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection);

  if (options.unknownWordLabelFlag)
    writeUnknownWordLabel(fileNameUnknownWordLabel);
}
// PhraseExtract driver: parses options (including the "--model
// [wbe|phrase|hier]-[msd|mslr|mono]" reordering-model selector), then reads
// the English/foreign/alignment corpora in lockstep and extracts phrase
// pairs per sentence via extract(). Output streams are module-level globals.
int main(int argc, char* argv[])
{
  cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
       << "phrase extraction from an aligned parallel corpus\n";

  if (argc < 6) {
    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
    exit(1);
  }
  char* &fileNameE = argv[1];
  char* &fileNameF = argv[2];
  char* &fileNameA = argv[3];
  string fileNameExtract = string(argv[4]);
  maxPhraseLength = atoi(argv[5]);

  for(int i=6; i<argc; i++) {
    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      onlyOutputSpanInfo = true;
    } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
      // bare "orientation" kept for the legacy CLI syntax
      orientationFlag = true;
    } else if (strcmp(argv[i],"--NoTTable") == 0) {
      translationFlag = false;
    } else if (strcmp(argv[i], "--SentenceId") == 0) {
      sentenceIdFlag = true;
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      gzOutput = true;
    } else if(strcmp(argv[i],"--model") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
        exit(1);
      }
      // split "name-type" in place; strtok mutates the argv string
      char* modelParams = argv[++i];
      char* modelName = strtok(modelParams, "-");
      char* modelType = strtok(NULL, "-");

      // NOTE(review): intModelType is never used below — candidate for removal
      REO_MODEL_TYPE intModelType;

      if(strcmp(modelName, "wbe") == 0) {
        // word-based extraction reordering model
        wordModel = true;
        if(strcmp(modelType, "msd") == 0)
          wordType = REO_MSD;
        else if(strcmp(modelType, "mslr") == 0)
          wordType = REO_MSLR;
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          wordType = REO_MONO;
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "phrase") == 0) {
        phraseModel = true;
        if(strcmp(modelType, "msd") == 0)
          phraseType = REO_MSD;
        else if(strcmp(modelType, "mslr") == 0)
          phraseType = REO_MSLR;
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          phraseType = REO_MONO;
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "hier") == 0) {
        hierModel = true;
        if(strcmp(modelType, "msd") == 0)
          hierType = REO_MSD;
        else if(strcmp(modelType, "mslr") == 0)
          hierType = REO_MSLR;
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          hierType = REO_MONO;
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else {
        cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
        exit(1);
      }

      allModelsOutputFlag = true;
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  // default reordering model if no model selected
  // allows for the old syntax to be used
  if(orientationFlag && !allModelsOutputFlag) {
    wordModel = true;
    wordType = REO_MSD;
  }

  // open input files
  Moses::InputFileStream eFile(fileNameE);
  Moses::InputFileStream fFile(fileNameF);
  Moses::InputFileStream aFile(fileNameA);

  istream *eFileP = &eFile;
  istream *fFileP = &fFile;
  istream *aFileP = &aFile;

  // open output files (globals, opened only for the enabled outputs)
  if (translationFlag) {
    string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
    extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
    extractFileInv.Open(fileNameExtractInv.c_str());
  }
  if (orientationFlag) {
    string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
    extractFileOrientation.Open(fileNameExtractOrientation.c_str());
  }
  if (sentenceIdFlag) {
    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
    extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
  }

  // loop over sentence pairs; the English file drives termination
  int i=0;
  while(true) {
    i++;
    if (i%10000 == 0) cerr << "." << flush;  // progress tick
    char englishString[LINE_MAX_LENGTH];
    char foreignString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (eFileP->eof()) break;
    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
    SentenceAlignment sentence;
    // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
    //az: output src, tgt, and alingment line
    if (onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << foreignString << endl;
      cout << "LOG: TGT: " << englishString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

    if (sentence.create( englishString, foreignString, alignmentString, i)) {
      extract(sentence);
    }
    if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

  eFile.Close();
  fFile.Close();
  aFile.Close();

  //az: only close if we actually opened it
  if (!onlyOutputSpanInfo) {
    if (translationFlag) {
      extractFile.Close();
      extractFileInv.Close();
    }
    if (orientationFlag) extractFileOrientation.Close();
    if (sentenceIdFlag) {
      extractFileSentenceId.Close();
    }
  }
}
// Threaded extract-rules driver (OutputCollector variant): parses options,
// then streams the corpora and dispatches one ExtractTask per sentence —
// either run inline or submitted to a thread pool when compiled WITH_THREADS
// and more than one thread is requested.
int main(int argc, char* argv[])
{
  cerr << "extract-rules, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";

  RuleExtractionOptions options;
#ifdef WITH_THREADS
  int thread_count = 1;
#endif
  if (argc < 5) {
    // usage message shows each numeric option's compiled-in default in [..]
    cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
#ifdef WITH_THREADS
         << " --threads NUM |"
#endif
         << " --GlueGrammar FILE"
         << " | --UnknownWordLabel FILE"
         << " | --OnlyDirect"
         << " | --OutputNTLengths"
         << " | --MaxSpan[" << options.maxSpan << "]"
         << " | --MinHoleTarget[" << options.minHoleTarget << "]"
         << " | --MinHoleSource[" << options.minHoleSource << "]"
         << " | --MinWords[" << options.minWords << "]"
         << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]"
         << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]"
         << " | --MaxNonTerm[" << options.maxNonTerm << "]"
         << " | --MaxScope[" << options.maxScope << "]"
         << " | --SourceSyntax | --TargetSyntax"
         << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting ]\n";
    exit(1);
  }
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
  string fileNameGlueGrammar;
  string fileNameUnknownWordLabel;
  string fileNameExtract = string(argv[4]);

  int optionInd = 5;

  // option parsing: value-taking options consume their argument via ++i
  for(int i=optionInd; i<argc; i++) {
    // maximum span length
    if (strcmp(argv[i],"--MaxSpan") == 0) {
      options.maxSpan = atoi(argv[++i]);
      if (options.maxSpan < 1) {
        cerr << "extract error: --maxSpan should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleTarget") == 0) {
      options.minHoleTarget = atoi(argv[++i]);
      if (options.minHoleTarget < 1) {
        cerr << "extract error: --minHoleTarget should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleSource") == 0) {
      options.minHoleSource = atoi(argv[++i]);
      if (options.minHoleSource < 1) {
        cerr << "extract error: --minHoleSource should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) {
      options.maxSymbolsTarget = atoi(argv[++i]);
      if (options.maxSymbolsTarget < 1) {
        cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
      options.maxSymbolsSource = atoi(argv[++i]);
      if (options.maxSymbolsSource < 1) {
        cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
        exit(1);
      }
    }
    // minimum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MinWords") == 0) {
      options.minWords = atoi(argv[++i]);
      if (options.minWords < 0) {
        cerr << "extract error: --MinWords should be at least 0" << endl;
        exit(1);
      }
    }
    // maximum number of non-terminals
    else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
      options.maxNonTerm = atoi(argv[++i]);
      if (options.maxNonTerm < 1) {
        cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum scope (see Hopkins and Langmead (2010))
    else if (strcmp(argv[i],"--MaxScope") == 0) {
      options.maxScope = atoi(argv[++i]);
      if (options.maxScope < 0) {
        cerr << "extract error: --MaxScope should be at least 0" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.gzOutput = true;
    }
    // allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      options.targetSyntax = true;
    } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      options.sourceSyntax = true;
    } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
      options.requireAlignedWord = false;
    } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) {
      options.nonTermConsecTarget = false;
    } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) {
      options.nonTermConsecSource = true;
    } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) {
      options.nonTermFirstWord = false;
    } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      options.onlyOutputSpanInfo = true;
    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
      options.onlyDirectFlag = true;
    } else if (strcmp(argv[i],"--GlueGrammar") == 0) {
      options.glueGrammarFlag = true;
      if (++i >= argc) {
        // NOTE(review): exits with status 0 on a usage error — verify
        // whether exit(1) was intended here
        cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
        exit(0);
      }
      fileNameGlueGrammar = string(argv[i]);
      cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
      options.unknownWordLabelFlag = true;
      if (++i >= argc) {
        // NOTE(review): same exit(0)-on-error pattern as --GlueGrammar
        cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
        exit(0);
      }
      fileNameUnknownWordLabel = string(argv[i]);
      cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
    }
    // TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    //  zipFiles = true;
    //}
    // if an source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
      options.fractionalCounting = false;
    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
      options.outputNTLengths = true;
#ifdef WITH_THREADS
    } else if (strcmp(argv[i],"-threads") == 0 ||
               strcmp(argv[i],"--threads") == 0 ||
               strcmp(argv[i],"--Threads") == 0) {
      thread_count = atoi(argv[++i]);
#endif
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  cerr << "extracting hierarchical rules" << endl;

  // open input files
  Moses::InputFileStream tFile(fileNameT);
  Moses::InputFileStream sFile(fileNameS);
  Moses::InputFileStream aFile(fileNameA);

  istream *tFileP = &tFile;
  istream *sFileP = &sFile;
  istream *aFileP = &aFile;

  // open output files (".inv" holds the inverse-direction rules)
  string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":"");
  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  extractFile.Open((fileNameExtract + (options.gzOutput?".gz":"")).c_str());
  if (!options.onlyDirectFlag)
    extractFileInv.Open(fileNameExtractInv.c_str());

  // output into file
  // Collectors serialize output from concurrent tasks onto each stream.
  // NOTE(review): these two heap objects are never deleted — leak at exit;
  // harmless for a one-shot tool but worth tidying
  Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);
  Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);

  // stats on labels for glue grammar and unknown word label probabilities
  set< string > targetLabelCollection, sourceLabelCollection;
  map< string, int > targetTopLabelCollection, sourceTopLabelCollection;

#ifdef WITH_THREADS
  // set up thread pool
  Moses::ThreadPool pool(thread_count);
  pool.SetQueueLimit(1000);
#endif

  // loop through all sentence pairs; the three files are read in lockstep
  size_t i=0;
  while(true) {
    i++;
    // progress ticks at three magnitudes
    if (i%1000 == 0) cerr << "." << flush;
    if (i%10000 == 0) cerr << ":" << flush;
    if (i%100000 == 0) cerr << "!" << flush;
    char targetString[LINE_MAX_LENGTH];
    char sourceString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (tFileP->eof()) break;
    SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
    // heap-allocated because the task may outlive this loop iteration when
    // submitted to the pool; presumably the task takes ownership — TODO confirm
    SentenceAlignmentWithSyntax *sentence = new SentenceAlignmentWithSyntax
    (targetLabelCollection, sourceLabelCollection,
     targetTopLabelCollection, sourceTopLabelCollection, options);
    //az: output src, tgt, and alingment line
    if (options.onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << sourceString << endl;
      cout << "LOG: TGT: " << targetString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

    if (sentence->create(targetString, sourceString, alignmentString, i)) {
      if (options.unknownWordLabelFlag) {
        collectWordLabelCounts(*sentence);
      }
      ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector, extractCollectorInv);
#ifdef WITH_THREADS
      if (thread_count == 1) {
        // single-threaded: run inline, no pool overhead
        task->Run();
        delete task;
      } else {
        pool.Submit(task);
      }
#else
      task->Run();
      delete task;
#endif
    }
    if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

#ifdef WITH_THREADS
  // wait for all threads to finish
  pool.Stop(true);
#endif

  tFile.Close();
  sFile.Close();
  aFile.Close();

  // only close if we actually opened it
  if (!options.onlyOutputSpanInfo) {
    extractFile.Close();
    if (!options.onlyDirectFlag) extractFileInv.Close();
  }

  if (options.glueGrammarFlag)
    writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection);

  if (options.unknownWordLabelFlag)
    writeUnknownWordLabel(fileNameUnknownWordLabel);
}
// Entry point for extract-mixed-syntax: reads a word-aligned parallel corpus
// (target corpus, source corpus, alignment file) and writes hierarchical
// translation rules to <extract> (plus <extract>.inv for the inverse
// direction), optionally emitting a glue grammar and unknown-word labels.
int main(int argc, char* argv[]) {
  cerr << "Extract v2.0, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";
  //time_t starttime = time(NULL);

  // Option container; also published via the file-global g_global so that
  // helpers called below (FindTunnels/CreateLattice/CreateRules) can see it.
  Global *global = new Global();
  g_global = global;
  int sentenceOffset = 0;

  // Four positional arguments are mandatory; print usage otherwise.
  // NOTE(review): the usage text advertises "--MaxSymbols" and "--OnlyDirect",
  // but the parser below matches "--maxSymbols" (lowercase) and has no
  // "--OnlyDirect" case at all — usage and parser are out of sync.
  if (argc < 5) {
    cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract "
         << " [ --Hierarchical | --Orientation"
         << " | --GlueGrammar FILE | --UnknownWordLabel FILE"
         << " | --OnlyDirect"
         << " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]"
         << " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]"
         << " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]"
         << " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]"
         << " | --MaxSymbols[" << global->maxSymbols << "]"
         << " | --MaxNonTerm[" << global->maxNonTerm << "]"
         << " | --SourceSyntax | --TargetSyntax"
         << " | --UppermostOnly[" << g_global->uppermostOnly << "]"
         << endl;
    exit(1);
  }

  // Positional arguments: target, source, alignment, output file stem.
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
  string fileNameGlueGrammar;
  string fileNameUnknownWordLabel;
  string fileNameExtract = string(argv[4]);

  // Parse optional switches; value-taking options consume argv[++i].
  int optionInd = 5;
  for(int i=optionInd; i<argc; i++) {
    if (strcmp(argv[i],"--MinHoleSpanSourceDefault") == 0) {
      global->minHoleSpanSourceDefault = atoi(argv[++i]);
      if (global->minHoleSpanSourceDefault < 1) {
        // NOTE(review): message says "--minHoleSourceDefault" but the option is
        // "--MinHoleSpanSourceDefault" (same mismatch in the next three messages).
        cerr << "extract error: --minHoleSourceDefault should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) {
      global->maxHoleSpanSourceDefault = atoi(argv[++i]);
      if (global->maxHoleSpanSourceDefault < 1) {
        cerr << "extract error: --maxHoleSourceDefault should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) {
      global->minHoleSpanSourceSyntax = atoi(argv[++i]);
      if (global->minHoleSpanSourceSyntax < 1) {
        cerr << "extract error: --minHoleSourceSyntax should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--UppermostOnly") == 0) {
      // 0/1 flag: restrict to uppermost syntax nodes (read as an int).
      global->uppermostOnly = atoi(argv[++i]);
    } else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) {
      global->maxHoleSpanSourceSyntax = atoi(argv[++i]);
      if (global->maxHoleSpanSourceSyntax < 1) {
        cerr << "extract error: --maxHoleSourceSyntax should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--maxSymbols") == 0) {
      global->maxSymbols = atoi(argv[++i]);
      if (global->maxSymbols < 1) {
        cerr << "extract error: --maxSymbols should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of non-terminals
    else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
      global->maxNonTerm = atoi(argv[++i]);
      if (global->maxNonTerm < 1) {
        cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
        exit(1);
      }
    }
    // allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      global->targetSyntax = true;
    } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      global->sourceSyntax = true;
    }
    // do not create many part00xx files!
    else if (strcmp(argv[i],"--NoFileLimit") == 0) {
      // now default
    } else if (strcmp(argv[i],"--GlueGrammar") == 0) {
      global->glueGrammarFlag = true;
      if (++i >= argc) {
        // NOTE(review): exits with status 0 on a usage error — callers/scripts
        // will see success; should probably be exit(1). Same below.
        cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
        exit(0);
      }
      fileNameGlueGrammar = string(argv[i]);
      cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
      global->unknownWordLabelFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
        exit(0);
      }
      fileNameUnknownWordLabel = string(argv[i]);
      cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
    }
    // TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    // zipFiles = true;
    //}
    // if an source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--Mixed") == 0) {
      global->mixed = true;
    } else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) {
      global->allowDefaultNonTermEdge = atoi(argv[++i]);
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      // gzip-compress the extract output files (".gz" suffix added below).
      global->gzOutput = true;
    } else if (strcmp(argv[i],"--MaxSpan") == 0) {
      // ignore
      ++i;
    } else if (strcmp(argv[i],"--SentenceOffset") == 0) {
      // Requires a numeric argument; checked by first digit only.
      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
        exit(1);
      }
      sentenceOffset = atoi(argv[++i]);
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  // open input files
  Moses::InputFileStream tFile(fileNameT);
  Moses::InputFileStream sFile(fileNameS);
  Moses::InputFileStream aFile(fileNameA);

  // open output files
  string fileNameExtractInv = fileNameExtract + ".inv";
  if (global->gzOutput) {
    fileNameExtract += ".gz";
    fileNameExtractInv += ".gz";
  }
  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  extractFile.Open(fileNameExtract.c_str());
  extractFileInv.Open(fileNameExtractInv.c_str());

  // loop through all sentence pairs; i is the 1-based sentence id,
  // optionally shifted by --SentenceOffset.
  int i = sentenceOffset;
  while(true) {
    i++;
    if (i % 1000 == 0) {
      cerr << i << " " << flush;
    }
    string targetString;
    string sourceString;
    string alignmentString;
    // The target file drives termination; source/alignment lines are read
    // unchecked and assumed to stay in lock-step with it.
    bool ok = getline(tFile, targetString);
    if (!ok)
      break;
    getline(sFile, sourceString);
    getline(aFile, alignmentString);
    //cerr << endl << targetString << endl << sourceString << endl << alignmentString << endl;
    //time_t currTime = time(NULL);
    //cerr << "A " << (currTime - starttime) << endl;
    SentenceAlignment sentencePair;
    // Create() returns false for sentence pairs that cannot be parsed/aligned;
    // those are silently skipped.
    if (sentencePair.Create( targetString, sourceString, alignmentString, i, *global )) {
      //cerr << sentence.sourceTree << endl;
      //cerr << sentence.targetTree << endl;
      sentencePair.FindTunnels(*g_global);
      //cerr << "C " << (time(NULL) - starttime) << endl;
      //cerr << sentencePair << endl;
      sentencePair.CreateLattice(*g_global);
      //cerr << "D " << (time(NULL) - starttime) << endl;
      //cerr << sentencePair << endl;
      sentencePair.CreateRules(*g_global);
      //cerr << "E " << (time(NULL) - starttime) << endl;
      //cerr << sentence.lattice->GetRules().GetSize() << endl;
      // Write rules in both directions.
      sentencePair.GetLattice().GetRules().Output(extractFile);
      sentencePair.GetLattice().GetRules().OutputInv(extractFileInv);
    }
  }

  tFile.Close();
  sFile.Close();
  aFile.Close();
  extractFile.Close();
  extractFileInv.Close();

  if (global->glueGrammarFlag) {
    // NOTE(review): targetLabelCollection / targetTopLabelCollection are not
    // declared in this function — presumably file-level globals filled during
    // extraction; verify they exist in this translation unit.
    writeGlueGrammar(fileNameGlueGrammar, *global, targetLabelCollection, targetTopLabelCollection);
  }

  delete global;
}
int main(int argc, char* argv[]) { cerr << "Score v2.0 written by Philipp Koehn\n" << "scoring methods for extracted rules\n"; if (argc < 4) { cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]] [--Singleton] [--CrossedNonTerm] \n"; exit(1); } string fileNameExtract = argv[1]; string fileNameLex = argv[2]; string fileNamePhraseTable = argv[3]; string fileNameCountOfCounts; char* fileNameFunctionWords = NULL; char* fileNameDomain = NULL; for(int i=4; i<argc; i++) { if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) { inverseFlag = true; cerr << "using inverse mode\n"; } else if (strcmp(argv[i],"--Hierarchical") == 0) { hierarchicalFlag = true; cerr << "processing hierarchical rules\n"; } else if (strcmp(argv[i],"--PCFG") == 0) { pcfgFlag = true; cerr << "including PCFG scores\n"; } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) { unpairedExtractFormatFlag = true; cerr << "processing unpaired extract format\n"; } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) { conditionOnTargetLhsFlag = true; cerr << "processing unpaired extract format\n"; } else if (strcmp(argv[i],"--WordAlignment") == 0) { wordAlignmentFlag = true; cerr << "outputing word alignment" << endl; } else if (strcmp(argv[i],"--NoLex") == 0) { lexFlag = false; cerr << "not computing lexical translation score\n"; } else if (strcmp(argv[i],"--GoodTuring") == 0) { goodTuringFlag = true; fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc"; cerr << "adjusting phrase translation probabilities with Good Turing discounting\n"; } else if (strcmp(argv[i],"--KneserNey") == 0) { kneserNeyFlag = true; 
fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc"; cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n"; } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) { unalignedFlag = true; cerr << "using unaligned word penalty\n"; } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) { unalignedFWFlag = true; if (i+1==argc) { cerr << "ERROR: specify count of count files for Kneser Ney discounting!\n"; exit(1); } fileNameFunctionWords = argv[++i]; cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl; } else if (strcmp(argv[i],"--SparseDomainIndicator") == 0 || strcmp(argv[i],"--SparseDomainRatio") == 0 || strcmp(argv[i],"--SparseDomainSubset") == 0 || strcmp(argv[i],"--DomainIndicator") == 0 || strcmp(argv[i],"--DomainRatio") == 0 || strcmp(argv[i],"--DomainSubset") == 0) { includeSentenceIdFlag = true; domainFlag = true; domainSparseFlag = strstr( argv[i], "Sparse" ); domainRatioFlag = strstr( argv[i], "Ratio" ); domainSubsetFlag = strstr( argv[i], "Subset" ); if (i+1==argc) { cerr << "ERROR: specify domain info file with " << argv[i] << endl; exit(1); } fileNameDomain = argv[++i]; } else if (strcmp(argv[i],"--LogProb") == 0) { logProbFlag = true; cerr << "using log-probabilities\n"; } else if (strcmp(argv[i],"--NegLogProb") == 0) { logProbFlag = true; negLogProb = -1; cerr << "using negative log-probabilities\n"; } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) { minCountHierarchical = atof(argv[++i]); cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n"; minCountHierarchical -= 0.00001; // account for rounding } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { outputNTLengths = true; } else if (strcmp(argv[i],"--Singleton") == 0) { singletonFeature = true; cerr << "binary singleton feature\n"; } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) { crossedNonTerm = true; cerr << "crossed non-term reordering 
feature\n"; } else { cerr << "ERROR: unknown option " << argv[i] << endl; exit(1); } } // lexical translation table if (lexFlag) lexTable.load( fileNameLex ); // function word list if (unalignedFWFlag) loadFunctionWords( fileNameFunctionWords ); // load domain information if (domainFlag) { if (inverseFlag) { domainFlag = false; includeSentenceIdFlag = false; } else { domain = new Domain; domain->load( fileNameDomain ); } } // compute count of counts for Good Turing discounting if (goodTuringFlag || kneserNeyFlag) { for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0; } // sorted phrase extraction file Moses::InputFileStream extractFile(fileNameExtract); if (extractFile.fail()) { cerr << "ERROR: could not open extract file " << fileNameExtract << endl; exit(1); } istream &extractFileP = extractFile; // output file: phrase translation table ostream *phraseTableFile; if (fileNamePhraseTable == "-") { phraseTableFile = &cout; } else { Moses::OutputFileStream *outputFile = new Moses::OutputFileStream(); bool success = outputFile->Open(fileNamePhraseTable); if (!success) { cerr << "ERROR: could not open file phrase table file " << fileNamePhraseTable << endl; exit(1); } phraseTableFile = outputFile; } // loop through all extracted phrase translations float lastCount = 0.0f; float lastPcfgSum = 0.0f; vector< PhraseAlignment > phrasePairsWithSameF; bool isSingleton = true; int i=0; char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH]; lastLine[0] = '\0'; PhraseAlignment *lastPhrasePair = NULL; while(true) { if (extractFileP.eof()) break; if (++i % 100000 == 0) cerr << "." << flush; SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__); if (extractFileP.eof()) break; // identical to last line? 
just add count if (strcmp(line,lastLine) == 0) { lastPhrasePair->count += lastCount; lastPhrasePair->pcfgSum += lastPcfgSum; continue; } strcpy( lastLine, line ); // create new phrase pair PhraseAlignment phrasePair; phrasePair.create( line, i, includeSentenceIdFlag ); lastCount = phrasePair.count; lastPcfgSum = phrasePair.pcfgSum; // only differs in count? just add count if (lastPhrasePair != NULL && lastPhrasePair->equals( phrasePair ) && (!domainFlag || domain->getDomainOfSentence( lastPhrasePair->sentenceId ) == domain->getDomainOfSentence( phrasePair.sentenceId ) )) { lastPhrasePair->count += phrasePair.count; lastPhrasePair->pcfgSum += phrasePair.pcfgSum; continue; } // if new source phrase, process last batch if (lastPhrasePair != NULL && lastPhrasePair->GetSource() != phrasePair.GetSource()) { processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton ); phrasePairsWithSameF.clear(); isSingleton = false; lastPhrasePair = NULL; } else { isSingleton = true; } // add phrase pairs to list, it's now the last one phrasePairsWithSameF.push_back( phrasePair ); lastPhrasePair = &phrasePairsWithSameF.back(); } processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton ); phraseTableFile->flush(); if (phraseTableFile != &cout) { delete phraseTableFile; } // output count of count statistics if (goodTuringFlag || kneserNeyFlag) { writeCountOfCounts( fileNameCountOfCounts ); } }
int main(int argc, char* argv[]) { cerr << "PhraseExtract v1.4, written by Philipp Koehn\n" << "phrase extraction from an aligned parallel corpus\n"; if (argc < 6) { cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] "; cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n"; exit(1); } Moses::OutputFileStream extractFileOrientation; const char* const &fileNameE = argv[1]; const char* const &fileNameF = argv[2]; const char* const &fileNameA = argv[3]; const string fileNameExtract = string(argv[4]); PhraseExtractionOptions options(atoi(argv[5])); for(int i=6; i<argc; i++) { if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) { options.initOnlyOutputSpanInfo(true); } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) { options.initOrientationFlag(true); } else if (strcmp(argv[i],"--FlexibilityScore") == 0) { options.initFlexScoreFlag(true); } else if (strcmp(argv[i],"--NoTTable") == 0) { options.initTranslationFlag(false); } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) { options.initIncludeSentenceIdFlag(true); } else if (strcmp(argv[i], "--SentenceOffset") == 0) { if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') { cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl; exit(1); } sentenceOffset = atoi(argv[++i]); } else if (strcmp(argv[i], "--GZOutput") == 0) { options.initGzOutput(true); } else if (strcmp(argv[i], "--InstanceWeights") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl; exit(1); } options.initInstanceWeightsFile(argv[++i]); } else if (strcmp(argv[i], "--Debug") == 0) { options.debug = true; } else if (strcmp(argv[i], "--MinPhraseLength") == 0) { options.minPhraseLength = atoi(argv[++i]); } else if (strcmp(argv[i], "--Separator") == 0) { options.separator = 
argv[++i]; } else if(strcmp(argv[i],"--model") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, no model's information provided to the option --model " << endl; exit(1); } char* modelParams = argv[++i]; char* modelName = strtok(modelParams, "-"); char* modelType = strtok(NULL, "-"); // REO_MODEL_TYPE intModelType; if(strcmp(modelName, "wbe") == 0) { options.initWordModel(true); if(strcmp(modelType, "msd") == 0) options.initWordType(REO_MSD); else if(strcmp(modelType, "mslr") == 0) options.initWordType(REO_MSLR); else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) options.initWordType(REO_MONO); else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else if(strcmp(modelName, "phrase") == 0) { options.initPhraseModel(true); if(strcmp(modelType, "msd") == 0) options.initPhraseType(REO_MSD); else if(strcmp(modelType, "mslr") == 0) options.initPhraseType(REO_MSLR); else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) options.initPhraseType(REO_MONO); else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else if(strcmp(modelName, "hier") == 0) { options.initHierModel(true); if(strcmp(modelType, "msd") == 0) options.initHierType(REO_MSD); else if(strcmp(modelType, "mslr") == 0) options.initHierType(REO_MSLR); else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) options.initHierType(REO_MONO); else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else { cerr << "extract: syntax error, unknown reordering model: " << modelName << endl; exit(1); } options.initAllModelsOutputFlag(true); } else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); } } // default reordering model if no model selected // allows for the old syntax to be used if(options.isOrientationFlag() && 
!options.isAllModelsOutputFlag()) { options.initWordModel(true); options.initWordType(REO_MSD); } // open input files Moses::InputFileStream eFile(fileNameE); Moses::InputFileStream fFile(fileNameF); Moses::InputFileStream aFile(fileNameA); istream *eFileP = &eFile; istream *fFileP = &fFile; istream *aFileP = &aFile; istream *iwFileP = NULL; auto_ptr<Moses::InputFileStream> instanceWeightsFile; if (options.getInstanceWeightsFile().length()) { instanceWeightsFile.reset(new Moses::InputFileStream(options.getInstanceWeightsFile())); iwFileP = instanceWeightsFile.get(); } // open output files if (options.isOrientationFlag()) { string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":""); extractFileOrientation.Open(fileNameExtractOrientation.c_str()); } int i = sentenceOffset; while(true) { i++; if (i%10000 == 0) cerr << "." << flush; char englishString[LINE_MAX_LENGTH]; char foreignString[LINE_MAX_LENGTH]; char alignmentString[LINE_MAX_LENGTH]; char weightString[LINE_MAX_LENGTH]; SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__); if (eFileP->eof()) break; SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__); SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); if (iwFileP) { SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__); } SentenceAlignment sentence; // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; //az: output src, tgt, and alingment line if (options.isOnlyOutputSpanInfo()) { cout << "LOG: SRC: " << foreignString << endl; cout << "LOG: TGT: " << englishString << endl; cout << "LOG: ALT: " << alignmentString << endl; cout << "LOG: PHRASES_BEGIN:" << endl; } if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) { ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation); task->Run(); delete task; } if (options.isOnlyOutputSpanInfo()) 
cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases } eFile.Close(); fFile.Close(); aFile.Close(); //az: only close if we actually opened it if (!options.isOnlyOutputSpanInfo()) { if (options.isOrientationFlag()) { extractFileOrientation.Close(); } } }
int main(int argc, char* argv[]) { cerr << "Starting..." << endl; char* &fileNameDirect = argv[1]; Moses::InputFileStream fileDirect(fileNameDirect); //fileDirect.open(fileNameDirect); if (fileDirect.fail()) { cerr << "ERROR: could not open extract file " << fileNameDirect << endl; exit(1); } istream &fileDirectP = fileDirect; char* &fileNameConsolidated = argv[2]; ostream *fileConsolidated; if (strcmp(fileNameConsolidated, "-") == 0) { fileConsolidated = &cout; } else { Moses::OutputFileStream *outputFile = new Moses::OutputFileStream(); bool success = outputFile->Open(fileNameConsolidated); if (!success) { cerr << "ERROR: could not open file phrase table file " << fileNameConsolidated << endl; exit(1); } fileConsolidated = outputFile; } int i=0; while(true) { i++; if (i%1000 == 0) cerr << "." << flush; if (i%10000 == 0) cerr << ":" << flush; if (i%100000 == 0) cerr << "!" << flush; vector< string > itemDirect; if (! getLine(fileDirectP, itemDirect )) break; (*fileConsolidated) << itemDirect[0] << " ||| " << itemDirect[1] << " ||| "; // output alignment and probabilities (*fileConsolidated) << itemDirect[2] // prob direct << " 2.718" // phrase count feature << " ||| " << itemDirect[3]; // alignment // counts (*fileConsolidated) << "||| 0 " << itemDirect[4]; // indirect (*fileConsolidated) << endl; } fileConsolidated->flush(); if (fileConsolidated != &cout) { delete fileConsolidated; } cerr << "Finished" << endl; }