int main(int argc, char* argv[]) { cerr << "extract-rules, written by Philipp Koehn\n" << "rule extraction from an aligned parallel corpus\n"; RuleExtractionOptions options; int sentenceOffset = 0; #ifdef WITH_THREADS int thread_count = 1; #endif if (argc < 5) { cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract [" << " --GlueGrammar FILE" << " | --UnknownWordLabel FILE" << " | --OnlyDirect" << " | --MaxSpan[" << options.maxSpan << "]" << " | --MinHoleTarget[" << options.minHoleTarget << "]" << " | --MinHoleSource[" << options.minHoleSource << "]" << " | --MinWords[" << options.minWords << "]" << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]" << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]" << " | --MaxNonTerm[" << options.maxNonTerm << "]" << " | --MaxScope[" << options.maxScope << "]" << " | --SourceSyntax | --TargetSyntax" << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting" << " | --UnpairedExtractFormat" << " | --ConditionOnTargetLHS ]" << " | --BoundaryRules[" << options.boundaryRules << "]" << " | --FlexibilityScore\n"; exit(1); } char* &fileNameT = argv[1]; char* &fileNameS = argv[2]; char* &fileNameA = argv[3]; string fileNameGlueGrammar; string fileNameUnknownWordLabel; string fileNameExtract = string(argv[4]); int optionInd = 5; for(int i=optionInd; i<argc; i++) { // maximum span length if (strcmp(argv[i],"--MaxSpan") == 0) { options.maxSpan = atoi(argv[++i]); if (options.maxSpan < 1) { cerr << "extract error: --maxSpan should be at least 1" << endl; exit(1); } } else if (strcmp(argv[i],"--MinHoleTarget") == 0) { options.minHoleTarget = atoi(argv[++i]); if (options.minHoleTarget < 1) { cerr << "extract error: --minHoleTarget should be at least 1" << endl; exit(1); } } else if (strcmp(argv[i],"--MinHoleSource") == 0) { options.minHoleSource = atoi(argv[++i]); if (options.minHoleSource < 1) { cerr << "extract error: --minHoleSource should be at least 1" << endl; exit(1); } } // maximum number of words in hierarchical phrase else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) { options.maxSymbolsTarget = atoi(argv[++i]); if (options.maxSymbolsTarget < 1) { cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl; exit(1); } } // maximum number of words in hierarchical phrase else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) { options.maxSymbolsSource = atoi(argv[++i]); if (options.maxSymbolsSource < 1) { cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl; exit(1); } } // minimum number of words in hierarchical phrase else if (strcmp(argv[i],"--MinWords") == 0) { options.minWords = atoi(argv[++i]); if (options.minWords < 0) { cerr << "extract error: --MinWords should be at least 0" << endl; exit(1); } } // maximum number of non-terminals else if (strcmp(argv[i],"--MaxNonTerm") == 0) { options.maxNonTerm = atoi(argv[++i]); if (options.maxNonTerm < 1) { cerr << "extract error: --MaxNonTerm should be at least 1" << endl; exit(1); } } // maximum scope (see Hopkins and Langmead (2010)) else if (strcmp(argv[i],"--MaxScope") == 0) { options.maxScope = atoi(argv[++i]); if (options.maxScope < 0) { cerr << "extract error: --MaxScope should be at least 0" << endl; exit(1); } } else if (strcmp(argv[i], "--GZOutput") == 0) { options.gzOutput = true; } // allow consecutive non-terminals (X Y | X Y) else if (strcmp(argv[i],"--TargetSyntax") == 0) { options.targetSyntax = true; } else if (strcmp(argv[i],"--SourceSyntax") == 0) { options.sourceSyntax = true; } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) { options.requireAlignedWord = false; } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) { options.nonTermConsecTarget = false; } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) { options.nonTermConsecSource = true; } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) { options.nonTermFirstWord = false; } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) { options.onlyOutputSpanInfo = true; } else if (strcmp(argv[i],"--OnlyDirect") == 0) { options.onlyDirectFlag = true; } else if (strcmp(argv[i],"--GlueGrammar") == 0) { options.glueGrammarFlag = true; if (++i >= argc) { cerr << "ERROR: Option --GlueGrammar requires a file name" << endl; exit(0); } fileNameGlueGrammar = string(argv[i]); cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl; } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) { options.unknownWordLabelFlag = true; if (++i >= argc) { cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl; exit(0); } fileNameUnknownWordLabel = string(argv[i]); cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl; } // TODO: this should be a useful option //else if (strcmp(argv[i],"--ZipFiles") == 0) { // zipFiles = true; //} // if an source phrase is paired with two target phrases, then count(t|s) = 0.5 else if (strcmp(argv[i],"--NoFractionalCounting") == 0) { options.fractionalCounting = false; } else if (strcmp(argv[i],"--PCFG") == 0) { options.pcfgScore = true; } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) { options.unpairedExtractFormat = true; } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) { options.conditionOnTargetLhs = true; } else if (strcmp(argv[i],"--FlexibilityScore") == 0) { options.flexScoreFlag = true; } else if (strcmp(argv[i],"-threads") == 0 || strcmp(argv[i],"--threads") == 0 || strcmp(argv[i],"--Threads") == 0) { #ifdef WITH_THREADS thread_count = atoi(argv[++i]); #else cerr << "thread support not compiled in." << '\n'; exit(1); #endif } else if (strcmp(argv[i], "--SentenceOffset") == 0) { if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') { cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl; exit(1); } sentenceOffset = atoi(argv[++i]); } else if (strcmp(argv[i],"--BoundaryRules") == 0) { options.boundaryRules = true; } else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); } } cerr << "extracting hierarchical rules" << endl; // open input files Moses::InputFileStream tFile(fileNameT); Moses::InputFileStream sFile(fileNameS); Moses::InputFileStream aFile(fileNameA); istream *tFileP = &tFile; istream *sFileP = &sFile; istream *aFileP = &aFile; // open output files string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":""); Moses::OutputFileStream extractFile; Moses::OutputFileStream extractFileInv; Moses::OutputFileStream extractFileContext; Moses::OutputFileStream extractFileContextInv; extractFile.Open((fileNameExtract + (options.gzOutput?".gz":"")).c_str()); if (!options.onlyDirectFlag) extractFileInv.Open(fileNameExtractInv.c_str()); if (options.flexScoreFlag) { string fileNameExtractContext = fileNameExtract + ".context" + (options.gzOutput?".gz":""); extractFileContext.Open(fileNameExtractContext.c_str()); if (!options.onlyDirectFlag) { string fileNameExtractContextInv = fileNameExtract + ".context.inv" + (options.gzOutput?".gz":""); extractFileContextInv.Open(fileNameExtractContextInv.c_str()); } } // stats on labels for glue grammar and unknown word label probabilities set< string > targetLabelCollection, sourceLabelCollection; map< string, int > targetTopLabelCollection, sourceTopLabelCollection; // loop through all sentence pairs size_t i=sentenceOffset; string targetString, sourceString, alignmentString; while(getline(*tFileP, targetString)) { i++; getline(*sFileP, sourceString); getline(*aFileP, alignmentString); if (i%1000 == 0) cerr << i << " " << flush; SentenceAlignmentWithSyntax sentence (targetLabelCollection, sourceLabelCollection, targetTopLabelCollection, sourceTopLabelCollection, options); //az: output src, tgt, and alingment line if (options.onlyOutputSpanInfo) { cout << "LOG: SRC: " << sourceString << endl; cout << "LOG: TGT: " << targetString << endl; cout << "LOG: ALT: " << alignmentString << endl; cout << "LOG: PHRASES_BEGIN:" << endl; } if (sentence.create(targetString.c_str(), sourceString.c_str(), alignmentString.c_str(),"", i, options.boundaryRules)) { if (options.unknownWordLabelFlag) { collectWordLabelCounts(sentence); } ExtractTask *task = new ExtractTask(sentence, options, extractFile, extractFileInv, extractFileContext, extractFileContextInv); task->Run(); delete task; } if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases } tFile.Close(); sFile.Close(); aFile.Close(); // only close if we actually opened it if (!options.onlyOutputSpanInfo) { extractFile.Close(); if (!options.onlyDirectFlag) extractFileInv.Close(); } if (options.flexScoreFlag) { extractFileContext.Close(); if (!options.onlyDirectFlag) extractFileContextInv.Close(); } if (options.glueGrammarFlag) writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection); if (options.unknownWordLabelFlag) writeUnknownWordLabel(fileNameUnknownWordLabel); }
int main(int argc, char* argv[]) { cerr << "extract-rules, written by Philipp Koehn\n" << "rule extraction from an aligned parallel corpus\n"; RuleExtractionOptions options; #ifdef WITH_THREADS int thread_count = 1; #endif if (argc < 5) { cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract [" #ifdef WITH_THREADS << " --threads NUM |" #endif << " --GlueGrammar FILE" << " | --UnknownWordLabel FILE" << " | --OnlyDirect" << " | --OutputNTLengths" << " | --MaxSpan[" << options.maxSpan << "]" << " | --MinHoleTarget[" << options.minHoleTarget << "]" << " | --MinHoleSource[" << options.minHoleSource << "]" << " | --MinWords[" << options.minWords << "]" << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]" << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]" << " | --MaxNonTerm[" << options.maxNonTerm << "]" << " | --MaxScope[" << options.maxScope << "]" << " | --SourceSyntax | --TargetSyntax" << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting ]\n"; exit(1); } char* &fileNameT = argv[1]; char* &fileNameS = argv[2]; char* &fileNameA = argv[3]; string fileNameGlueGrammar; string fileNameUnknownWordLabel; string fileNameExtract = string(argv[4]); int optionInd = 5; for(int i=optionInd; i<argc; i++) { // maximum span length if (strcmp(argv[i],"--MaxSpan") == 0) { options.maxSpan = atoi(argv[++i]); if (options.maxSpan < 1) { cerr << "extract error: --maxSpan should be at least 1" << endl; exit(1); } } else if (strcmp(argv[i],"--MinHoleTarget") == 0) { options.minHoleTarget = atoi(argv[++i]); if (options.minHoleTarget < 1) { cerr << "extract error: --minHoleTarget should be at least 1" << endl; exit(1); } } else if (strcmp(argv[i],"--MinHoleSource") == 0) { options.minHoleSource = atoi(argv[++i]); if (options.minHoleSource < 1) { cerr << "extract error: --minHoleSource should be at least 1" << endl; exit(1); } } // maximum number of words in hierarchical phrase else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) { options.maxSymbolsTarget = atoi(argv[++i]); if (options.maxSymbolsTarget < 1) { cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl; exit(1); } } // maximum number of words in hierarchical phrase else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) { options.maxSymbolsSource = atoi(argv[++i]); if (options.maxSymbolsSource < 1) { cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl; exit(1); } } // minimum number of words in hierarchical phrase else if (strcmp(argv[i],"--MinWords") == 0) { options.minWords = atoi(argv[++i]); if (options.minWords < 0) { cerr << "extract error: --MinWords should be at least 0" << endl; exit(1); } } // maximum number of non-terminals else if (strcmp(argv[i],"--MaxNonTerm") == 0) { options.maxNonTerm = atoi(argv[++i]); if (options.maxNonTerm < 1) { cerr << "extract error: --MaxNonTerm should be at least 1" << endl; exit(1); } } // maximum scope (see Hopkins and Langmead (2010)) else if (strcmp(argv[i],"--MaxScope") == 0) { options.maxScope = atoi(argv[++i]); if (options.maxScope < 0) { cerr << "extract error: --MaxScope should be at least 0" << endl; exit(1); } } else if (strcmp(argv[i], "--GZOutput") == 0) { options.gzOutput = true; } // allow consecutive non-terminals (X Y | X Y) else if (strcmp(argv[i],"--TargetSyntax") == 0) { options.targetSyntax = true; } else if (strcmp(argv[i],"--SourceSyntax") == 0) { options.sourceSyntax = true; } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) { options.requireAlignedWord = false; } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) { options.nonTermConsecTarget = false; } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) { options.nonTermConsecSource = true; } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) { options.nonTermFirstWord = false; } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) { options.onlyOutputSpanInfo = true; } else if (strcmp(argv[i],"--OnlyDirect") == 0) { options.onlyDirectFlag = true; } else if (strcmp(argv[i],"--GlueGrammar") == 0) { options.glueGrammarFlag = true; if (++i >= argc) { cerr << "ERROR: Option --GlueGrammar requires a file name" << endl; exit(0); } fileNameGlueGrammar = string(argv[i]); cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl; } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) { options.unknownWordLabelFlag = true; if (++i >= argc) { cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl; exit(0); } fileNameUnknownWordLabel = string(argv[i]); cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl; } // TODO: this should be a useful option //else if (strcmp(argv[i],"--ZipFiles") == 0) { // zipFiles = true; //} // if an source phrase is paired with two target phrases, then count(t|s) = 0.5 else if (strcmp(argv[i],"--NoFractionalCounting") == 0) { options.fractionalCounting = false; } else if (strcmp(argv[i],"--OutputNTLengths") == 0) { options.outputNTLengths = true; #ifdef WITH_THREADS } else if (strcmp(argv[i],"-threads") == 0 || strcmp(argv[i],"--threads") == 0 || strcmp(argv[i],"--Threads") == 0) { thread_count = atoi(argv[++i]); #endif } else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); } } cerr << "extracting hierarchical rules" << endl; // open input files Moses::InputFileStream tFile(fileNameT); Moses::InputFileStream sFile(fileNameS); Moses::InputFileStream aFile(fileNameA); istream *tFileP = &tFile; istream *sFileP = &sFile; istream *aFileP = &aFile; // open output files string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":""); Moses::OutputFileStream extractFile; Moses::OutputFileStream extractFileInv; extractFile.Open((fileNameExtract + (options.gzOutput?".gz":"")).c_str()); if (!options.onlyDirectFlag) extractFileInv.Open(fileNameExtractInv.c_str()); // output into file Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile); Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv); // stats on labels for glue grammar and unknown word label probabilities set< string > targetLabelCollection, sourceLabelCollection; map< string, int > targetTopLabelCollection, sourceTopLabelCollection; #ifdef WITH_THREADS // set up thread pool Moses::ThreadPool pool(thread_count); pool.SetQueueLimit(1000); #endif // loop through all sentence pairs size_t i=0; while(true) { i++; if (i%1000 == 0) cerr << "." << flush; if (i%10000 == 0) cerr << ":" << flush; if (i%100000 == 0) cerr << "!" << flush; char targetString[LINE_MAX_LENGTH]; char sourceString[LINE_MAX_LENGTH]; char alignmentString[LINE_MAX_LENGTH]; SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n', __FILE__); if (tFileP->eof()) break; SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__); SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); SentenceAlignmentWithSyntax *sentence = new SentenceAlignmentWithSyntax (targetLabelCollection, sourceLabelCollection, targetTopLabelCollection, sourceTopLabelCollection, options); //az: output src, tgt, and alingment line if (options.onlyOutputSpanInfo) { cout << "LOG: SRC: " << sourceString << endl; cout << "LOG: TGT: " << targetString << endl; cout << "LOG: ALT: " << alignmentString << endl; cout << "LOG: PHRASES_BEGIN:" << endl; } if (sentence->create(targetString, sourceString, alignmentString, i)) { if (options.unknownWordLabelFlag) { collectWordLabelCounts(*sentence); } ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector, extractCollectorInv); #ifdef WITH_THREADS if (thread_count == 1) { task->Run(); delete task; } else { pool.Submit(task); } #else task->Run(); delete task; #endif } if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases } #ifdef WITH_THREADS // wait for all threads to finish pool.Stop(true); #endif tFile.Close(); sFile.Close(); aFile.Close(); // only close if we actually opened it if (!options.onlyOutputSpanInfo) { extractFile.Close(); if (!options.onlyDirectFlag) extractFileInv.Close(); } if (options.glueGrammarFlag) writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection); if (options.unknownWordLabelFlag) writeUnknownWordLabel(fileNameUnknownWordLabel); }