int main(int argc, char* argv[]) { cerr << "PhraseExtract v1.3.0, written by Philipp Koehn\n" << "phrase extraction from an aligned parallel corpus\n"; time_t starttime = time(NULL); if (argc != 6 && argc != 7) { cerr << "syntax: phrase-extract en de align extract max-length [orientation]\n"; exit(1); } char* &fileNameE = argv[1]; char* &fileNameF = argv[2]; char* &fileNameA = argv[3]; fileNameExtract = argv[4]; maxPhraseLength = atoi(argv[5]); orientationFlag = (argc == 7); if (orientationFlag) cerr << "(also extracting orientation)\n"; // string fileNameE = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.en"; // string fileNameF = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.de"; // string fileNameA = "/data/nlp/koehn/europarl-v2/models/de-en/model/aligned.grow-diag-final"; ifstream eFile; ifstream fFile; ifstream aFile; eFile.open(fileNameE); fFile.open(fileNameF); aFile.open(fileNameA); istream *eFileP = &eFile; istream *fFileP = &fFile; istream *aFileP = &aFile; // string fileNameExtract = "/data/nlp/koehn/europarl-v2/models/de-en/model/new-extract"; int i=0; while(true) { i++; if (i%10000 == 0) cerr << "." << flush; char englishString[LINE_MAX_LENGTH]; char foreignString[LINE_MAX_LENGTH]; char alignmentString[LINE_MAX_LENGTH]; SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n'); if (eFileP->eof()) break; SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n'); SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n'); SentenceAlignment sentence; // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; if (sentence.create( englishString, foreignString, alignmentString, i )) extract(sentence); } eFile.close(); fFile.close(); aFile.close(); extractFile.close(); extractFileInv.close(); }
int main(int argc, char* argv[]) { cerr << "PhraseExtract " << extract_version << endl << "Written by Philipp Koehn" << endl << "Modified by Ventsislav Zhechev, Autodesk Development Sàrl" << endl << "phrase extraction from an aligned parallel corpus" << endl ; if (argc < 6) { cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono]-max_distance ] | --OnlyOutputSpanInfo]\n"; exit(1); } const string fileNameE = argv[1]; const string fileNameF = argv[2]; const string fileNameA = argv[3]; const string fileNameExtract = argv[4]; maxPhraseLength = atoi(argv[5]); for (int i=6; i<argc; ++i) { if (strcmp(argv[i], "--OnlyOutputSpanInfo") == 0) onlyOutputSpanInfo = true; else if (strcmp(argv[i], "orientation") == 0 || strcmp(argv[i], "--Orientation") == 0) orientationFlag = true; else if (strcmp(argv[i], "--pipeOut") == 0) pipeOut = true; else if (strcmp(argv[i], "--model") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, no model information provided to the option --model " << endl; exit(1); } const string modelParams = argv[++i]; const string modelName = modelParams.substr(0, modelParams.find('-')); const string modelType = modelParams.substr(modelParams.find('-') + 1); // modelMaxDistance = modelParams.find('-', modelParams.find('-') + 1) ? atoi(modelParams.substr(modelParams.find_last_of('-') + 1).c_str()) : 6; if (modelName == "wbe"){ wordModel = true; if (modelType == "msd") wordType = REO_MSD; else if (modelType == "mslr") wordType = REO_MSLR; else if (modelType == "mono" || modelType == "monotonicity") wordType = REO_MONO; else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else if (modelName == "phrase") { phraseModel = true; if (modelType == "msd") phraseType = REO_MSD; else if (modelType == "mslr") phraseType = REO_MSLR; else if (modelType == "mono" || modelType == "monotonicity") phraseType = REO_MONO; else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else if (modelName == "hier") { hierModel = true; if (modelType == "msd") hierType = REO_MSD; else if (modelType == "mslr") hierType = REO_MSLR; else if (modelType == "mono" || modelType == "monotonicity") hierType = REO_MONO; else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else { cerr << "extract: syntax error, unknown reordering model: " << modelName << endl; exit(1); } allModelsOutputFlag = true; } else { cerr << "extract: syntax error, unknown option '" << argv[i] << "'" << endl; exit(1); } } // default reordering model if no model selected // allows for the old syntax to be used if(orientationFlag && !allModelsOutputFlag) { wordModel = true; wordType = REO_MSD; } // open input files Bz2LineReader eFile(fileNameE); Bz2LineReader fFile(fileNameF); Bz2LineReader aFile(fileNameA); // open output files cerr << "Outputting to " << (pipeOut ? "pipes" : "bzip2-ed files") << "…" << endl; string extention = pipeOut ? ".pipe" : ".bz2"; extractFile = new Bz2LineWriter(fileNameExtract + extention); extractFileInv = new Bz2LineWriter(fileNameExtract + ".inv" + extention); if (orientationFlag) extractFileOrientation = new Bz2LineWriter(fileNameExtract + ".o" + extention); for (int i = 0;;) { if ((++i)%500000 == 0) cerr << "[extract:" << i << "]" << flush; else if (i%10000 == 0) cerr << "." << flush; string englishString = eFile.readLine(); if (englishString.empty()) { // cerr << "Finished extraction at line " << i << "!" << endl; break; } string foreignString = fFile.readLine(); string alignmentString = aFile.readLine(); SentenceAlignment sentence; //az: output src, tgt, and alingment line if (onlyOutputSpanInfo) { cout << "LOG: SRC: " << foreignString << endl; cout << "LOG: TGT: " << englishString << endl; cout << "LOG: ALT: " << alignmentString << endl; cout << "LOG: PHRASES_BEGIN:" << endl; } if (sentence.create(englishString, foreignString, alignmentString, i)) extract(sentence); if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases } eFile.close(); fFile.close(); aFile.close(); //az: only close if we actually opened it if (!onlyOutputSpanInfo) { extractFile->close(); extractFileInv->close(); if (orientationFlag) extractFileOrientation->close(); } }
int main(int argc, char* argv[]) { cerr << "PhraseExtract v1.4, written by Philipp Koehn\n" << "phrase extraction from an aligned parallel corpus\n"; time_t starttime = time(NULL); if (argc < 6) { cerr << "syntax: phrase-extract en de align extract max-length [orientation | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning ]\n"; exit(1); } char* &fileNameE = argv[1]; char* &fileNameF = argv[2]; char* &fileNameA = argv[3]; fileNameExtract = argv[4]; maxPhraseLength = atoi(argv[5]); for(int i=6;i<argc;i++) { if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) { onlyOutputSpanInfo = true; } else if (strcmp(argv[i],"--NoFileLimit") == 0) { noFileLimit = true; } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) { orientationFlag = true; } else if (strcmp(argv[i],"--ZipFiles") == 0) { zipFiles = true; } else if (strcmp(argv[i],"--ProperConditioning") == 0) { properConditioning = true; } else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); } } ifstream eFile; ifstream fFile; ifstream aFile; eFile.open(fileNameE); fFile.open(fileNameF); aFile.open(fileNameA); istream *eFileP = &eFile; istream *fFileP = &fFile; istream *aFileP = &aFile; int i=0; while(true) { i++; if (i%10000 == 0) cerr << "." << flush; char englishString[LINE_MAX_LENGTH]; char foreignString[LINE_MAX_LENGTH]; char alignmentString[LINE_MAX_LENGTH]; SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n'); if (eFileP->eof()) break; SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n'); SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n'); SentenceAlignment sentence; // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; //az: output src, tgt, and alingment line if (onlyOutputSpanInfo) { cout << "LOG: SRC: " << foreignString << endl; cout << "LOG: TGT: " << englishString << endl; cout << "LOG: ALT: " << alignmentString << endl; cout << "LOG: PHRASES_BEGIN:" << endl; } if (sentence.create( englishString, foreignString, alignmentString, i )) { extract(sentence); if (properConditioning) extractBase(sentence); } if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases } eFile.close(); fFile.close(); aFile.close(); //az: only close if we actually opened it if (!onlyOutputSpanInfo) { extractFile.close(); extractFileInv.close(); if (orientationFlag) extractFileOrientation.close(); } }
int main(int argc, char* argv[]) { cerr << "PhraseExtract v1.4, written by Philipp Koehn\n" << "phrase extraction from an aligned parallel corpus\n"; if (argc < 6) { cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n"; exit(1); } char* &fileNameE = argv[1]; char* &fileNameF = argv[2]; char* &fileNameA = argv[3]; string fileNameExtract = string(argv[4]); maxPhraseLength = atoi(argv[5]); for(int i=6; i<argc; i++) { if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) { onlyOutputSpanInfo = true; } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) { orientationFlag = true; } else if (strcmp(argv[i],"--NoTTable") == 0) { translationFlag = false; } else if (strcmp(argv[i], "--SentenceId") == 0) { sentenceIdFlag = true; } else if (strcmp(argv[i], "--GZOutput") == 0) { gzOutput = true; } else if(strcmp(argv[i],"--model") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, no model's information provided to the option --model " << endl; exit(1); } char* modelParams = argv[++i]; char* modelName = strtok(modelParams, "-"); char* modelType = strtok(NULL, "-"); REO_MODEL_TYPE intModelType; if(strcmp(modelName, "wbe") == 0) { wordModel = true; if(strcmp(modelType, "msd") == 0) wordType = REO_MSD; else if(strcmp(modelType, "mslr") == 0) wordType = REO_MSLR; else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) wordType = REO_MONO; else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else if(strcmp(modelName, "phrase") == 0) { phraseModel = true; if(strcmp(modelType, "msd") == 0) phraseType = REO_MSD; else if(strcmp(modelType, "mslr") == 0) phraseType = REO_MSLR; else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) phraseType = REO_MONO; else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else if(strcmp(modelName, "hier") == 0) { hierModel = true; if(strcmp(modelType, "msd") == 0) hierType = REO_MSD; else if(strcmp(modelType, "mslr") == 0) hierType = REO_MSLR; else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) hierType = REO_MONO; else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else { cerr << "extract: syntax error, unknown reordering model: " << modelName << endl; exit(1); } allModelsOutputFlag = true; } else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); } } // default reordering model if no model selected // allows for the old syntax to be used if(orientationFlag && !allModelsOutputFlag) { wordModel = true; wordType = REO_MSD; } // open input files Moses::InputFileStream eFile(fileNameE); Moses::InputFileStream fFile(fileNameF); Moses::InputFileStream aFile(fileNameA); istream *eFileP = &eFile; istream *fFileP = &fFile; istream *aFileP = &aFile; // open output files if (translationFlag) { string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":""); extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str()); extractFileInv.Open(fileNameExtractInv.c_str()); } if (orientationFlag) { string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":""); extractFileOrientation.Open(fileNameExtractOrientation.c_str()); } if (sentenceIdFlag) { string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":""); extractFileSentenceId.Open(fileNameExtractSentenceId.c_str()); } int i=0; while(true) { i++; if (i%10000 == 0) cerr << "." << flush; char englishString[LINE_MAX_LENGTH]; char foreignString[LINE_MAX_LENGTH]; char alignmentString[LINE_MAX_LENGTH]; SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__); if (eFileP->eof()) break; SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__); SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); SentenceAlignment sentence; // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; //az: output src, tgt, and alingment line if (onlyOutputSpanInfo) { cout << "LOG: SRC: " << foreignString << endl; cout << "LOG: TGT: " << englishString << endl; cout << "LOG: ALT: " << alignmentString << endl; cout << "LOG: PHRASES_BEGIN:" << endl; } if (sentence.create( englishString, foreignString, alignmentString, i)) { extract(sentence); } if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases } eFile.Close(); fFile.Close(); aFile.Close(); //az: only close if we actually opened it if (!onlyOutputSpanInfo) { if (translationFlag) { extractFile.Close(); extractFileInv.Close(); } if (orientationFlag) extractFileOrientation.Close(); if (sentenceIdFlag) { extractFileSentenceId.Close(); } } }
int main(int argc, char* argv[]) { cerr << "PhraseExtract v1.4, written by Philipp Koehn\n" << "phrase extraction from an aligned parallel corpus\n"; if (argc < 6) { cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] "; cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n"; exit(1); } Moses::OutputFileStream extractFileOrientation; const char* const &fileNameE = argv[1]; const char* const &fileNameF = argv[2]; const char* const &fileNameA = argv[3]; const string fileNameExtract = string(argv[4]); PhraseExtractionOptions options(atoi(argv[5])); for(int i=6; i<argc; i++) { if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) { options.initOnlyOutputSpanInfo(true); } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) { options.initOrientationFlag(true); } else if (strcmp(argv[i],"--FlexibilityScore") == 0) { options.initFlexScoreFlag(true); } else if (strcmp(argv[i],"--NoTTable") == 0) { options.initTranslationFlag(false); } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) { options.initIncludeSentenceIdFlag(true); } else if (strcmp(argv[i], "--SentenceOffset") == 0) { if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') { cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl; exit(1); } sentenceOffset = atoi(argv[++i]); } else if (strcmp(argv[i], "--GZOutput") == 0) { options.initGzOutput(true); } else if (strcmp(argv[i], "--InstanceWeights") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl; exit(1); } options.initInstanceWeightsFile(argv[++i]); } else if (strcmp(argv[i], "--Debug") == 0) { options.debug = true; } else if (strcmp(argv[i], "--MinPhraseLength") == 0) { options.minPhraseLength = atoi(argv[++i]); } else if (strcmp(argv[i], "--Separator") == 0) { options.separator = argv[++i]; } else if(strcmp(argv[i],"--model") == 0) { if (i+1 >= argc) { cerr << "extract: syntax error, no model's information provided to the option --model " << endl; exit(1); } char* modelParams = argv[++i]; char* modelName = strtok(modelParams, "-"); char* modelType = strtok(NULL, "-"); // REO_MODEL_TYPE intModelType; if(strcmp(modelName, "wbe") == 0) { options.initWordModel(true); if(strcmp(modelType, "msd") == 0) options.initWordType(REO_MSD); else if(strcmp(modelType, "mslr") == 0) options.initWordType(REO_MSLR); else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) options.initWordType(REO_MONO); else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else if(strcmp(modelName, "phrase") == 0) { options.initPhraseModel(true); if(strcmp(modelType, "msd") == 0) options.initPhraseType(REO_MSD); else if(strcmp(modelType, "mslr") == 0) options.initPhraseType(REO_MSLR); else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) options.initPhraseType(REO_MONO); else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else if(strcmp(modelName, "hier") == 0) { options.initHierModel(true); if(strcmp(modelType, "msd") == 0) options.initHierType(REO_MSD); else if(strcmp(modelType, "mslr") == 0) options.initHierType(REO_MSLR); else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0) options.initHierType(REO_MONO); else { cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl; exit(1); } } else { cerr << "extract: syntax error, unknown reordering model: " << modelName << endl; exit(1); } options.initAllModelsOutputFlag(true); } else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); } } // default reordering model if no model selected // allows for the old syntax to be used if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) { options.initWordModel(true); options.initWordType(REO_MSD); } // open input files Moses::InputFileStream eFile(fileNameE); Moses::InputFileStream fFile(fileNameF); Moses::InputFileStream aFile(fileNameA); istream *eFileP = &eFile; istream *fFileP = &fFile; istream *aFileP = &aFile; istream *iwFileP = NULL; auto_ptr<Moses::InputFileStream> instanceWeightsFile; if (options.getInstanceWeightsFile().length()) { instanceWeightsFile.reset(new Moses::InputFileStream(options.getInstanceWeightsFile())); iwFileP = instanceWeightsFile.get(); } // open output files if (options.isOrientationFlag()) { string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":""); extractFileOrientation.Open(fileNameExtractOrientation.c_str()); } int i = sentenceOffset; while(true) { i++; if (i%10000 == 0) cerr << "." << flush; char englishString[LINE_MAX_LENGTH]; char foreignString[LINE_MAX_LENGTH]; char alignmentString[LINE_MAX_LENGTH]; char weightString[LINE_MAX_LENGTH]; SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__); if (eFileP->eof()) break; SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__); SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__); if (iwFileP) { SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__); } SentenceAlignment sentence; // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl; //az: output src, tgt, and alingment line if (options.isOnlyOutputSpanInfo()) { cout << "LOG: SRC: " << foreignString << endl; cout << "LOG: TGT: " << englishString << endl; cout << "LOG: ALT: " << alignmentString << endl; cout << "LOG: PHRASES_BEGIN:" << endl; } if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) { ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation); task->Run(); delete task; } if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases } eFile.Close(); fFile.Close(); aFile.Close(); //az: only close if we actually opened it if (!options.isOnlyOutputSpanInfo()) { if (options.isOrientationFlag()) { extractFileOrientation.Close(); } } }