int main(int argc, char* argv[]) { cerr << "Extract v2.0, written by Philipp Koehn\n" << "rule extraction from an aligned parallel corpus\n"; //time_t starttime = time(NULL); Global *global = new Global(); g_global = global; int sentenceOffset = 0; if (argc < 5) { cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract " << " [ --Hierarchical | --Orientation" << " | --GlueGrammar FILE | --UnknownWordLabel FILE" << " | --OnlyDirect" << " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]" << " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]" << " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]" << " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]" << " | --MaxSymbols[" << global->maxSymbols<< "]" << " | --MaxNonTerm[" << global->maxNonTerm << "]" << " | --SourceSyntax | --TargetSyntax" << " | --UppermostOnly[" << g_global->uppermostOnly << "]" << endl; exit(1); } char* &fileNameT = argv[1]; char* &fileNameS = argv[2]; char* &fileNameA = argv[3]; string fileNameGlueGrammar; string fileNameUnknownWordLabel; string fileNameExtract = string(argv[4]); int optionInd = 5; for(int i=optionInd;i<argc;i++) { if (strcmp(argv[i],"--MinHoleSpanSourceDefault") == 0) { global->minHoleSpanSourceDefault = atoi(argv[++i]); if (global->minHoleSpanSourceDefault < 1) { cerr << "extract error: --minHoleSourceDefault should be at least 1" << endl; exit(1); } } else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) { global->maxHoleSpanSourceDefault = atoi(argv[++i]); if (global->maxHoleSpanSourceDefault < 1) { cerr << "extract error: --maxHoleSourceDefault should be at least 1" << endl; exit(1); } } else if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) { global->minHoleSpanSourceSyntax = atoi(argv[++i]); if (global->minHoleSpanSourceSyntax < 1) { cerr << "extract error: --minHoleSourceSyntax should be at least 1" << endl; exit(1); } } else if (strcmp(argv[i],"--UppermostOnly") == 0) { global->uppermostOnly = atoi(argv[++i]); } else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) { global->maxHoleSpanSourceSyntax = atoi(argv[++i]); if (global->maxHoleSpanSourceSyntax < 1) { cerr << "extract error: --maxHoleSourceSyntax should be at least 1" << endl; exit(1); } } // maximum number of words in hierarchical phrase else if (strcmp(argv[i],"--maxSymbols") == 0) { global->maxSymbols = atoi(argv[++i]); if (global->maxSymbols < 1) { cerr << "extract error: --maxSymbols should be at least 1" << endl; exit(1); } } // maximum number of non-terminals else if (strcmp(argv[i],"--MaxNonTerm") == 0) { global->maxNonTerm = atoi(argv[++i]); if (global->maxNonTerm < 1) { cerr << "extract error: --MaxNonTerm should be at least 1" << endl; exit(1); } } // allow consecutive non-terminals (X Y | X Y) else if (strcmp(argv[i],"--TargetSyntax") == 0) { global->targetSyntax = true; } else if (strcmp(argv[i],"--SourceSyntax") == 0) { global->sourceSyntax = true; } // do not create many part00xx files! else if (strcmp(argv[i],"--NoFileLimit") == 0) { // now default } else if (strcmp(argv[i],"--GlueGrammar") == 0) { global->glueGrammarFlag = true; if (++i >= argc) { cerr << "ERROR: Option --GlueGrammar requires a file name" << endl; exit(0); } fileNameGlueGrammar = string(argv[i]); cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl; } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) { global->unknownWordLabelFlag = true; if (++i >= argc) { cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl; exit(0); } fileNameUnknownWordLabel = string(argv[i]); cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl; } // TODO: this should be a useful option //else if (strcmp(argv[i],"--ZipFiles") == 0) { // zipFiles = true; //} // if an source phrase is paired with two target phrases, then count(t|s) = 0.5 else if (strcmp(argv[i],"--Mixed") == 0) { global->mixed = true; } else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) { global->allowDefaultNonTermEdge = atoi(argv[++i]); } else if (strcmp(argv[i], "--GZOutput") == 0) { global->gzOutput = true; } else if (strcmp(argv[i],"--MaxSpan") == 0) { // ignore ++i; } else if (strcmp(argv[i],"--SentenceOffset") == 0) { if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') { cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl; exit(1); } sentenceOffset = atoi(argv[++i]); } else { cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n"; exit(1); } } // open input files Moses::InputFileStream tFile(fileNameT); Moses::InputFileStream sFile(fileNameS); Moses::InputFileStream aFile(fileNameA); // open output files string fileNameExtractInv = fileNameExtract + ".inv"; if (global->gzOutput) { fileNameExtract += ".gz"; fileNameExtractInv += ".gz"; } Moses::OutputFileStream extractFile; Moses::OutputFileStream extractFileInv; extractFile.Open(fileNameExtract.c_str()); extractFileInv.Open(fileNameExtractInv.c_str()); // loop through all sentence pairs int i = sentenceOffset; while(true) { i++; if (i % 1000 == 0) { cerr << i << " " << flush; } string targetString; string sourceString; string alignmentString; bool ok = getline(tFile, targetString); if (!ok) break; getline(sFile, sourceString); getline(aFile, alignmentString); //cerr << endl << targetString << endl << sourceString << endl << alignmentString << endl; //time_t currTime = time(NULL); //cerr << "A " << (currTime - starttime) << endl; SentenceAlignment sentencePair; if (sentencePair.Create( targetString, sourceString, alignmentString, i, *global )) { //cerr << sentence.sourceTree << endl; //cerr << sentence.targetTree << endl; sentencePair.FindTunnels(*g_global); //cerr << "C " << (time(NULL) - starttime) << endl; //cerr << sentencePair << endl; sentencePair.CreateLattice(*g_global); //cerr << "D " << (time(NULL) - starttime) << endl; //cerr << sentencePair << endl; sentencePair.CreateRules(*g_global); //cerr << "E " << (time(NULL) - starttime) << endl; //cerr << sentence.lattice->GetRules().GetSize() << endl; sentencePair.GetLattice().GetRules().Output(extractFile); sentencePair.GetLattice().GetRules().OutputInv(extractFileInv); } } tFile.Close(); sFile.Close(); aFile.Close(); extractFile.Close(); extractFileInv.Close(); if (global->glueGrammarFlag) { writeGlueGrammar(fileNameGlueGrammar, *global, targetLabelCollection, targetTopLabelCollection); } delete global; }