예제 #1
0
int main(int argc, char* argv[])
{
  cerr << "extract-rules, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";

  RuleExtractionOptions options;
#ifdef WITH_THREADS
  int thread_count = 1;
#endif
  if (argc < 5) {
    cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
#ifdef WITH_THREADS
         << " --threads NUM |"
#endif
         << " --GlueGrammar FILE"
         << " | --UnknownWordLabel FILE"
         << " | --OnlyDirect"
         << " | --OutputNTLengths"
         << " | --MaxSpan[" << options.maxSpan << "]"
         << " | --MinHoleTarget[" << options.minHoleTarget << "]"
         << " | --MinHoleSource[" << options.minHoleSource << "]"
         << " | --MinWords[" << options.minWords << "]"
         << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]"
         << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]"
         << " | --MaxNonTerm[" << options.maxNonTerm << "]"
         << " | --MaxScope[" << options.maxScope << "]"
         << " | --SourceSyntax | --TargetSyntax"
         << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource |  --NoNonTermFirstWord | --NoFractionalCounting ]\n";
    exit(1);
  }
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
  string fileNameGlueGrammar;
  string fileNameUnknownWordLabel;
  string fileNameExtract = string(argv[4]);

  int optionInd = 5;

  for(int i=optionInd; i<argc; i++) {
    // maximum span length
    if (strcmp(argv[i],"--MaxSpan") == 0) {
      options.maxSpan = atoi(argv[++i]);
      if (options.maxSpan < 1) {
        cerr << "extract error: --maxSpan should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleTarget") == 0) {
      options.minHoleTarget = atoi(argv[++i]);
      if (options.minHoleTarget < 1) {
        cerr << "extract error: --minHoleTarget should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleSource") == 0) {
      options.minHoleSource = atoi(argv[++i]);
      if (options.minHoleSource < 1) {
        cerr << "extract error: --minHoleSource should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) {
      options.maxSymbolsTarget = atoi(argv[++i]);
      if (options.maxSymbolsTarget < 1) {
        cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
      options.maxSymbolsSource = atoi(argv[++i]);
      if (options.maxSymbolsSource < 1) {
        cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
        exit(1);
      }
    }
    // minimum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MinWords") == 0) {
      options.minWords = atoi(argv[++i]);
      if (options.minWords < 0) {
        cerr << "extract error: --MinWords should be at least 0" << endl;
        exit(1);
      }
    }
    // maximum number of non-terminals
    else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
      options.maxNonTerm = atoi(argv[++i]);
      if (options.maxNonTerm < 1) {
        cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum scope (see Hopkins and Langmead (2010))
    else if (strcmp(argv[i],"--MaxScope") == 0) {
      options.maxScope = atoi(argv[++i]);
      if (options.maxScope < 0) {
        cerr << "extract error: --MaxScope should be at least 0" << endl;
        exit(1);
      }
    }
    else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.gzOutput = true;  
    } 
    // allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      options.targetSyntax = true;
    } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      options.sourceSyntax = true;
    } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
      options.requireAlignedWord = false;
    } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) {
      options.nonTermConsecTarget = false;
    } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) {
      options.nonTermConsecSource = true;
    } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) {
      options.nonTermFirstWord = false;
    } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      options.onlyOutputSpanInfo = true;
    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
      options.onlyDirectFlag = true;
    } else if (strcmp(argv[i],"--GlueGrammar") == 0) {
      options.glueGrammarFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
        exit(0);
      }
      fileNameGlueGrammar = string(argv[i]);
      cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
      options.unknownWordLabelFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
        exit(0);
      }
      fileNameUnknownWordLabel = string(argv[i]);
      cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
    }
    // TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    //  zipFiles = true;
    //}
    // if an source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
      options.fractionalCounting = false;
    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
      options.outputNTLengths = true;
#ifdef WITH_THREADS
    } else if (strcmp(argv[i],"-threads") == 0 || 
               strcmp(argv[i],"--threads") == 0 ||
               strcmp(argv[i],"--Threads") == 0) {
      thread_count = atoi(argv[++i]);
#endif
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  cerr << "extracting hierarchical rules" << endl;

  // open input files
  Moses::InputFileStream tFile(fileNameT);
  Moses::InputFileStream sFile(fileNameS);
  Moses::InputFileStream aFile(fileNameA);

  istream *tFileP = &tFile;
  istream *sFileP = &sFile;
  istream *aFileP = &aFile;

  // open output files
  string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":"");
  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  extractFile.Open((fileNameExtract  + (options.gzOutput?".gz":"")).c_str());
  if (!options.onlyDirectFlag)
    extractFileInv.Open(fileNameExtractInv.c_str());

  // output into file
  Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);
  Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);

  // stats on labels for glue grammar and unknown word label probabilities
  set< string > targetLabelCollection, sourceLabelCollection;
  map< string, int > targetTopLabelCollection, sourceTopLabelCollection;

#ifdef WITH_THREADS
  // set up thread pool
  Moses::ThreadPool pool(thread_count);
  pool.SetQueueLimit(1000);
#endif

  // loop through all sentence pairs
  size_t i=0;
  while(true) {
    i++;
    if (i%1000 == 0) cerr << "." << flush;
    if (i%10000 == 0) cerr << ":" << flush;
    if (i%100000 == 0) cerr << "!" << flush;
    char targetString[LINE_MAX_LENGTH];
    char sourceString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (tFileP->eof()) break;
    SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);

    SentenceAlignmentWithSyntax *sentence = new SentenceAlignmentWithSyntax
      (targetLabelCollection, sourceLabelCollection, 
       targetTopLabelCollection, sourceTopLabelCollection, options);
    //az: output src, tgt, and alingment line
    if (options.onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << sourceString << endl;
      cout << "LOG: TGT: " << targetString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

    if (sentence->create(targetString, sourceString, alignmentString, i)) {
      if (options.unknownWordLabelFlag) {
        collectWordLabelCounts(*sentence);
      }
      ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector, extractCollectorInv);
#ifdef WITH_THREADS
      if (thread_count == 1) {
        task->Run();
        delete task;
      }
      else {
        pool.Submit(task);
      }
#else
      task->Run();
      delete task;
#endif
    }
    if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

#ifdef WITH_THREADS
  // wait for all threads to finish
  pool.Stop(true);
#endif

  tFile.Close();
  sFile.Close();
  aFile.Close();
  // only close if we actually opened it
  if (!options.onlyOutputSpanInfo) {
    extractFile.Close();
    if (!options.onlyDirectFlag) extractFileInv.Close();
  }

  if (options.glueGrammarFlag)
    writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection);

  if (options.unknownWordLabelFlag)
    writeUnknownWordLabel(fileNameUnknownWordLabel);
}
int main(int argc, char* argv[])
{
  cerr << "extract-rules, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";

  RuleExtractionOptions options;
  int sentenceOffset = 0;
#ifdef WITH_THREADS
  int thread_count = 1;
#endif
  if (argc < 5) {
    cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["

         << " --GlueGrammar FILE"
         << " | --UnknownWordLabel FILE"
         << " | --OnlyDirect"
         << " | --MaxSpan[" << options.maxSpan << "]"
         << " | --MinHoleTarget[" << options.minHoleTarget << "]"
         << " | --MinHoleSource[" << options.minHoleSource << "]"
         << " | --MinWords[" << options.minWords << "]"
         << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]"
         << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]"
         << " | --MaxNonTerm[" << options.maxNonTerm << "]"
         << " | --MaxScope[" << options.maxScope << "]"
         << " | --SourceSyntax | --TargetSyntax"
         << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget |--NonTermConsecSource |  --NoNonTermFirstWord | --NoFractionalCounting"
         << " | --UnpairedExtractFormat"
         << " | --ConditionOnTargetLHS ]"
         << " | --BoundaryRules[" << options.boundaryRules << "]"
         << " | --FlexibilityScore\n";

    exit(1);
  }
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
  string fileNameGlueGrammar;
  string fileNameUnknownWordLabel;
  string fileNameExtract = string(argv[4]);

  int optionInd = 5;

  for(int i=optionInd; i<argc; i++) {
    // maximum span length
    if (strcmp(argv[i],"--MaxSpan") == 0) {
      options.maxSpan = atoi(argv[++i]);
      if (options.maxSpan < 1) {
        cerr << "extract error: --maxSpan should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleTarget") == 0) {
      options.minHoleTarget = atoi(argv[++i]);
      if (options.minHoleTarget < 1) {
        cerr << "extract error: --minHoleTarget should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleSource") == 0) {
      options.minHoleSource = atoi(argv[++i]);
      if (options.minHoleSource < 1) {
        cerr << "extract error: --minHoleSource should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) {
      options.maxSymbolsTarget = atoi(argv[++i]);
      if (options.maxSymbolsTarget < 1) {
        cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
      options.maxSymbolsSource = atoi(argv[++i]);
      if (options.maxSymbolsSource < 1) {
        cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
        exit(1);
      }
    }
    // minimum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MinWords") == 0) {
      options.minWords = atoi(argv[++i]);
      if (options.minWords < 0) {
        cerr << "extract error: --MinWords should be at least 0" << endl;
        exit(1);
      }
    }
    // maximum number of non-terminals
    else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
      options.maxNonTerm = atoi(argv[++i]);
      if (options.maxNonTerm < 1) {
        cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum scope (see Hopkins and Langmead (2010))
    else if (strcmp(argv[i],"--MaxScope") == 0) {
      options.maxScope = atoi(argv[++i]);
      if (options.maxScope < 0) {
        cerr << "extract error: --MaxScope should be at least 0" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.gzOutput = true;
    }
    // allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      options.targetSyntax = true;
    } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      options.sourceSyntax = true;
    } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
      options.requireAlignedWord = false;
    } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) {
      options.nonTermConsecTarget = false;
    } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) {
      options.nonTermConsecSource = true;
    } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) {
      options.nonTermFirstWord = false;
    } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      options.onlyOutputSpanInfo = true;
    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
      options.onlyDirectFlag = true;
    } else if (strcmp(argv[i],"--GlueGrammar") == 0) {
      options.glueGrammarFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
        exit(0);
      }
      fileNameGlueGrammar = string(argv[i]);
      cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
      options.unknownWordLabelFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
        exit(0);
      }
      fileNameUnknownWordLabel = string(argv[i]);
      cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
    }
    // TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    //  zipFiles = true;
    //}
    // if an source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
      options.fractionalCounting = false;
    } else if (strcmp(argv[i],"--PCFG") == 0) {
      options.pcfgScore = true;
    } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
      options.unpairedExtractFormat = true;
    } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
      options.conditionOnTargetLhs = true;
    } else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
      options.flexScoreFlag = true;
    } else if (strcmp(argv[i],"-threads") == 0 ||
               strcmp(argv[i],"--threads") == 0 ||
               strcmp(argv[i],"--Threads") == 0) {
#ifdef WITH_THREADS
      thread_count = atoi(argv[++i]);
#else
      cerr << "thread support not compiled in." << '\n';
      exit(1);
#endif
    } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
        exit(1);
      }
      sentenceOffset = atoi(argv[++i]);
    } else if (strcmp(argv[i],"--BoundaryRules") == 0) {
      options.boundaryRules = true;
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  cerr << "extracting hierarchical rules" << endl;

  // open input files
  Moses::InputFileStream tFile(fileNameT);
  Moses::InputFileStream sFile(fileNameS);
  Moses::InputFileStream aFile(fileNameA);

  istream *tFileP = &tFile;
  istream *sFileP = &sFile;
  istream *aFileP = &aFile;

  // open output files
  string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":"");
  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  Moses::OutputFileStream extractFileContext;
  Moses::OutputFileStream extractFileContextInv;
  extractFile.Open((fileNameExtract  + (options.gzOutput?".gz":"")).c_str());
  if (!options.onlyDirectFlag)
    extractFileInv.Open(fileNameExtractInv.c_str());

  if (options.flexScoreFlag) {
    string fileNameExtractContext = fileNameExtract + ".context" + (options.gzOutput?".gz":"");
    extractFileContext.Open(fileNameExtractContext.c_str());
    if (!options.onlyDirectFlag) {
      string fileNameExtractContextInv = fileNameExtract + ".context.inv" + (options.gzOutput?".gz":"");
      extractFileContextInv.Open(fileNameExtractContextInv.c_str());
    }
  }

  // stats on labels for glue grammar and unknown word label probabilities
  set< string > targetLabelCollection, sourceLabelCollection;
  map< string, int > targetTopLabelCollection, sourceTopLabelCollection;

  // loop through all sentence pairs
  size_t i=sentenceOffset;
  string targetString, sourceString, alignmentString;

  while(getline(*tFileP, targetString)) {
    i++;

    getline(*sFileP, sourceString);
    getline(*aFileP, alignmentString);

    if (i%1000 == 0) cerr << i << " " << flush;

    SentenceAlignmentWithSyntax sentence
    (targetLabelCollection, sourceLabelCollection,
     targetTopLabelCollection, sourceTopLabelCollection, options);
    //az: output src, tgt, and alingment line
    if (options.onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << sourceString << endl;
      cout << "LOG: TGT: " << targetString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

    if (sentence.create(targetString.c_str(), sourceString.c_str(), alignmentString.c_str(),"", i, options.boundaryRules)) {
      if (options.unknownWordLabelFlag) {
        collectWordLabelCounts(sentence);
      }
      ExtractTask *task = new ExtractTask(sentence, options, extractFile, extractFileInv, extractFileContext, extractFileContextInv);
      task->Run();
      delete task;
    }
    if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

  tFile.Close();
  sFile.Close();
  aFile.Close();
  // only close if we actually opened it
  if (!options.onlyOutputSpanInfo) {
    extractFile.Close();
    if (!options.onlyDirectFlag) extractFileInv.Close();
  }

  if (options.flexScoreFlag) {
    extractFileContext.Close();
    if (!options.onlyDirectFlag) extractFileContextInv.Close();
  }

  if (options.glueGrammarFlag)
    writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection);

  if (options.unknownWordLabelFlag)
    writeUnknownWordLabel(fileNameUnknownWordLabel);
}
int main(int argc, char* argv[])
{
  cerr	<< "PhraseExtract v1.4, written by Philipp Koehn\n"
        << "phrase extraction from an aligned parallel corpus\n";

  if (argc < 6) {
    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
    cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n";
    exit(1);
  }

  Moses::OutputFileStream extractFileOrientation;
  const char* const &fileNameE = argv[1];
  const char* const &fileNameF = argv[2];
  const char* const &fileNameA = argv[3];
  const string fileNameExtract = string(argv[4]);
  PhraseExtractionOptions options(atoi(argv[5]));

  for(int i=6; i<argc; i++) {
    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      options.initOnlyOutputSpanInfo(true);
    } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
      options.initOrientationFlag(true);
    } else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
      options.initFlexScoreFlag(true);
    } else if (strcmp(argv[i],"--NoTTable") == 0) {
      options.initTranslationFlag(false);
    } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
      options.initIncludeSentenceIdFlag(true);
    } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
        exit(1);
      }
      sentenceOffset = atoi(argv[++i]);
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.initGzOutput(true);
    } else if (strcmp(argv[i], "--InstanceWeights") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl;
        exit(1);
      }
      options.initInstanceWeightsFile(argv[++i]);
    } else if (strcmp(argv[i], "--Debug") == 0) {
    	options.debug = true;
    } else if (strcmp(argv[i], "--MinPhraseLength") == 0) {
    	options.minPhraseLength = atoi(argv[++i]);
    } else if (strcmp(argv[i], "--Separator") == 0) {
    	options.separator = argv[++i];
    } else if(strcmp(argv[i],"--model") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
        exit(1);
      }
      char*  modelParams = argv[++i];
      char*  modelName = strtok(modelParams, "-");
      char*  modelType = strtok(NULL, "-");

      // REO_MODEL_TYPE intModelType;

      if(strcmp(modelName, "wbe") == 0) {
        options.initWordModel(true);
        if(strcmp(modelType, "msd") == 0)
          options.initWordType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
          options.initWordType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          options.initWordType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "phrase") == 0) {
        options.initPhraseModel(true);
        if(strcmp(modelType, "msd") == 0)
          options.initPhraseType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
          options.initPhraseType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          options.initPhraseType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "hier") == 0) {
        options.initHierModel(true);
        if(strcmp(modelType, "msd") == 0)
          options.initHierType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
          options.initHierType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          options.initHierType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else {
        cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
        exit(1);
      }

      options.initAllModelsOutputFlag(true);
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  // default reordering model if no model selected
  // allows for the old syntax to be used
  if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
    options.initWordModel(true);
    options.initWordType(REO_MSD);
  }

  // open input files
  Moses::InputFileStream eFile(fileNameE);
  Moses::InputFileStream fFile(fileNameF);
  Moses::InputFileStream aFile(fileNameA);

  istream *eFileP = &eFile;
  istream *fFileP = &fFile;
  istream *aFileP = &aFile;

  istream *iwFileP = NULL;
  auto_ptr<Moses::InputFileStream> instanceWeightsFile;
  if (options.getInstanceWeightsFile().length()) {
    instanceWeightsFile.reset(new Moses::InputFileStream(options.getInstanceWeightsFile()));
    iwFileP = instanceWeightsFile.get();
  }

  // open output files
  if (options.isOrientationFlag()) {
    string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
    extractFileOrientation.Open(fileNameExtractOrientation.c_str());
  }

  int i = sentenceOffset;

  while(true) {
    i++;
    if (i%10000 == 0) cerr << "." << flush;
    char englishString[LINE_MAX_LENGTH];
    char foreignString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    char weightString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (eFileP->eof()) break;
    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (iwFileP) {
      SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
    }
    SentenceAlignment sentence;
    // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
    //az: output src, tgt, and alingment line
    if (options.isOnlyOutputSpanInfo()) {
      cout << "LOG: SRC: " << foreignString << endl;
      cout << "LOG: TGT: " << englishString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }
    if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
      ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation);
      task->Run();
      delete task;

    }
    if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

  eFile.Close();
  fFile.Close();
  aFile.Close();

  //az: only close if we actually opened it
  if (!options.isOnlyOutputSpanInfo()) {
    if (options.isOrientationFlag()) {
      extractFileOrientation.Close();
    }
  }
}