Beispiel #1
0
/**
 * Command-line entry point of the phrase extractor (v1.3.0).
 *
 * Syntax: phrase-extract en de align extract max-length [orientation]
 *
 * argv[1..3] name the English, foreign and alignment input files, argv[4] is
 * stored in fileNameExtract, argv[5] is parsed into maxPhraseLength and the
 * mere presence of a seventh argument switches on orientationFlag (all three
 * are variables defined outside this function).
 *
 * The three inputs are read in lock step, one line each per iteration. Every
 * triple is turned into a SentenceAlignment and, when create() succeeds,
 * handed to extract(). Reading stops at the end of the English file.
 *
 * Exits with status 1 on a wrong argument count or when an input file cannot
 * be opened.
 */
int main(int argc, char* argv[]) 
{
  cerr << "PhraseExtract v1.3.0, written by Philipp Koehn\n"
       << "phrase extraction from an aligned parallel corpus\n";

  if (argc != 6 && argc != 7) {
    cerr << "syntax: phrase-extract en de align extract max-length [orientation]\n";
    exit(1);
  }
  char* &fileNameE = argv[1];
  char* &fileNameF = argv[2];
  char* &fileNameA = argv[3];
  fileNameExtract = argv[4];
  maxPhraseLength = atoi(argv[5]);
  orientationFlag = (argc == 7);
  if (orientationFlag) cerr << "(also extracting orientation)\n";

  // Open the inputs and fail early: a stream that never opened would not
  // report a usable end-of-file condition to the read loop below.
  ifstream eFile;
  ifstream fFile;
  ifstream aFile;
  eFile.open(fileNameE);
  if (!eFile.is_open()) {
    cerr << "extract: cannot open file '" << fileNameE << "'\n";
    exit(1);
  }
  fFile.open(fileNameF);
  if (!fFile.is_open()) {
    cerr << "extract: cannot open file '" << fileNameF << "'\n";
    exit(1);
  }
  aFile.open(fileNameA);
  if (!aFile.is_open()) {
    cerr << "extract: cannot open file '" << fileNameA << "'\n";
    exit(1);
  }
  istream *eFileP = &eFile;
  istream *fFileP = &fFile;
  istream *aFileP = &aFile;

  // i is the 1-based number of the sentence pair currently being read
  int i=0;
  while(true) {
    i++;
    if (i%10000 == 0) cerr << "." << flush; // progress dot every 10000 lines
    char englishString[LINE_MAX_LENGTH];
    char foreignString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n');
    if (eFileP->eof()) break; // the English file drives the loop
    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n');
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n');
    SentenceAlignment sentence;
    // only sentence pairs that create() accepts are used for extraction
    if (sentence.create( englishString, foreignString, alignmentString, i ))
      extract(sentence);
  }

  eFile.close();
  fFile.close();
  aFile.close();
  // the output streams are declared outside this function
  extractFile.close();
  extractFileInv.close();
  return 0;
}
/**
 * Extracts the phrase pairs for one occurrence of a source phrase.
 *
 * @param sntIndex    index of the sentence containing the occurrence
 * @param wordIndex   corpus position of the last word of the occurrence
 * @param sourceSize  number of words in the matched phrase
 * @param phrasePairs container handed on to SentenceAlignment::Extract()
 * @param trg2Src     forwarded to GetSentenceAlignment(); selects which
 *                    direction of the alignment is fetched
 * @return whatever SentenceAlignment::Extract() returns
 */
bool BilingualDynSuffixArray::ExtractPhrases(const int& sntIndex, const int& wordIndex,	
	const int& sourceSize, std::vector<PhrasePair*>& phrasePairs, bool trg2Src) const 
{
	// Works for either direction: the caller picks it through trg2Src.
	SentenceAlignment alignment = GetSentenceAlignment(sntIndex, trg2Src);

	// Convert the corpus-wide word position into a sentence-relative span:
	// the phrase ends at spanEnd and covers sourceSize words before and
	// including it.
	int sentenceStart = m_srcSntBreaks[sntIndex];
	int spanEnd = wordIndex - sentenceStart;
	int spanBegin = spanEnd - sourceSize + 1;

	// collect every phrase alignment of the sentence for that span
	return alignment.Extract(m_maxPhraseLength, phrasePairs, spanBegin, spanEnd);
}
int main(int argc, char* argv[]) {
  cerr	<< "PhraseExtract " << extract_version << endl << "Written by Philipp Koehn" << endl
				<< "Modified by Ventsislav Zhechev, Autodesk Development Sàrl" << endl
				<< "phrase extraction from an aligned parallel corpus" << endl
	;
	
  if (argc < 6) {
    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono]-max_distance ] | --OnlyOutputSpanInfo]\n";
    exit(1);
  }
	
  const string fileNameE = argv[1];
  const string fileNameF = argv[2];
  const string fileNameA = argv[3];
  const string fileNameExtract = argv[4];
  maxPhraseLength = atoi(argv[5]);
	
  for (int i=6; i<argc; ++i) {
    if (strcmp(argv[i], "--OnlyOutputSpanInfo") == 0)
      onlyOutputSpanInfo = true;
    else if (strcmp(argv[i], "orientation") == 0 || strcmp(argv[i], "--Orientation") == 0)
      orientationFlag = true;
		else if (strcmp(argv[i], "--pipeOut") == 0)
			pipeOut = true;
    else if (strcmp(argv[i], "--model") == 0) {
      if (i+1 >= argc) {
				cerr << "extract: syntax error, no model information provided to the option --model " << endl;
				exit(1);
      }
			
			
      const string modelParams = argv[++i];
      const string modelName = modelParams.substr(0, modelParams.find('-'));
      const string modelType = modelParams.substr(modelParams.find('-') + 1);
//			modelMaxDistance = modelParams.find('-', modelParams.find('-') + 1) ? atoi(modelParams.substr(modelParams.find_last_of('-') + 1).c_str()) : 6;
			
      if (modelName == "wbe"){
				wordModel = true;
				if (modelType == "msd")
					wordType = REO_MSD;
				else if (modelType == "mslr")
					wordType = REO_MSLR;
				else if (modelType == "mono" || modelType == "monotonicity")
					wordType = REO_MONO;
				else {
					cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
					exit(1);
				}
      } else if (modelName == "phrase") {
				phraseModel = true;
				if (modelType == "msd")
					phraseType = REO_MSD;
				else if (modelType == "mslr")
					phraseType = REO_MSLR;
				else if (modelType == "mono" || modelType == "monotonicity")
					phraseType = REO_MONO;
				else {
					cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
					exit(1);
				}
      } else if (modelName == "hier") {
				hierModel = true;
				if (modelType == "msd")
					hierType = REO_MSD;
				else if (modelType == "mslr")
					hierType = REO_MSLR;
				else if (modelType == "mono" || modelType == "monotonicity")
					hierType = REO_MONO;
				else {
					cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
					exit(1);
				}
      } else {
				cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
				exit(1);
      }
			
      allModelsOutputFlag = true;
    } else {
      cerr << "extract: syntax error, unknown option '" << argv[i] << "'" << endl;
      exit(1);
    }
  }
	
  // default reordering model if no model selected
  // allows for the old syntax to be used
  if(orientationFlag && !allModelsOutputFlag) {
    wordModel = true;
    wordType = REO_MSD;
  }
	
  // open input files
	Bz2LineReader eFile(fileNameE);
	Bz2LineReader fFile(fileNameF);
	Bz2LineReader aFile(fileNameA);
	
  // open output files
	cerr << "Outputting to " << (pipeOut ? "pipes" : "bzip2-ed files") << "…" << endl;
	string extention = pipeOut ? ".pipe" : ".bz2";
  extractFile = new Bz2LineWriter(fileNameExtract + extention);
  extractFileInv = new Bz2LineWriter(fileNameExtract + ".inv" + extention);
  if (orientationFlag)
    extractFileOrientation = new Bz2LineWriter(fileNameExtract + ".o" + extention);
	
	for (int i = 0;;) {
		if ((++i)%500000 == 0) cerr << "[extract:" << i << "]" << flush;
    else if (i%10000 == 0) cerr << "." << flush;
		
		string englishString = eFile.readLine();
		if (englishString.empty()) {
//			cerr << "Finished extraction at line " << i << "!" << endl;
			break;
		}
		string foreignString = fFile.readLine();
		string alignmentString = aFile.readLine();

    SentenceAlignment sentence;

    //az: output src, tgt, and alingment line
    if (onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << foreignString << endl;
      cout << "LOG: TGT: " << englishString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }
		
    if (sentence.create(englishString, foreignString, alignmentString, i))
      extract(sentence);
    if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }
	
  eFile.close();
  fFile.close();
  aFile.close();
  //az: only close if we actually opened it
	if (!onlyOutputSpanInfo) {
		extractFile->close();
		extractFileInv->close();
		if (orientationFlag) extractFileOrientation->close();
	}
}
Beispiel #4
0
/**
 * Command-line entry point of the phrase extractor (v1.4).
 *
 * Syntax: phrase-extract en de align extract max-length [options]
 *
 * Recognised options: "orientation" / "--Orientation", "--OnlyOutputSpanInfo",
 * "--NoFileLimit", "--ZipFiles" and "--ProperConditioning". Each one sets the
 * corresponding flag variable defined outside this function; any other
 * argument is a syntax error.
 *
 * The three inputs are read in lock step, one line each per iteration. Every
 * triple is turned into a SentenceAlignment; when create() succeeds it is
 * passed to extract() and, with --ProperConditioning, also to extractBase().
 * Reading stops at the end of the English file.
 *
 * Exits with status 1 on a usage error or when an input file cannot be opened.
 */
int main(int argc, char* argv[]) 
{
  cerr << "PhraseExtract v1.4, written by Philipp Koehn\n"
       << "phrase extraction from an aligned parallel corpus\n";

  if (argc < 6) {
    cerr << "syntax: phrase-extract en de align extract max-length [orientation | --OnlyOutputSpanInfo | --NoFileLimit | --ProperConditioning ]\n";
    exit(1);
  }
  char* &fileNameE = argv[1];
  char* &fileNameF = argv[2];
  char* &fileNameA = argv[3];
  fileNameExtract = argv[4];
  maxPhraseLength = atoi(argv[5]);
  
  // everything after the five positional arguments is an on/off switch
  for(int i=6;i<argc;i++) {
    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      onlyOutputSpanInfo = true;
    }
    else if (strcmp(argv[i],"--NoFileLimit") == 0) {
      noFileLimit = true;
    }
    else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
      orientationFlag = true;
    }
    else if (strcmp(argv[i],"--ZipFiles") == 0) {
      zipFiles = true;
    }
    else if (strcmp(argv[i],"--ProperConditioning") == 0) {
      properConditioning = true;
    }
    else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  // Open the inputs and fail early: a stream that never opened would not
  // report a usable end-of-file condition to the read loop below.
  ifstream eFile;
  ifstream fFile;
  ifstream aFile;
  eFile.open(fileNameE);
  if (!eFile.is_open()) {
    cerr << "extract: cannot open file '" << fileNameE << "'\n";
    exit(1);
  }
  fFile.open(fileNameF);
  if (!fFile.is_open()) {
    cerr << "extract: cannot open file '" << fileNameF << "'\n";
    exit(1);
  }
  aFile.open(fileNameA);
  if (!aFile.is_open()) {
    cerr << "extract: cannot open file '" << fileNameA << "'\n";
    exit(1);
  }
  istream *eFileP = &eFile;
  istream *fFileP = &fFile;
  istream *aFileP = &aFile;
  
  // i is the 1-based number of the sentence pair currently being read
  int i=0;
  while(true) {
    i++;
    if (i%10000 == 0) cerr << "." << flush; // progress dot every 10000 lines
    char englishString[LINE_MAX_LENGTH];
    char foreignString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n');
    if (eFileP->eof()) break; // the English file drives the loop
    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n');
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n');
    SentenceAlignment sentence;
    //az: output src, tgt, and alignment line
    if (onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << foreignString << endl;
      cout << "LOG: TGT: " << englishString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }
      
    if (sentence.create( englishString, foreignString, alignmentString, i )) {
      extract(sentence);
      if (properConditioning) extractBase(sentence);
    }
    if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

  eFile.close();
  fFile.close();
  aFile.close();
  //az: only close if we actually opened it
  // (the output streams are declared and opened outside this function)
  if (!onlyOutputSpanInfo) {
    extractFile.close();
    extractFileInv.close();
    if (orientationFlag) extractFileOrientation.close();
  }
  return 0;
}
int main(int argc, char* argv[])
{
  cerr	<< "PhraseExtract v1.4, written by Philipp Koehn\n"
        << "phrase extraction from an aligned parallel corpus\n";

  if (argc < 6) {
    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
    exit(1);
  }
  char* &fileNameE = argv[1];
  char* &fileNameF = argv[2];
  char* &fileNameA = argv[3];
  string fileNameExtract = string(argv[4]);
  maxPhraseLength = atoi(argv[5]);

  for(int i=6; i<argc; i++) {
    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      onlyOutputSpanInfo = true;
    } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
      orientationFlag = true;
    } else if (strcmp(argv[i],"--NoTTable") == 0) {
      translationFlag = false;
    } else if (strcmp(argv[i], "--SentenceId") == 0) {
      sentenceIdFlag = true;  
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      gzOutput = true;  
    } else if(strcmp(argv[i],"--model") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
        exit(1);
      }
      char* modelParams = argv[++i];
      char* modelName = strtok(modelParams, "-");
      char* modelType = strtok(NULL, "-");

      REO_MODEL_TYPE intModelType;

      if(strcmp(modelName, "wbe") == 0) {
        wordModel = true;
        if(strcmp(modelType, "msd") == 0)
          wordType = REO_MSD;
        else if(strcmp(modelType, "mslr") == 0)
          wordType = REO_MSLR;
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          wordType = REO_MONO;
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "phrase") == 0) {
        phraseModel = true;
        if(strcmp(modelType, "msd") == 0)
          phraseType = REO_MSD;
        else if(strcmp(modelType, "mslr") == 0)
          phraseType = REO_MSLR;
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          phraseType = REO_MONO;
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "hier") == 0) {
        hierModel = true;
        if(strcmp(modelType, "msd") == 0)
          hierType = REO_MSD;
        else if(strcmp(modelType, "mslr") == 0)
          hierType = REO_MSLR;
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          hierType = REO_MONO;
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else {
        cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
        exit(1);
      }

      allModelsOutputFlag = true;
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  // default reordering model if no model selected
  // allows for the old syntax to be used
  if(orientationFlag && !allModelsOutputFlag) {
    wordModel = true;
    wordType = REO_MSD;
  }

  // open input files
  Moses::InputFileStream eFile(fileNameE);
  Moses::InputFileStream fFile(fileNameF);
  Moses::InputFileStream aFile(fileNameA);

  istream *eFileP = &eFile;
  istream *fFileP = &fFile;
  istream *aFileP = &aFile;

  // open output files
  if (translationFlag) {
    string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
    extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
    extractFileInv.Open(fileNameExtractInv.c_str());
  }
  if (orientationFlag) {
    string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
    extractFileOrientation.Open(fileNameExtractOrientation.c_str());
  }

  if (sentenceIdFlag) {
    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
    extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
  }

  int i=0;
  while(true) {
    i++;
    if (i%10000 == 0) cerr << "." << flush;
    char englishString[LINE_MAX_LENGTH];
    char foreignString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (eFileP->eof()) break;
    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
    SentenceAlignment sentence;
    // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
    //az: output src, tgt, and alingment line
    if (onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << foreignString << endl;
      cout << "LOG: TGT: " << englishString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

    if (sentence.create( englishString, foreignString, alignmentString, i)) {
      extract(sentence);
    }
    if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }
  eFile.Close();
  fFile.Close();
  aFile.Close();
  //az: only close if we actually opened it
  if (!onlyOutputSpanInfo) {
    if (translationFlag) {
      extractFile.Close();
      extractFileInv.Close();
    }
    if (orientationFlag) extractFileOrientation.Close();
    if (sentenceIdFlag) {
      extractFileSentenceId.Close();
    }
  }
}
Beispiel #6
0
int main(int argc, char* argv[]) 
{
  cerr << "Extract v2.0, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";
  //time_t starttime = time(NULL);
	
	Global *global = new Global();
	g_global = global;
	int sentenceOffset = 0;
		
	if (argc < 5) {
		cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract "
		     << " [ --Hierarchical | --Orientation"
				 << " | --GlueGrammar FILE | --UnknownWordLabel FILE"
				 << " | --OnlyDirect"
					
					<< " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]"
					<< " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]"
					<< " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]"
					<< " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]"

				<< " | --MaxSymbols[" << global->maxSymbols<< "]"
				 << " | --MaxNonTerm[" << global->maxNonTerm << "]"
		     << " | --SourceSyntax | --TargetSyntax" 
				<<	" | --UppermostOnly[" << g_global->uppermostOnly << "]"
				<< endl;
		exit(1);
	}
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
	string fileNameGlueGrammar;
 	string fileNameUnknownWordLabel;
	string fileNameExtract = string(argv[4]);

	int optionInd = 5;

  for(int i=optionInd;i<argc;i++) 
	{
		if (strcmp(argv[i],"--MinHoleSpanSourceDefault") == 0) {
			global->minHoleSpanSourceDefault = atoi(argv[++i]);
			if (global->minHoleSpanSourceDefault < 1) {
				cerr << "extract error: --minHoleSourceDefault should be at least 1" << endl;
				exit(1);
			}
		}
		else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) {
			global->maxHoleSpanSourceDefault = atoi(argv[++i]);
			if (global->maxHoleSpanSourceDefault < 1) {
				cerr << "extract error: --maxHoleSourceDefault should be at least 1" << endl;
				exit(1);
			}
		}
		else  if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) {
			global->minHoleSpanSourceSyntax = atoi(argv[++i]);
			if (global->minHoleSpanSourceSyntax < 1) {
				cerr << "extract error: --minHoleSourceSyntax should be at least 1" << endl;
				exit(1);
			}
		}
		else if (strcmp(argv[i],"--UppermostOnly") == 0) {
			global->uppermostOnly = atoi(argv[++i]);
		}
		else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) {
			global->maxHoleSpanSourceSyntax = atoi(argv[++i]);
			if (global->maxHoleSpanSourceSyntax < 1) {
				cerr << "extract error: --maxHoleSourceSyntax should be at least 1" << endl;
				exit(1);
			}
		}
		
		// maximum number of words in hierarchical phrase
		else if (strcmp(argv[i],"--maxSymbols") == 0) {
			global->maxSymbols = atoi(argv[++i]);
			if (global->maxSymbols < 1) {
				cerr << "extract error: --maxSymbols should be at least 1" << endl;
				exit(1);
			}
		}
		// maximum number of non-terminals
		else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
			global->maxNonTerm = atoi(argv[++i]);
			if (global->maxNonTerm < 1) {
				cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
				exit(1);
			}
		}		
		// allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      global->targetSyntax = true;
    }
    else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      global->sourceSyntax = true;
    }
		// do not create many part00xx files!
    else if (strcmp(argv[i],"--NoFileLimit") == 0) {
      // now default
    }
		else if (strcmp(argv[i],"--GlueGrammar") == 0) {
			global->glueGrammarFlag = true;
			if (++i >= argc)
			{
				cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
				exit(0);
			}
			fileNameGlueGrammar = string(argv[i]);
			cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    }
		else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
			global->unknownWordLabelFlag = true;
			if (++i >= argc)
			{
				cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
				exit(0);
			}
			fileNameUnknownWordLabel = string(argv[i]);
			cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
		}
		// TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    //  zipFiles = true;
    //}
		// if an source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--Mixed") == 0) {
			global->mixed = true;
    }
		else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) {
			global->allowDefaultNonTermEdge = atoi(argv[++i]);
    }
		else if (strcmp(argv[i], "--GZOutput") == 0) {
      global->gzOutput = true;
    }
		else if (strcmp(argv[i],"--MaxSpan") == 0) {
		  // ignore
      ++i;
		}
    else if (strcmp(argv[i],"--SentenceOffset") == 0) {
      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
        exit(1);
      }
      sentenceOffset = atoi(argv[++i]);
    }
    else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }


	// open input files
	Moses::InputFileStream tFile(fileNameT);
	Moses::InputFileStream sFile(fileNameS);
	Moses::InputFileStream aFile(fileNameA);

	// open output files
  string fileNameExtractInv = fileNameExtract + ".inv";
  if (global->gzOutput) {
    fileNameExtract += ".gz";
    fileNameExtractInv += ".gz";
  }

  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  extractFile.Open(fileNameExtract.c_str());
  extractFileInv.Open(fileNameExtractInv.c_str());
  
  
	// loop through all sentence pairs
  int i = sentenceOffset;
  while(true) {
    i++;

    if (i % 1000 == 0) {
      cerr << i << " " << flush;
    }

    string targetString;
    string sourceString;
    string alignmentString;
		
		bool ok = getline(tFile, targetString);
		if (!ok)
			break;
		getline(sFile, sourceString);
		getline(aFile, alignmentString);
    
		//cerr << endl << targetString << endl << sourceString << endl << alignmentString << endl;

		//time_t currTime = time(NULL);
		//cerr << "A " << (currTime - starttime) << endl;

    SentenceAlignment sentencePair;
    if (sentencePair.Create( targetString, sourceString, alignmentString, i, *global )) 
		{			
			//cerr << sentence.sourceTree << endl;
			//cerr << sentence.targetTree << endl;

			sentencePair.FindTunnels(*g_global);
			//cerr << "C " << (time(NULL) - starttime) << endl;
			//cerr << sentencePair << endl;
			
			sentencePair.CreateLattice(*g_global);
			//cerr << "D " << (time(NULL) - starttime) << endl;
			//cerr << sentencePair << endl;

			sentencePair.CreateRules(*g_global);
			//cerr << "E " << (time(NULL) - starttime) << endl;

			//cerr << sentence.lattice->GetRules().GetSize() << endl;
			sentencePair.GetLattice().GetRules().Output(extractFile);
      sentencePair.GetLattice().GetRules().OutputInv(extractFileInv);
    }
  }
	
  tFile.Close();
  sFile.Close();
  aFile.Close();

  extractFile.Close();
  extractFileInv.Close();

  if (global->glueGrammarFlag) {
    writeGlueGrammar(fileNameGlueGrammar, *global, targetLabelCollection, targetTopLabelCollection);
  }

  delete global;
}
int main(int argc, char* argv[])
{
  cerr	<< "PhraseExtract v1.4, written by Philipp Koehn\n"
        << "phrase extraction from an aligned parallel corpus\n";

  if (argc < 6) {
    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
    cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n";
    exit(1);
  }

  Moses::OutputFileStream extractFileOrientation;
  const char* const &fileNameE = argv[1];
  const char* const &fileNameF = argv[2];
  const char* const &fileNameA = argv[3];
  const string fileNameExtract = string(argv[4]);
  PhraseExtractionOptions options(atoi(argv[5]));

  for(int i=6; i<argc; i++) {
    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      options.initOnlyOutputSpanInfo(true);
    } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
      options.initOrientationFlag(true);
    } else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
      options.initFlexScoreFlag(true);
    } else if (strcmp(argv[i],"--NoTTable") == 0) {
      options.initTranslationFlag(false);
    } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
      options.initIncludeSentenceIdFlag(true);
    } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
        exit(1);
      }
      sentenceOffset = atoi(argv[++i]);
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.initGzOutput(true);
    } else if (strcmp(argv[i], "--InstanceWeights") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl;
        exit(1);
      }
      options.initInstanceWeightsFile(argv[++i]);
    } else if (strcmp(argv[i], "--Debug") == 0) {
    	options.debug = true;
    } else if (strcmp(argv[i], "--MinPhraseLength") == 0) {
    	options.minPhraseLength = atoi(argv[++i]);
    } else if (strcmp(argv[i], "--Separator") == 0) {
    	options.separator = argv[++i];
    } else if(strcmp(argv[i],"--model") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, no model's information provided to the option --model " << endl;
        exit(1);
      }
      char*  modelParams = argv[++i];
      char*  modelName = strtok(modelParams, "-");
      char*  modelType = strtok(NULL, "-");

      // REO_MODEL_TYPE intModelType;

      if(strcmp(modelName, "wbe") == 0) {
        options.initWordModel(true);
        if(strcmp(modelType, "msd") == 0)
          options.initWordType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
          options.initWordType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          options.initWordType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "phrase") == 0) {
        options.initPhraseModel(true);
        if(strcmp(modelType, "msd") == 0)
          options.initPhraseType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
          options.initPhraseType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          options.initPhraseType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "hier") == 0) {
        options.initHierModel(true);
        if(strcmp(modelType, "msd") == 0)
          options.initHierType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
          options.initHierType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          options.initHierType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else {
        cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
        exit(1);
      }

      options.initAllModelsOutputFlag(true);
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  // default reordering model if no model selected
  // allows for the old syntax to be used
  if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
    options.initWordModel(true);
    options.initWordType(REO_MSD);
  }

  // open input files
  Moses::InputFileStream eFile(fileNameE);
  Moses::InputFileStream fFile(fileNameF);
  Moses::InputFileStream aFile(fileNameA);

  istream *eFileP = &eFile;
  istream *fFileP = &fFile;
  istream *aFileP = &aFile;

  istream *iwFileP = NULL;
  auto_ptr<Moses::InputFileStream> instanceWeightsFile;
  if (options.getInstanceWeightsFile().length()) {
    instanceWeightsFile.reset(new Moses::InputFileStream(options.getInstanceWeightsFile()));
    iwFileP = instanceWeightsFile.get();
  }

  // open output files
  if (options.isOrientationFlag()) {
    string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
    extractFileOrientation.Open(fileNameExtractOrientation.c_str());
  }

  int i = sentenceOffset;

  while(true) {
    i++;
    if (i%10000 == 0) cerr << "." << flush;
    char englishString[LINE_MAX_LENGTH];
    char foreignString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    char weightString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (eFileP->eof()) break;
    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (iwFileP) {
      SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
    }
    SentenceAlignment sentence;
    // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
    //az: output src, tgt, and alingment line
    if (options.isOnlyOutputSpanInfo()) {
      cout << "LOG: SRC: " << foreignString << endl;
      cout << "LOG: TGT: " << englishString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }
    if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
      ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation);
      task->Run();
      delete task;

    }
    if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

  eFile.Close();
  fFile.Close();
  aFile.Close();

  //az: only close if we actually opened it
  if (!options.isOnlyOutputSpanInfo()) {
    if (options.isOrientationFlag()) {
      extractFileOrientation.Close();
    }
  }
}