예제 #1
0
int main(int argc, char* argv[]) 
{
  cerr << "Extract v2.0, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";
  //time_t starttime = time(NULL);
	
	Global *global = new Global();
	g_global = global;
	int sentenceOffset = 0;
		
	if (argc < 5) {
		cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract "
		     << " [ --Hierarchical | --Orientation"
				 << " | --GlueGrammar FILE | --UnknownWordLabel FILE"
				 << " | --OnlyDirect"
					
					<< " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]"
					<< " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]"
					<< " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]"
					<< " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]"

				<< " | --MaxSymbols[" << global->maxSymbols<< "]"
				 << " | --MaxNonTerm[" << global->maxNonTerm << "]"
		     << " | --SourceSyntax | --TargetSyntax" 
				<<	" | --UppermostOnly[" << g_global->uppermostOnly << "]"
				<< endl;
		exit(1);
	}
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
	string fileNameGlueGrammar;
 	string fileNameUnknownWordLabel;
	string fileNameExtract = string(argv[4]);

	int optionInd = 5;

  for(int i=optionInd;i<argc;i++) 
	{
		if (strcmp(argv[i],"--MinHoleSpanSourceDefault") == 0) {
			global->minHoleSpanSourceDefault = atoi(argv[++i]);
			if (global->minHoleSpanSourceDefault < 1) {
				cerr << "extract error: --minHoleSourceDefault should be at least 1" << endl;
				exit(1);
			}
		}
		else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) {
			global->maxHoleSpanSourceDefault = atoi(argv[++i]);
			if (global->maxHoleSpanSourceDefault < 1) {
				cerr << "extract error: --maxHoleSourceDefault should be at least 1" << endl;
				exit(1);
			}
		}
		else  if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) {
			global->minHoleSpanSourceSyntax = atoi(argv[++i]);
			if (global->minHoleSpanSourceSyntax < 1) {
				cerr << "extract error: --minHoleSourceSyntax should be at least 1" << endl;
				exit(1);
			}
		}
		else if (strcmp(argv[i],"--UppermostOnly") == 0) {
			global->uppermostOnly = atoi(argv[++i]);
		}
		else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) {
			global->maxHoleSpanSourceSyntax = atoi(argv[++i]);
			if (global->maxHoleSpanSourceSyntax < 1) {
				cerr << "extract error: --maxHoleSourceSyntax should be at least 1" << endl;
				exit(1);
			}
		}
		
		// maximum number of words in hierarchical phrase
		else if (strcmp(argv[i],"--maxSymbols") == 0) {
			global->maxSymbols = atoi(argv[++i]);
			if (global->maxSymbols < 1) {
				cerr << "extract error: --maxSymbols should be at least 1" << endl;
				exit(1);
			}
		}
		// maximum number of non-terminals
		else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
			global->maxNonTerm = atoi(argv[++i]);
			if (global->maxNonTerm < 1) {
				cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
				exit(1);
			}
		}		
		// allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      global->targetSyntax = true;
    }
    else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      global->sourceSyntax = true;
    }
		// do not create many part00xx files!
    else if (strcmp(argv[i],"--NoFileLimit") == 0) {
      // now default
    }
		else if (strcmp(argv[i],"--GlueGrammar") == 0) {
			global->glueGrammarFlag = true;
			if (++i >= argc)
			{
				cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
				exit(0);
			}
			fileNameGlueGrammar = string(argv[i]);
			cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    }
		else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
			global->unknownWordLabelFlag = true;
			if (++i >= argc)
			{
				cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
				exit(0);
			}
			fileNameUnknownWordLabel = string(argv[i]);
			cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
		}
		// TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    //  zipFiles = true;
    //}
		// if an source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--Mixed") == 0) {
			global->mixed = true;
    }
		else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) {
			global->allowDefaultNonTermEdge = atoi(argv[++i]);
    }
		else if (strcmp(argv[i], "--GZOutput") == 0) {
      global->gzOutput = true;
    }
		else if (strcmp(argv[i],"--MaxSpan") == 0) {
		  // ignore
      ++i;
		}
    else if (strcmp(argv[i],"--SentenceOffset") == 0) {
      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
        exit(1);
      }
      sentenceOffset = atoi(argv[++i]);
    }
    else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }


	// open input files
	Moses::InputFileStream tFile(fileNameT);
	Moses::InputFileStream sFile(fileNameS);
	Moses::InputFileStream aFile(fileNameA);

	// open output files
  string fileNameExtractInv = fileNameExtract + ".inv";
  if (global->gzOutput) {
    fileNameExtract += ".gz";
    fileNameExtractInv += ".gz";
  }

  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  extractFile.Open(fileNameExtract.c_str());
  extractFileInv.Open(fileNameExtractInv.c_str());
  
  
	// loop through all sentence pairs
  int i = sentenceOffset;
  while(true) {
    i++;

    if (i % 1000 == 0) {
      cerr << i << " " << flush;
    }

    string targetString;
    string sourceString;
    string alignmentString;
		
		bool ok = getline(tFile, targetString);
		if (!ok)
			break;
		getline(sFile, sourceString);
		getline(aFile, alignmentString);
    
		//cerr << endl << targetString << endl << sourceString << endl << alignmentString << endl;

		//time_t currTime = time(NULL);
		//cerr << "A " << (currTime - starttime) << endl;

    SentenceAlignment sentencePair;
    if (sentencePair.Create( targetString, sourceString, alignmentString, i, *global )) 
		{			
			//cerr << sentence.sourceTree << endl;
			//cerr << sentence.targetTree << endl;

			sentencePair.FindTunnels(*g_global);
			//cerr << "C " << (time(NULL) - starttime) << endl;
			//cerr << sentencePair << endl;
			
			sentencePair.CreateLattice(*g_global);
			//cerr << "D " << (time(NULL) - starttime) << endl;
			//cerr << sentencePair << endl;

			sentencePair.CreateRules(*g_global);
			//cerr << "E " << (time(NULL) - starttime) << endl;

			//cerr << sentence.lattice->GetRules().GetSize() << endl;
			sentencePair.GetLattice().GetRules().Output(extractFile);
      sentencePair.GetLattice().GetRules().OutputInv(extractFileInv);
    }
  }
	
  tFile.Close();
  sFile.Close();
  aFile.Close();

  extractFile.Close();
  extractFileInv.Close();

  if (global->glueGrammarFlag) {
    writeGlueGrammar(fileNameGlueGrammar, *global, targetLabelCollection, targetTopLabelCollection);
  }

  delete global;
}