Example #1
void writeCountOfCounts( const string &fileNameCountOfCounts )
{
  // open file
  Moses::OutputFileStream countOfCountsFile;
  bool success = countOfCountsFile.Open(fileNameCountOfCounts.c_str());
  if (!success) {
    cerr << "ERROR: could not open count-of-counts file "
         << fileNameCountOfCounts << endl;
    return;
  }

  // Kneser-Ney needs the total number of phrase pairs
  countOfCountsFile << totalDistinct << endl;

  // write out counts
  for(int i=1; i<=COC_MAX; i++) {
    countOfCountsFile << countOfCounts[ i ] << endl;
  }
  countOfCountsFile.Close();
}
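For reference, the count-of-counts (.coc) file written above is what the consolidation step's loadCountOfCounts reads back in (see Example #2). A minimal reader matching the same layout (total number of distinct phrase pairs on the first line, then the count-of-counts for frequencies 1..COC_MAX) might look like the sketch below; readCountOfCounts and its signature are hypothetical, not part of the Moses sources.

#include <fstream>
#include <iostream>
#include <string>
#include <vector>

// Hypothetical counterpart to writeCountOfCounts() above: reads back the same
// layout (line 1 = total distinct phrase pairs, then counts for 1..cocMax).
bool readCountOfCounts(const std::string &fileName, int cocMax,
                       double &totalDistinct, std::vector<double> &countOfCounts)
{
  std::ifstream in(fileName.c_str());
  if (!in) {
    std::cerr << "ERROR: could not open count-of-counts file " << fileName << std::endl;
    return false;
  }
  in >> totalDistinct;
  countOfCounts.assign(cocMax + 1, 0.0); // index 0 unused, matching the writer
  for (int i = 1; i <= cocMax; ++i) {
    in >> countOfCounts[i];
  }
  return !in.fail();
}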
Example #2
void processFiles( const std::string& fileNameDirect, 
                   const std::string& fileNameIndirect, 
                   const std::string& fileNameConsolidated, 
                   const std::string& fileNameCountOfCounts, 
                   const std::string& fileNameSourceLabelSet, 
                   const std::string& fileNamePartsOfSpeechVocabulary )
{
  if (goodTuringFlag || kneserNeyFlag)
    loadCountOfCounts( fileNameCountOfCounts );

  // open input files
  Moses::InputFileStream fileDirect(fileNameDirect);
  UTIL_THROW_IF2(fileDirect.fail(), "could not open phrase table file " << fileNameDirect);
  Moses::InputFileStream fileIndirect(fileNameIndirect);
  UTIL_THROW_IF2(fileIndirect.fail(), "could not open phrase table file " << fileNameIndirect);

  // open output file: consolidated phrase table
  Moses::OutputFileStream fileConsolidated;
  bool success = fileConsolidated.Open(fileNameConsolidated);
  UTIL_THROW_IF2(!success, "could not open output file " << fileNameConsolidated);

  // create properties consolidator
  // (in case any additional phrase property requires further processing)
  MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
  if (sourceLabelsFlag) {
    propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
  }
  if (partsOfSpeechFlag) {
    propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary);
  }

  // loop through all extracted phrase translations
  int i=0;
  while(true) {
    i++;
    if (i%100000 == 0) std::cerr << "." << std::flush;

    std::vector< std::string > itemDirect, itemIndirect;
    if (! getLine(fileIndirect, itemIndirect) ||
        ! getLine(fileDirect, itemDirect))
      break;

    // direct: target source alignment probabilities
    // indirect: source target probabilities

    // consistency checks
    UTIL_THROW_IF2(itemDirect[0].compare( itemIndirect[0] ) != 0, 
                   "target phrase does not match in line " << i << ": '" << itemDirect[0] << "' != '" << itemIndirect[0] << "'");
    UTIL_THROW_IF2(itemDirect[1].compare( itemIndirect[1] ) != 0, 
                   "source phrase does not match in line " << i << ": '" << itemDirect[1] << "' != '" << itemIndirect[1] << "'");

    // SCORES ...
    std::string directScores, directSparseScores, indirectScores, indirectSparseScores;
    breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
    breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );

    std::vector<std::string> directCounts;
    Moses::Tokenize( directCounts, itemDirect[4] );
    std::vector<std::string> indirectCounts;
    Moses::Tokenize( indirectCounts, itemIndirect[4] );
    float countF = Moses::Scan<float>(directCounts[0]);
    float countE = Moses::Scan<float>(indirectCounts[0]);
    float countEF = Moses::Scan<float>(indirectCounts[1]);
    float n1_F, n1_E;
    if (kneserNeyFlag) {
      n1_F = Moses::Scan<float>(directCounts[2]);
      n1_E = Moses::Scan<float>(indirectCounts[2]);
    }

    // Good Turing discounting
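    // Good-Turing replaces an observed count c by c* = (c+1) * N_{c+1} / N_c,
    // where N_c is the number of distinct pairs seen exactly c times;
    // goodTuringDiscount[c] presumably holds the precomputed ratio c*/c, so the
    // multiplication below yields the discounted joint count.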
    float adjustedCountEF = countEF;
    if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
      adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
    float adjustedCountEF_indirect = adjustedCountEF;

    // Kneser-Ney discounting [Foster et al., 2006]
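    // Absolute discounting: subtract a fixed D (picked by the size of the raw
    // count and capped just below it) and add back mass from a lower-order
    // estimate driven by n1_F / n1_E, which, per the comments below, count the
    // distinct pairs the source / target phrase participates in.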
    if (kneserNeyFlag) {
      float D = kneserNey_D3;
      if (countEF < 2) D = kneserNey_D1;
      else if (countEF < 3) D = kneserNey_D2;
      if (D > countEF) D = countEF - 0.01; // sanity constraint

      float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
      float alpha_F = D * n1_F / countF; // available mass
      adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;

      // for indirect
      float p_b_F = n1_F / totalCount; // source phrase prob based on distinct
      float alpha_E = D * n1_E / countE; // available mass
      adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
    }

    // drop due to MinScore thresholding
    if ((minScore0 > 0 && adjustedCountEF_indirect/countE < minScore0) ||
        (minScore2 > 0 && adjustedCountEF         /countF < minScore2)) {
      continue;
    }

    // output phrase pair
    fileConsolidated << itemDirect[0] << " ||| ";

    if (partsOfSpeechFlag) {
      // write POS factor from property
      std::vector<std::string> targetTokens;
      Moses::Tokenize( targetTokens, itemDirect[1] );
      std::vector<std::string> propertyValuePOS;
      propertiesConsolidator.GetPOSPropertyValueFromPropertiesString(itemDirect[5], propertyValuePOS);
      size_t targetTerminalIndex = 0;
      for (std::vector<std::string>::const_iterator targetTokensIt=targetTokens.begin();
           targetTokensIt!=targetTokens.end(); ++targetTokensIt) {
        fileConsolidated << *targetTokensIt;
        if (!isNonTerminal(*targetTokensIt)) {
          assert(propertyValuePOS.size() > targetTerminalIndex);
          fileConsolidated << "|" << propertyValuePOS[targetTerminalIndex];
          ++targetTerminalIndex;
        }
        fileConsolidated << " ";
      }
      fileConsolidated << "|||";

    } else {

      fileConsolidated << itemDirect[1] << " |||";
    }


    // prob indirect
    if (!onlyDirectFlag) {
      fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
      fileConsolidated << " " << indirectScores;
    }

    // prob direct
    fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
    fileConsolidated << " " << directScores;

    // phrase count feature
    if (phraseCountFlag) {
      fileConsolidated << " " << maybeLogProb(2.718);
    }

    // low count feature
    if (lowCountFlag) {
      fileConsolidated << " " << maybeLogProb(std::exp(-1.0/countEF));
    }

    // count bin feature (as a core feature)
    if (countBin.size()>0 && !sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " " << maybeLogProb(2.718);
          foundBin = true;
        } else {
          fileConsolidated << " " << maybeLogProb(1);
        }
      }
      fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 );
    }

    // alignment
    fileConsolidated << " |||";
    if (!itemDirect[2].empty()) {
      fileConsolidated << " " << itemDirect[2];
    }

    // counts, for debugging
    fileConsolidated << " ||| " << countE << " " << countF << " " << countEF;

    // sparse features
    fileConsolidated << " |||";
    if (directSparseScores.compare("") != 0)
      fileConsolidated << " " << directSparseScores;
    if (indirectSparseScores.compare("") != 0)
      fileConsolidated << " " << indirectSparseScores;

    // count bin feature (as a sparse feature)
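    // The feature name encodes the bin's range. For example, with hypothetical
    // bins countBin = {1, 3, 10}: countEF=1 fires "cb_1 1", countEF=2 fires
    // "cb_2_3 1", countEF=5 fires "cb_4_10 1", and anything above 10 fires "cb_max 1".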
    if (sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " cb_";
          if (i == 0 && countBin[i] > 1)
            fileConsolidated << "1_";
          else if (i > 0 && countBin[i-1]+1 < countBin[i])
            fileConsolidated << (countBin[i-1]+1) << "_";
          fileConsolidated << countBin[i] << " 1";
          foundBin = true;
        }
      }
      if (!foundBin) {
        fileConsolidated << " cb_max 1";
      }
    }

    // arbitrary key-value pairs
    fileConsolidated << " |||";
    if (itemDirect.size() >= 6) {
      propertiesConsolidator.ProcessPropertiesString(itemDirect[5], fileConsolidated);
    }

    if (countsProperty) {
      fileConsolidated << " {{Counts " << countE << " " << countF << " " << countEF << "}}";
    }

    fileConsolidated << std::endl;
  }

  fileDirect.Close();
  fileIndirect.Close();
  fileConsolidated.Close();
}
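Spelled out, the Kneser-Ney branch above (and the identical branch in the variant that follows) is a direct transcription of the code into formula form, with the discount D capped at c(e,f) - 0.01:

$$
\hat{c}(e,f) = c(e,f) - D + c(f)\,\alpha(f)\,p_b(e),
\qquad
\alpha(f) = \frac{D\,n_1(f)}{c(f)},
\qquad
p_b(e) = \frac{n_1(e)}{\mathrm{totalCount}}
$$

and symmetrically for the indirect direction, $\hat{c}_{\mathrm{ind}}(e,f) = c(e,f) - D + c(e)\,\alpha(e)\,p_b(f)$. Since $c(f)\,\alpha(f) = D\,n_1(f)$, the redistributed mass depends only on how many distinct pairs each phrase occurs in.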
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts )
{
  if (goodTuringFlag || kneserNeyFlag)
    loadCountOfCounts( fileNameCountOfCounts );

  // open input files
  Moses::InputFileStream fileDirect(fileNameDirect);
  Moses::InputFileStream fileIndirect(fileNameIndirect);

  if (fileDirect.fail()) {
    cerr << "ERROR: could not open phrase table file " << fileNameDirect << endl;
    exit(1);
  }
  istream &fileDirectP = fileDirect;

  if (fileIndirect.fail()) {
    cerr << "ERROR: could not open phrase table file " << fileNameIndirect << endl;
    exit(1);
  }
  istream &fileIndirectP = fileIndirect;

  // open output file: consolidated phrase table
  Moses::OutputFileStream fileConsolidated;
  bool success = fileConsolidated.Open(fileNameConsolidated);
  if (!success) {
    cerr << "ERROR: could not open output file " << fileNameConsolidated << endl;
    exit(1);
  }

  // loop through all extracted phrase translations
  int i=0;
  while(true) {
    i++;
    if (i%100000 == 0) cerr << "." << flush;

    vector< string > itemDirect, itemIndirect;
    if (! getLine(fileIndirectP,itemIndirect) ||
        ! getLine(fileDirectP,  itemDirect  ))
      break;

    // direct: target source alignment probabilities
    // indirect: source target probabilities

    // consistency checks
    if (itemDirect[0].compare( itemIndirect[0] ) != 0) {
      cerr << "ERROR: target phrase does not match in line " << i << ": '"
           << itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl;
      exit(1);
    }

    if (itemDirect[1].compare( itemIndirect[1] ) != 0) {
      cerr << "ERROR: source phrase does not match in line " << i << ": '"
           << itemDirect[1] << "' != '" << itemIndirect[1] << "'" << endl;
      exit(1);
    }

    // output hierarchical phrase pair (with separated labels)
    fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1] << " |||";

    // SCORES ...
    string directScores, directSparseScores, indirectScores, indirectSparseScores;
    breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
    breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );

    vector<string> directCounts = tokenize(itemDirect[4].c_str());
    vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
    float countF = atof(directCounts[0].c_str());
    float countE = atof(indirectCounts[0].c_str());
    float countEF = atof(indirectCounts[1].c_str());
    float n1_F, n1_E;
    if (kneserNeyFlag) {
      n1_F = atof(directCounts[2].c_str());
      n1_E = atof(indirectCounts[2].c_str());
    }

    // Good Turing discounting
    float adjustedCountEF = countEF;
    if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
      adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
    float adjustedCountEF_indirect = adjustedCountEF;

    // Kneser-Ney discounting [Foster et al., 2006]
    if (kneserNeyFlag) {
      float D = kneserNey_D3;
      if (countEF < 2) D = kneserNey_D1;
      else if (countEF < 3) D = kneserNey_D2;
      if (D > countEF) D = countEF - 0.01; // sanity constraint

      float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
      float alpha_F = D * n1_F / countF; // available mass
      adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;

      // for indirect
      float p_b_F = n1_F / totalCount; // source phrase prob based on distinct
      float alpha_E = D * n1_E / countE; // available mass
      adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
    }

    // prob indirect
    if (!onlyDirectFlag) {
      fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
      fileConsolidated << " " << indirectScores;
    }

    // prob direct
    fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
    fileConsolidated << " " << directScores;

    // phrase count feature
    if (phraseCountFlag) {
      fileConsolidated << " " << maybeLogProb(2.718);
    }

    // low count feature
    if (lowCountFlag) {
      fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
    }

    // count bin feature (as a core feature)
    if (countBin.size()>0 && !sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " " << maybeLogProb(2.718);
          foundBin = true;
        } else {
          fileConsolidated << " " << maybeLogProb(1);
        }
      }
      fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 );
    }

    // alignment
    fileConsolidated << " ||| " << itemDirect[2];

    // counts, for debugging
    fileConsolidated << "||| " << countE << " " << countF << " " << countEF;

    // count bin feature (as a sparse feature)
    fileConsolidated << " |||";
    if (directSparseScores.compare("") != 0)
      fileConsolidated << " " << directSparseScores;
    if (indirectSparseScores.compare("") != 0)
      fileConsolidated << " " << indirectSparseScores;
    if (sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " cb_";
          if (i == 0 && countBin[i] > 1)
            fileConsolidated << "1_";
          else if (i > 0 && countBin[i-1]+1 < countBin[i])
            fileConsolidated << (countBin[i-1]+1) << "_";
          fileConsolidated << countBin[i] << " 1";
          foundBin = true;
        }
      }
      if (!foundBin) {
        fileConsolidated << " cb_max 1";
      }
    }

    // arbitrary key-value pairs
    if (itemDirect.size() >= 6) {
      fileConsolidated << " ||| " << itemDirect[5];
    }

    fileConsolidated << endl;
  }
  fileDirect.Close();
  fileIndirect.Close();
  fileConsolidated.Close();
}
int main(int argc, char* argv[])
{
  cerr << "extract-rules, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";

  RuleExtractionOptions options;
  int sentenceOffset = 0;
#ifdef WITH_THREADS
  int thread_count = 1;
#endif
  if (argc < 5) {
    cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
         << " --GlueGrammar FILE"
         << " | --UnknownWordLabel FILE"
         << " | --OnlyDirect"
         << " | --MaxSpan[" << options.maxSpan << "]"
         << " | --MinHoleTarget[" << options.minHoleTarget << "]"
         << " | --MinHoleSource[" << options.minHoleSource << "]"
         << " | --MinWords[" << options.minWords << "]"
         << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]"
         << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]"
         << " | --MaxNonTerm[" << options.maxNonTerm << "]"
         << " | --MaxScope[" << options.maxScope << "]"
         << " | --SourceSyntax | --TargetSyntax"
         << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget | --NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting"
         << " | --UnpairedExtractFormat"
         << " | --ConditionOnTargetLHS"
         << " | --BoundaryRules[" << options.boundaryRules << "]"
         << " | --FlexibilityScore ]\n";

    exit(1);
  }
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
  string fileNameGlueGrammar;
  string fileNameUnknownWordLabel;
  string fileNameExtract = string(argv[4]);

  int optionInd = 5;

  for(int i=optionInd; i<argc; i++) {
    // maximum span length
    if (strcmp(argv[i],"--MaxSpan") == 0) {
      options.maxSpan = atoi(argv[++i]);
      if (options.maxSpan < 1) {
        cerr << "extract error: --maxSpan should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleTarget") == 0) {
      options.minHoleTarget = atoi(argv[++i]);
      if (options.minHoleTarget < 1) {
        cerr << "extract error: --minHoleTarget should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleSource") == 0) {
      options.minHoleSource = atoi(argv[++i]);
      if (options.minHoleSource < 1) {
        cerr << "extract error: --minHoleSource should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) {
      options.maxSymbolsTarget = atoi(argv[++i]);
      if (options.maxSymbolsTarget < 1) {
        cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
      options.maxSymbolsSource = atoi(argv[++i]);
      if (options.maxSymbolsSource < 1) {
        cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
        exit(1);
      }
    }
    // minimum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MinWords") == 0) {
      options.minWords = atoi(argv[++i]);
      if (options.minWords < 0) {
        cerr << "extract error: --MinWords should be at least 0" << endl;
        exit(1);
      }
    }
    // maximum number of non-terminals
    else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
      options.maxNonTerm = atoi(argv[++i]);
      if (options.maxNonTerm < 1) {
        cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum scope (see Hopkins and Langmead (2010))
    else if (strcmp(argv[i],"--MaxScope") == 0) {
      options.maxScope = atoi(argv[++i]);
      if (options.maxScope < 0) {
        cerr << "extract error: --MaxScope should be at least 0" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.gzOutput = true;
    }
    // allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      options.targetSyntax = true;
    } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      options.sourceSyntax = true;
    } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
      options.requireAlignedWord = false;
    } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) {
      options.nonTermConsecTarget = false;
    } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) {
      options.nonTermConsecSource = true;
    } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) {
      options.nonTermFirstWord = false;
    } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      options.onlyOutputSpanInfo = true;
    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
      options.onlyDirectFlag = true;
    } else if (strcmp(argv[i],"--GlueGrammar") == 0) {
      options.glueGrammarFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
        exit(1);
      }
      fileNameGlueGrammar = string(argv[i]);
      cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
      options.unknownWordLabelFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
        exit(1);
      }
      fileNameUnknownWordLabel = string(argv[i]);
      cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
    }
    // TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    //  zipFiles = true;
    //}
    // if a source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
      options.fractionalCounting = false;
    } else if (strcmp(argv[i],"--PCFG") == 0) {
      options.pcfgScore = true;
    } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
      options.unpairedExtractFormat = true;
    } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
      options.conditionOnTargetLhs = true;
    } else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
      options.flexScoreFlag = true;
    } else if (strcmp(argv[i],"-threads") == 0 ||
               strcmp(argv[i],"--threads") == 0 ||
               strcmp(argv[i],"--Threads") == 0) {
#ifdef WITH_THREADS
      thread_count = atoi(argv[++i]);
#else
      cerr << "thread support not compiled in." << '\n';
      exit(1);
#endif
    } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
        exit(1);
      }
      sentenceOffset = atoi(argv[++i]);
    } else if (strcmp(argv[i],"--BoundaryRules") == 0) {
      options.boundaryRules = true;
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  cerr << "extracting hierarchical rules" << endl;

  // open input files
  Moses::InputFileStream tFile(fileNameT);
  Moses::InputFileStream sFile(fileNameS);
  Moses::InputFileStream aFile(fileNameA);

  istream *tFileP = &tFile;
  istream *sFileP = &sFile;
  istream *aFileP = &aFile;

  // open output files
  string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":"");
  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  Moses::OutputFileStream extractFileContext;
  Moses::OutputFileStream extractFileContextInv;
  extractFile.Open((fileNameExtract  + (options.gzOutput?".gz":"")).c_str());
  if (!options.onlyDirectFlag)
    extractFileInv.Open(fileNameExtractInv.c_str());

  if (options.flexScoreFlag) {
    string fileNameExtractContext = fileNameExtract + ".context" + (options.gzOutput?".gz":"");
    extractFileContext.Open(fileNameExtractContext.c_str());
    if (!options.onlyDirectFlag) {
      string fileNameExtractContextInv = fileNameExtract + ".context.inv" + (options.gzOutput?".gz":"");
      extractFileContextInv.Open(fileNameExtractContextInv.c_str());
    }
  }

  // stats on labels for glue grammar and unknown word label probabilities
  set< string > targetLabelCollection, sourceLabelCollection;
  map< string, int > targetTopLabelCollection, sourceTopLabelCollection;

  // loop through all sentence pairs
  size_t i=sentenceOffset;
  string targetString, sourceString, alignmentString;

  while(getline(*tFileP, targetString)) {
    i++;

    getline(*sFileP, sourceString);
    getline(*aFileP, alignmentString);

    if (i%1000 == 0) cerr << i << " " << flush;

    SentenceAlignmentWithSyntax sentence
    (targetLabelCollection, sourceLabelCollection,
     targetTopLabelCollection, sourceTopLabelCollection, options);
    //az: output src, tgt, and alignment line
    if (options.onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << sourceString << endl;
      cout << "LOG: TGT: " << targetString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

    if (sentence.create(targetString.c_str(), sourceString.c_str(), alignmentString.c_str(),"", i, options.boundaryRules)) {
      if (options.unknownWordLabelFlag) {
        collectWordLabelCounts(sentence);
      }
      ExtractTask *task = new ExtractTask(sentence, options, extractFile, extractFileInv, extractFileContext, extractFileContextInv);
      task->Run();
      delete task;
    }
    if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

  tFile.Close();
  sFile.Close();
  aFile.Close();
  // only close if we actually opened it
  if (!options.onlyOutputSpanInfo) {
    extractFile.Close();
    if (!options.onlyDirectFlag) extractFileInv.Close();
  }

  if (options.flexScoreFlag) {
    extractFileContext.Close();
    if (!options.onlyDirectFlag) extractFileContextInv.Close();
  }

  if (options.glueGrammarFlag)
    writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection);

  if (options.unknownWordLabelFlag)
    writeUnknownWordLabel(fileNameUnknownWordLabel);
}
int main(int argc, char* argv[])
{
  cerr	<< "PhraseExtract v1.4, written by Philipp Koehn\n"
        << "phrase extraction from an aligned parallel corpus\n";

  if (argc < 6) {
    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] | --OnlyOutputSpanInfo | --NoTTable | --SentenceId]\n";
    exit(1);
  }
  char* &fileNameE = argv[1];
  char* &fileNameF = argv[2];
  char* &fileNameA = argv[3];
  string fileNameExtract = string(argv[4]);
  maxPhraseLength = atoi(argv[5]);

  for(int i=6; i<argc; i++) {
    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      onlyOutputSpanInfo = true;
    } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
      orientationFlag = true;
    } else if (strcmp(argv[i],"--NoTTable") == 0) {
      translationFlag = false;
    } else if (strcmp(argv[i], "--SentenceId") == 0) {
      sentenceIdFlag = true;  
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      gzOutput = true;  
    } else if(strcmp(argv[i],"--model") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, no model specification provided with the option --model" << endl;
        exit(1);
      }
      char* modelParams = argv[++i];
      char* modelName = strtok(modelParams, "-");
      char* modelType = strtok(NULL, "-");

      REO_MODEL_TYPE intModelType;

      if(strcmp(modelName, "wbe") == 0) {
        wordModel = true;
        if(strcmp(modelType, "msd") == 0)
          wordType = REO_MSD;
        else if(strcmp(modelType, "mslr") == 0)
          wordType = REO_MSLR;
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          wordType = REO_MONO;
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "phrase") == 0) {
        phraseModel = true;
        if(strcmp(modelType, "msd") == 0)
          phraseType = REO_MSD;
        else if(strcmp(modelType, "mslr") == 0)
          phraseType = REO_MSLR;
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          phraseType = REO_MONO;
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "hier") == 0) {
        hierModel = true;
        if(strcmp(modelType, "msd") == 0)
          hierType = REO_MSD;
        else if(strcmp(modelType, "mslr") == 0)
          hierType = REO_MSLR;
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          hierType = REO_MONO;
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else {
        cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
        exit(1);
      }

      allModelsOutputFlag = true;
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  // default reordering model if no model selected
  // allows for the old syntax to be used
  if(orientationFlag && !allModelsOutputFlag) {
    wordModel = true;
    wordType = REO_MSD;
  }

  // open input files
  Moses::InputFileStream eFile(fileNameE);
  Moses::InputFileStream fFile(fileNameF);
  Moses::InputFileStream aFile(fileNameA);

  istream *eFileP = &eFile;
  istream *fFileP = &fFile;
  istream *aFileP = &aFile;

  // open output files
  if (translationFlag) {
    string fileNameExtractInv = fileNameExtract + ".inv" + (gzOutput?".gz":"");
    extractFile.Open( (fileNameExtract + (gzOutput?".gz":"")).c_str());
    extractFileInv.Open(fileNameExtractInv.c_str());
  }
  if (orientationFlag) {
    string fileNameExtractOrientation = fileNameExtract + ".o" + (gzOutput?".gz":"");
    extractFileOrientation.Open(fileNameExtractOrientation.c_str());
  }

  if (sentenceIdFlag) {
    string fileNameExtractSentenceId = fileNameExtract + ".sid" + (gzOutput?".gz":"");
    extractFileSentenceId.Open(fileNameExtractSentenceId.c_str());
  }

  int i=0;
  while(true) {
    i++;
    if (i%10000 == 0) cerr << "." << flush;
    char englishString[LINE_MAX_LENGTH];
    char foreignString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (eFileP->eof()) break;
    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
    SentenceAlignment sentence;
    // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
    //az: output src, tgt, and alignment line
    if (onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << foreignString << endl;
      cout << "LOG: TGT: " << englishString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

    if (sentence.create( englishString, foreignString, alignmentString, i)) {
      extract(sentence);
    }
    if (onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }
  eFile.Close();
  fFile.Close();
  aFile.Close();
  //az: only close if we actually opened it
  if (!onlyOutputSpanInfo) {
    if (translationFlag) {
      extractFile.Close();
      extractFileInv.Close();
    }
    if (orientationFlag) extractFileOrientation.Close();
    if (sentenceIdFlag) {
      extractFileSentenceId.Close();
    }
  }
}
Example #6
int main(int argc, char* argv[])
{
  cerr << "extract-rules, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";

  RuleExtractionOptions options;
#ifdef WITH_THREADS
  int thread_count = 1;
#endif
  if (argc < 5) {
    cerr << "syntax: extract-rules corpus.target corpus.source corpus.align extract ["
#ifdef WITH_THREADS
         << " --threads NUM |"
#endif
         << " --GlueGrammar FILE"
         << " | --UnknownWordLabel FILE"
         << " | --OnlyDirect"
         << " | --OutputNTLengths"
         << " | --MaxSpan[" << options.maxSpan << "]"
         << " | --MinHoleTarget[" << options.minHoleTarget << "]"
         << " | --MinHoleSource[" << options.minHoleSource << "]"
         << " | --MinWords[" << options.minWords << "]"
         << " | --MaxSymbolsTarget[" << options.maxSymbolsTarget << "]"
         << " | --MaxSymbolsSource[" << options.maxSymbolsSource << "]"
         << " | --MaxNonTerm[" << options.maxNonTerm << "]"
         << " | --MaxScope[" << options.maxScope << "]"
         << " | --SourceSyntax | --TargetSyntax"
         << " | --AllowOnlyUnalignedWords | --DisallowNonTermConsecTarget | --NonTermConsecSource | --NoNonTermFirstWord | --NoFractionalCounting ]\n";
    exit(1);
  }
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
  string fileNameGlueGrammar;
  string fileNameUnknownWordLabel;
  string fileNameExtract = string(argv[4]);

  int optionInd = 5;

  for(int i=optionInd; i<argc; i++) {
    // maximum span length
    if (strcmp(argv[i],"--MaxSpan") == 0) {
      options.maxSpan = atoi(argv[++i]);
      if (options.maxSpan < 1) {
        cerr << "extract error: --maxSpan should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleTarget") == 0) {
      options.minHoleTarget = atoi(argv[++i]);
      if (options.minHoleTarget < 1) {
        cerr << "extract error: --minHoleTarget should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleSource") == 0) {
      options.minHoleSource = atoi(argv[++i]);
      if (options.minHoleSource < 1) {
        cerr << "extract error: --minHoleSource should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsTarget") == 0) {
      options.maxSymbolsTarget = atoi(argv[++i]);
      if (options.maxSymbolsTarget < 1) {
        cerr << "extract error: --MaxSymbolsTarget should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MaxSymbolsSource") == 0) {
      options.maxSymbolsSource = atoi(argv[++i]);
      if (options.maxSymbolsSource < 1) {
        cerr << "extract error: --MaxSymbolsSource should be at least 1" << endl;
        exit(1);
      }
    }
    // minimum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--MinWords") == 0) {
      options.minWords = atoi(argv[++i]);
      if (options.minWords < 0) {
        cerr << "extract error: --MinWords should be at least 0" << endl;
        exit(1);
      }
    }
    // maximum number of non-terminals
    else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
      options.maxNonTerm = atoi(argv[++i]);
      if (options.maxNonTerm < 1) {
        cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum scope (see Hopkins and Langmead (2010))
    else if (strcmp(argv[i],"--MaxScope") == 0) {
      options.maxScope = atoi(argv[++i]);
      if (options.maxScope < 0) {
        cerr << "extract error: --MaxScope should be at least 0" << endl;
        exit(1);
      }
    }
    else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.gzOutput = true;  
    } 
    // allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      options.targetSyntax = true;
    } else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      options.sourceSyntax = true;
    } else if (strcmp(argv[i],"--AllowOnlyUnalignedWords") == 0) {
      options.requireAlignedWord = false;
    } else if (strcmp(argv[i],"--DisallowNonTermConsecTarget") == 0) {
      options.nonTermConsecTarget = false;
    } else if (strcmp(argv[i],"--NonTermConsecSource") == 0) {
      options.nonTermConsecSource = true;
    } else if (strcmp(argv[i],"--NoNonTermFirstWord") == 0) {
      options.nonTermFirstWord = false;
    } else if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      options.onlyOutputSpanInfo = true;
    } else if (strcmp(argv[i],"--OnlyDirect") == 0) {
      options.onlyDirectFlag = true;
    } else if (strcmp(argv[i],"--GlueGrammar") == 0) {
      options.glueGrammarFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
        exit(1);
      }
      fileNameGlueGrammar = string(argv[i]);
      cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    } else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
      options.unknownWordLabelFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
        exit(1);
      }
      fileNameUnknownWordLabel = string(argv[i]);
      cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
    }
    // TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    //  zipFiles = true;
    //}
    // if a source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--NoFractionalCounting") == 0) {
      options.fractionalCounting = false;
    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
      options.outputNTLengths = true;
#ifdef WITH_THREADS
    } else if (strcmp(argv[i],"-threads") == 0 || 
               strcmp(argv[i],"--threads") == 0 ||
               strcmp(argv[i],"--Threads") == 0) {
      thread_count = atoi(argv[++i]);
#endif
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  cerr << "extracting hierarchical rules" << endl;

  // open input files
  Moses::InputFileStream tFile(fileNameT);
  Moses::InputFileStream sFile(fileNameS);
  Moses::InputFileStream aFile(fileNameA);

  istream *tFileP = &tFile;
  istream *sFileP = &sFile;
  istream *aFileP = &aFile;

  // open output files
  string fileNameExtractInv = fileNameExtract + ".inv" + (options.gzOutput?".gz":"");
  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  extractFile.Open((fileNameExtract  + (options.gzOutput?".gz":"")).c_str());
  if (!options.onlyDirectFlag)
    extractFileInv.Open(fileNameExtractInv.c_str());

  // output into file
  Moses::OutputCollector* extractCollector = new Moses::OutputCollector(&extractFile);
  Moses::OutputCollector* extractCollectorInv = new Moses::OutputCollector(&extractFileInv);

  // stats on labels for glue grammar and unknown word label probabilities
  set< string > targetLabelCollection, sourceLabelCollection;
  map< string, int > targetTopLabelCollection, sourceTopLabelCollection;

#ifdef WITH_THREADS
  // set up thread pool
  Moses::ThreadPool pool(thread_count);
  pool.SetQueueLimit(1000);
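  // the queue limit presumably keeps the reading loop from racing far ahead of
  // the workers, bounding how many pending sentences are buffered at once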
#endif

  // loop through all sentence pairs
  size_t i=0;
  while(true) {
    i++;
    if (i%1000 == 0) cerr << "." << flush;
    if (i%10000 == 0) cerr << ":" << flush;
    if (i%100000 == 0) cerr << "!" << flush;
    char targetString[LINE_MAX_LENGTH];
    char sourceString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*tFileP), targetString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (tFileP->eof()) break;
    SAFE_GETLINE((*sFileP), sourceString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);

    SentenceAlignmentWithSyntax *sentence = new SentenceAlignmentWithSyntax
      (targetLabelCollection, sourceLabelCollection, 
       targetTopLabelCollection, sourceTopLabelCollection, options);
    //az: output src, tgt, and alignment line
    if (options.onlyOutputSpanInfo) {
      cout << "LOG: SRC: " << sourceString << endl;
      cout << "LOG: TGT: " << targetString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }

    if (sentence->create(targetString, sourceString, alignmentString, i)) {
      if (options.unknownWordLabelFlag) {
        collectWordLabelCounts(*sentence);
      }
      ExtractTask *task = new ExtractTask(i-1, sentence, options, extractCollector, extractCollectorInv);
#ifdef WITH_THREADS
      if (thread_count == 1) {
        task->Run();
        delete task;
      }
      else {
        pool.Submit(task);
      }
#else
      task->Run();
      delete task;
#endif
    }
    if (options.onlyOutputSpanInfo) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

#ifdef WITH_THREADS
  // wait for all threads to finish
  pool.Stop(true);
#endif

  tFile.Close();
  sFile.Close();
  aFile.Close();
  // only close if we actually opened it
  if (!options.onlyOutputSpanInfo) {
    extractFile.Close();
    if (!options.onlyDirectFlag) extractFileInv.Close();
  }

  if (options.glueGrammarFlag)
    writeGlueGrammar(fileNameGlueGrammar, options, targetLabelCollection, targetTopLabelCollection);

  if (options.unknownWordLabelFlag)
    writeUnknownWordLabel(fileNameUnknownWordLabel);
}
Example #7
int main(int argc, char* argv[]) 
{
  cerr << "Extract v2.0, written by Philipp Koehn\n"
       << "rule extraction from an aligned parallel corpus\n";
  //time_t starttime = time(NULL);
	
  Global *global = new Global();
  g_global = global;
  int sentenceOffset = 0;

  if (argc < 5) {
    cerr << "syntax: extract-mixed-syntax corpus.target corpus.source corpus.align extract "
         << " [ --Hierarchical | --Orientation"
         << " | --GlueGrammar FILE | --UnknownWordLabel FILE"
         << " | --OnlyDirect"
         << " | --MinHoleSpanSourceDefault[" << global->minHoleSpanSourceDefault << "]"
         << " | --MaxHoleSpanSourceDefault[" << global->maxHoleSpanSourceDefault << "]"
         << " | --MinHoleSpanSourceSyntax[" << global->minHoleSpanSourceSyntax << "]"
         << " | --MaxHoleSpanSourceSyntax[" << global->maxHoleSpanSourceSyntax << "]"
         << " | --MaxSymbols[" << global->maxSymbols << "]"
         << " | --MaxNonTerm[" << global->maxNonTerm << "]"
         << " | --SourceSyntax | --TargetSyntax"
         << " | --UppermostOnly[" << g_global->uppermostOnly << "]"
         << endl;
    exit(1);
  }
  char* &fileNameT = argv[1];
  char* &fileNameS = argv[2];
  char* &fileNameA = argv[3];
  string fileNameGlueGrammar;
  string fileNameUnknownWordLabel;
  string fileNameExtract = string(argv[4]);

  int optionInd = 5;

  for(int i=optionInd; i<argc; i++) {
    if (strcmp(argv[i],"--MinHoleSpanSourceDefault") == 0) {
      global->minHoleSpanSourceDefault = atoi(argv[++i]);
      if (global->minHoleSpanSourceDefault < 1) {
        cerr << "extract error: --MinHoleSpanSourceDefault should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MaxHoleSpanSourceDefault") == 0) {
      global->maxHoleSpanSourceDefault = atoi(argv[++i]);
      if (global->maxHoleSpanSourceDefault < 1) {
        cerr << "extract error: --MaxHoleSpanSourceDefault should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--MinHoleSpanSourceSyntax") == 0) {
      global->minHoleSpanSourceSyntax = atoi(argv[++i]);
      if (global->minHoleSpanSourceSyntax < 1) {
        cerr << "extract error: --MinHoleSpanSourceSyntax should be at least 1" << endl;
        exit(1);
      }
    } else if (strcmp(argv[i],"--UppermostOnly") == 0) {
      global->uppermostOnly = atoi(argv[++i]);
    } else if (strcmp(argv[i],"--MaxHoleSpanSourceSyntax") == 0) {
      global->maxHoleSpanSourceSyntax = atoi(argv[++i]);
      if (global->maxHoleSpanSourceSyntax < 1) {
        cerr << "extract error: --MaxHoleSpanSourceSyntax should be at least 1" << endl;
        exit(1);
      }
    }

    // maximum number of words in hierarchical phrase
    else if (strcmp(argv[i],"--maxSymbols") == 0) {
      global->maxSymbols = atoi(argv[++i]);
      if (global->maxSymbols < 1) {
        cerr << "extract error: --maxSymbols should be at least 1" << endl;
        exit(1);
      }
    }
    // maximum number of non-terminals
    else if (strcmp(argv[i],"--MaxNonTerm") == 0) {
      global->maxNonTerm = atoi(argv[++i]);
      if (global->maxNonTerm < 1) {
        cerr << "extract error: --MaxNonTerm should be at least 1" << endl;
        exit(1);
      }
    }
		// allow consecutive non-terminals (X Y | X Y)
    else if (strcmp(argv[i],"--TargetSyntax") == 0) {
      global->targetSyntax = true;
    }
    else if (strcmp(argv[i],"--SourceSyntax") == 0) {
      global->sourceSyntax = true;
    }
		// do not create many part00xx files!
    else if (strcmp(argv[i],"--NoFileLimit") == 0) {
      // now default
    }
    else if (strcmp(argv[i],"--GlueGrammar") == 0) {
      global->glueGrammarFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --GlueGrammar requires a file name" << endl;
        exit(1);
      }
      fileNameGlueGrammar = string(argv[i]);
      cerr << "creating glue grammar in '" << fileNameGlueGrammar << "'" << endl;
    }
    else if (strcmp(argv[i],"--UnknownWordLabel") == 0) {
      global->unknownWordLabelFlag = true;
      if (++i >= argc) {
        cerr << "ERROR: Option --UnknownWordLabel requires a file name" << endl;
        exit(1);
      }
      fileNameUnknownWordLabel = string(argv[i]);
      cerr << "creating unknown word labels in '" << fileNameUnknownWordLabel << "'" << endl;
    }
    // TODO: this should be a useful option
    //else if (strcmp(argv[i],"--ZipFiles") == 0) {
    //  zipFiles = true;
    //}
    // if a source phrase is paired with two target phrases, then count(t|s) = 0.5
    else if (strcmp(argv[i],"--Mixed") == 0) {
      global->mixed = true;
    }
    else if (strcmp(argv[i],"--AllowDefaultNonTermEdge") == 0) {
      global->allowDefaultNonTermEdge = atoi(argv[++i]);
    }
    else if (strcmp(argv[i], "--GZOutput") == 0) {
      global->gzOutput = true;
    }
    else if (strcmp(argv[i],"--MaxSpan") == 0) {
      // ignore
      ++i;
    }
    else if (strcmp(argv[i],"--SentenceOffset") == 0) {
      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
        exit(1);
      }
      sentenceOffset = atoi(argv[++i]);
    }
    else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }


  // open input files
  Moses::InputFileStream tFile(fileNameT);
  Moses::InputFileStream sFile(fileNameS);
  Moses::InputFileStream aFile(fileNameA);

  // open output files
  string fileNameExtractInv = fileNameExtract + ".inv";
  if (global->gzOutput) {
    fileNameExtract += ".gz";
    fileNameExtractInv += ".gz";
  }

  Moses::OutputFileStream extractFile;
  Moses::OutputFileStream extractFileInv;
  extractFile.Open(fileNameExtract.c_str());
  extractFileInv.Open(fileNameExtractInv.c_str());
  
  
	// loop through all sentence pairs
  int i = sentenceOffset;
  while(true) {
    i++;

    if (i % 1000 == 0) {
      cerr << i << " " << flush;
    }

    string targetString;
    string sourceString;
    string alignmentString;
		
		bool ok = getline(tFile, targetString);
		if (!ok)
			break;
		getline(sFile, sourceString);
		getline(aFile, alignmentString);
    
		//cerr << endl << targetString << endl << sourceString << endl << alignmentString << endl;

		//time_t currTime = time(NULL);
		//cerr << "A " << (currTime - starttime) << endl;

    SentenceAlignment sentencePair;
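    // Per-sentence pipeline: Create() parses the aligned sentence pair,
    // FindTunnels() presumably collects the consistently aligned spans that can
    // become non-terminal holes, CreateLattice() builds the rule lattice over
    // them, and CreateRules() produces the rules written out below in both
    // directions.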
    if (sentencePair.Create( targetString, sourceString, alignmentString, i, *global )) 
		{			
			//cerr << sentence.sourceTree << endl;
			//cerr << sentence.targetTree << endl;

			sentencePair.FindTunnels(*g_global);
			//cerr << "C " << (time(NULL) - starttime) << endl;
			//cerr << sentencePair << endl;
			
			sentencePair.CreateLattice(*g_global);
			//cerr << "D " << (time(NULL) - starttime) << endl;
			//cerr << sentencePair << endl;

			sentencePair.CreateRules(*g_global);
			//cerr << "E " << (time(NULL) - starttime) << endl;

			//cerr << sentence.lattice->GetRules().GetSize() << endl;
			sentencePair.GetLattice().GetRules().Output(extractFile);
      sentencePair.GetLattice().GetRules().OutputInv(extractFileInv);
    }
  }
	
  tFile.Close();
  sFile.Close();
  aFile.Close();

  extractFile.Close();
  extractFileInv.Close();

  if (global->glueGrammarFlag) {
    writeGlueGrammar(fileNameGlueGrammar, *global, targetLabelCollection, targetTopLabelCollection);
  }

  delete global;
}
Example #8
int main(int argc, char* argv[])
{
  cerr << "Score v2.0 written by Philipp Koehn\n"
       << "scoring methods for extracted rules\n";

  if (argc < 4) {
    cerr << "syntax: score extract lex phrase-table [--Inverse] [--Hierarchical] [--LogProb] [--NegLogProb] [--NoLex] [--GoodTuring] [--KneserNey] [--WordAlignment] [--UnalignedPenalty] [--UnalignedFunctionWordPenalty function-word-file] [--MinCountHierarchical count] [--OutputNTLengths] [--PCFG] [--UnpairedExtractFormat] [--ConditionOnTargetLHS] [--[Sparse]Domain[Indicator|Ratio|Subset|Bin] domain-file [bins]] [--Singleton] [--CrossedNonTerm] \n";
    exit(1);
  }
  string fileNameExtract = argv[1];
  string fileNameLex = argv[2];
  string fileNamePhraseTable = argv[3];
  string fileNameCountOfCounts;
  char* fileNameFunctionWords = NULL;
  char* fileNameDomain = NULL;

  for(int i=4; i<argc; i++) {
    if (strcmp(argv[i],"inverse") == 0 || strcmp(argv[i],"--Inverse") == 0) {
      inverseFlag = true;
      cerr << "using inverse mode\n";
    } else if (strcmp(argv[i],"--Hierarchical") == 0) {
      hierarchicalFlag = true;
      cerr << "processing hierarchical rules\n";
    } else if (strcmp(argv[i],"--PCFG") == 0) {
      pcfgFlag = true;
      cerr << "including PCFG scores\n";
    } else if (strcmp(argv[i],"--UnpairedExtractFormat") == 0) {
      unpairedExtractFormatFlag = true;
      cerr << "processing unpaired extract format\n";
    } else if (strcmp(argv[i],"--ConditionOnTargetLHS") == 0) {
      conditionOnTargetLhsFlag = true;
      cerr << "conditioning on target-side LHS\n";
    } else if (strcmp(argv[i],"--WordAlignment") == 0) {
      wordAlignmentFlag = true;
      cerr << "outputting word alignment" << endl;
    } else if (strcmp(argv[i],"--NoLex") == 0) {
      lexFlag = false;
      cerr << "not computing lexical translation score\n";
    } else if (strcmp(argv[i],"--GoodTuring") == 0) {
      goodTuringFlag = true;
      fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
      cerr << "adjusting phrase translation probabilities with Good Turing discounting\n";
    } else if (strcmp(argv[i],"--KneserNey") == 0) {
      kneserNeyFlag = true;
      fileNameCountOfCounts = string(fileNamePhraseTable) + ".coc";
      cerr << "adjusting phrase translation probabilities with Kneser Ney discounting\n";
    } else if (strcmp(argv[i],"--UnalignedPenalty") == 0) {
      unalignedFlag = true;
      cerr << "using unaligned word penalty\n";
    } else if (strcmp(argv[i],"--UnalignedFunctionWordPenalty") == 0) {
      unalignedFWFlag = true;
      if (i+1==argc) {
        cerr << "ERROR: specify function word file with --UnalignedFunctionWordPenalty!\n";
        exit(1);
      }
      fileNameFunctionWords = argv[++i];
      cerr << "using unaligned function word penalty with function words from " << fileNameFunctionWords << endl;
    } else if (strcmp(argv[i],"--SparseDomainIndicator") == 0 ||
               strcmp(argv[i],"--SparseDomainRatio") == 0 ||
               strcmp(argv[i],"--SparseDomainSubset") == 0 ||
               strcmp(argv[i],"--DomainIndicator") == 0 ||
               strcmp(argv[i],"--DomainRatio") == 0 ||
               strcmp(argv[i],"--DomainSubset") == 0) {
      includeSentenceIdFlag = true;
      domainFlag = true;
      domainSparseFlag = strstr( argv[i], "Sparse" );
      domainRatioFlag = strstr( argv[i], "Ratio" );
      domainSubsetFlag = strstr( argv[i], "Subset" );
      if (i+1==argc) {
        cerr << "ERROR: specify domain info file with " << argv[i] << endl;
        exit(1);
      }
      fileNameDomain = argv[++i];
    } else if (strcmp(argv[i],"--LogProb") == 0) {
      logProbFlag = true;
      cerr << "using log-probabilities\n";
    } else if (strcmp(argv[i],"--NegLogProb") == 0) {
      logProbFlag = true;
      negLogProb = -1;
      cerr << "using negative log-probabilities\n";
    } else if (strcmp(argv[i],"--MinCountHierarchical") == 0) {
      minCountHierarchical = atof(argv[++i]);
      cerr << "dropping all phrase pairs occurring less than " << minCountHierarchical << " times\n";
      minCountHierarchical -= 0.00001; // account for rounding
    } else if (strcmp(argv[i],"--OutputNTLengths") == 0) {
      outputNTLengths = true;
    } else if (strcmp(argv[i],"--Singleton") == 0) {
      singletonFeature = true;
      cerr << "binary singleton feature\n";
    } else if (strcmp(argv[i],"--CrossedNonTerm") == 0) {
      crossedNonTerm = true;
      cerr << "crossed non-term reordering feature\n";
    } else {
      cerr << "ERROR: unknown option " << argv[i] << endl;
      exit(1);
    }
  }

  // lexical translation table
  if (lexFlag)
    lexTable.load( fileNameLex );

  // function word list
  if (unalignedFWFlag)
    loadFunctionWords( fileNameFunctionWords );

  // load domain information
  if (domainFlag) {
    if (inverseFlag) {
      domainFlag = false;
      includeSentenceIdFlag = false;
    }
    else {
      domain = new Domain;
      domain->load( fileNameDomain );
    }
  }

  // compute count of counts for Good Turing discounting
  if (goodTuringFlag || kneserNeyFlag) {
    for(int i=1; i<=COC_MAX; i++) countOfCounts[i] = 0;
  }

  // sorted phrase extraction file
  Moses::InputFileStream extractFile(fileNameExtract);

  if (extractFile.fail()) {
    cerr << "ERROR: could not open extract file " << fileNameExtract << endl;
    exit(1);
  }
  istream &extractFileP = extractFile;

  // output file: phrase translation table
  ostream *phraseTableFile;

  if (fileNamePhraseTable == "-") {
    phraseTableFile = &cout;
  } else {
    Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
    bool success = outputFile->Open(fileNamePhraseTable);
    if (!success) {
      cerr << "ERROR: could not open phrase table file "
           << fileNamePhraseTable << endl;
      exit(1);
    }
    phraseTableFile = outputFile;
  }
	
  // loop through all extracted phrase translations
  float lastCount = 0.0f;
  float lastPcfgSum = 0.0f;
  vector< PhraseAlignment > phrasePairsWithSameF;
  bool isSingleton = true;
  int i=0;
  char line[LINE_MAX_LENGTH],lastLine[LINE_MAX_LENGTH];
  lastLine[0] = '\0';
  PhraseAlignment *lastPhrasePair = NULL;
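  // The extract file is sorted, so identical lines and pairs sharing a source
  // phrase arrive contiguously: duplicate lines just accumulate their counts,
  // and a batch is scored via processPhrasePairs() once the source phrase changes.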
  while(true) {
    if (extractFileP.eof()) break;
    if (++i % 100000 == 0) cerr << "." << flush;
    SAFE_GETLINE((extractFileP), line, LINE_MAX_LENGTH, '\n', __FILE__);
    if (extractFileP.eof())	break;

    // identical to last line? just add count
    if (strcmp(line,lastLine) == 0) {
      lastPhrasePair->count += lastCount;
      lastPhrasePair->pcfgSum += lastPcfgSum;
      continue;
    }
    strcpy( lastLine, line );

    // create new phrase pair
    PhraseAlignment phrasePair;
    phrasePair.create( line, i, includeSentenceIdFlag );
    lastCount = phrasePair.count;
    lastPcfgSum = phrasePair.pcfgSum;

    // only differs in count? just add count
    if (lastPhrasePair != NULL
        && lastPhrasePair->equals( phrasePair )
        && (!domainFlag
            || domain->getDomainOfSentence( lastPhrasePair->sentenceId )
               == domain->getDomainOfSentence( phrasePair.sentenceId ) )) {
      lastPhrasePair->count += phrasePair.count;
      lastPhrasePair->pcfgSum += phrasePair.pcfgSum;
      continue;
    }
    
    // if new source phrase, process last batch
    if (lastPhrasePair != NULL &&
        lastPhrasePair->GetSource() != phrasePair.GetSource()) {
      processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
      
      phrasePairsWithSameF.clear();
      isSingleton = false;
      lastPhrasePair = NULL;
    }
    else
    {
      isSingleton = true;
    }

    // add phrase pairs to list, it's now the last one
    phrasePairsWithSameF.push_back( phrasePair );
    lastPhrasePair = &phrasePairsWithSameF.back();
  }
  processPhrasePairs( phrasePairsWithSameF, *phraseTableFile, isSingleton );
	
  phraseTableFile->flush();
  if (phraseTableFile != &cout) {
    delete phraseTableFile;
  }

  // output count of count statistics
  if (goodTuringFlag || kneserNeyFlag) {
    writeCountOfCounts( fileNameCountOfCounts );
  }
}
int main(int argc, char* argv[])
{
  cerr	<< "PhraseExtract v1.4, written by Philipp Koehn\n"
        << "phrase extraction from an aligned parallel corpus\n";

  if (argc < 6) {
    cerr << "syntax: extract en de align extract max-length [orientation [ --model [wbe|phrase|hier]-[msd|mslr|mono] ] ";
    cerr<<"| --OnlyOutputSpanInfo | --NoTTable | --GZOutput | --IncludeSentenceId | --SentenceOffset n | --InstanceWeights filename ]\n";
    exit(1);
  }

  Moses::OutputFileStream extractFileOrientation;
  const char* const &fileNameE = argv[1];
  const char* const &fileNameF = argv[2];
  const char* const &fileNameA = argv[3];
  const string fileNameExtract = string(argv[4]);
  PhraseExtractionOptions options(atoi(argv[5]));

  for(int i=6; i<argc; i++) {
    if (strcmp(argv[i],"--OnlyOutputSpanInfo") == 0) {
      options.initOnlyOutputSpanInfo(true);
    } else if (strcmp(argv[i],"orientation") == 0 || strcmp(argv[i],"--Orientation") == 0) {
      options.initOrientationFlag(true);
    } else if (strcmp(argv[i],"--FlexibilityScore") == 0) {
      options.initFlexScoreFlag(true);
    } else if (strcmp(argv[i],"--NoTTable") == 0) {
      options.initTranslationFlag(false);
    } else if (strcmp(argv[i], "--IncludeSentenceId") == 0) {
      options.initIncludeSentenceIdFlag(true);
    } else if (strcmp(argv[i], "--SentenceOffset") == 0) {
      if (i+1 >= argc || argv[i+1][0] < '0' || argv[i+1][0] > '9') {
        cerr << "extract: syntax error, used switch --SentenceOffset without a number" << endl;
        exit(1);
      }
      sentenceOffset = atoi(argv[++i]);
    } else if (strcmp(argv[i], "--GZOutput") == 0) {
      options.initGzOutput(true);
    } else if (strcmp(argv[i], "--InstanceWeights") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, used switch --InstanceWeights without file name" << endl;
        exit(1);
      }
      options.initInstanceWeightsFile(argv[++i]);
    } else if (strcmp(argv[i], "--Debug") == 0) {
      options.debug = true;
    } else if (strcmp(argv[i], "--MinPhraseLength") == 0) {
      options.minPhraseLength = atoi(argv[++i]);
    } else if (strcmp(argv[i], "--Separator") == 0) {
      options.separator = argv[++i];
    } else if(strcmp(argv[i],"--model") == 0) {
      if (i+1 >= argc) {
        cerr << "extract: syntax error, no model specification provided with the option --model" << endl;
        exit(1);
      }
      char*  modelParams = argv[++i];
      char*  modelName = strtok(modelParams, "-");
      char*  modelType = strtok(NULL, "-");

      // REO_MODEL_TYPE intModelType;

      if(strcmp(modelName, "wbe") == 0) {
        options.initWordModel(true);
        if(strcmp(modelType, "msd") == 0)
          options.initWordType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
          options.initWordType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          options.initWordType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "phrase") == 0) {
        options.initPhraseModel(true);
        if(strcmp(modelType, "msd") == 0)
          options.initPhraseType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
          options.initPhraseType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          options.initPhraseType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else if(strcmp(modelName, "hier") == 0) {
        options.initHierModel(true);
        if(strcmp(modelType, "msd") == 0)
          options.initHierType(REO_MSD);
        else if(strcmp(modelType, "mslr") == 0)
          options.initHierType(REO_MSLR);
        else if(strcmp(modelType, "mono") == 0 || strcmp(modelType, "monotonicity") == 0)
          options.initHierType(REO_MONO);
        else {
          cerr << "extract: syntax error, unknown reordering model type: " << modelType << endl;
          exit(1);
        }
      } else {
        cerr << "extract: syntax error, unknown reordering model: " << modelName << endl;
        exit(1);
      }

      options.initAllModelsOutputFlag(true);
    } else {
      cerr << "extract: syntax error, unknown option '" << string(argv[i]) << "'\n";
      exit(1);
    }
  }

  // default reordering model if no model selected
  // allows for the old syntax to be used
  if(options.isOrientationFlag() && !options.isAllModelsOutputFlag()) {
    options.initWordModel(true);
    options.initWordType(REO_MSD);
  }

  // open input files
  Moses::InputFileStream eFile(fileNameE);
  Moses::InputFileStream fFile(fileNameF);
  Moses::InputFileStream aFile(fileNameA);

  istream *eFileP = &eFile;
  istream *fFileP = &fFile;
  istream *aFileP = &aFile;

  istream *iwFileP = NULL;
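  // optionally attach a per-sentence instance-weights stream: one weight line
  // per sentence pair, read in lock-step with the corpus in the loop below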
  auto_ptr<Moses::InputFileStream> instanceWeightsFile;
  if (options.getInstanceWeightsFile().length()) {
    instanceWeightsFile.reset(new Moses::InputFileStream(options.getInstanceWeightsFile()));
    iwFileP = instanceWeightsFile.get();
  }

  // open output files
  if (options.isOrientationFlag()) {
    string fileNameExtractOrientation = fileNameExtract + ".o" + (options.isGzOutput()?".gz":"");
    extractFileOrientation.Open(fileNameExtractOrientation.c_str());
  }

  int i = sentenceOffset;

  while(true) {
    i++;
    if (i%10000 == 0) cerr << "." << flush;
    char englishString[LINE_MAX_LENGTH];
    char foreignString[LINE_MAX_LENGTH];
    char alignmentString[LINE_MAX_LENGTH];
    char weightString[LINE_MAX_LENGTH];
    SAFE_GETLINE((*eFileP), englishString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (eFileP->eof()) break;
    SAFE_GETLINE((*fFileP), foreignString, LINE_MAX_LENGTH, '\n', __FILE__);
    SAFE_GETLINE((*aFileP), alignmentString, LINE_MAX_LENGTH, '\n', __FILE__);
    if (iwFileP) {
      SAFE_GETLINE((*iwFileP), weightString, LINE_MAX_LENGTH, '\n', __FILE__);
    }
    SentenceAlignment sentence;
    // cout << "read in: " << englishString << " & " << foreignString << " & " << alignmentString << endl;
    //az: output src, tgt, and alignment line
    if (options.isOnlyOutputSpanInfo()) {
      cout << "LOG: SRC: " << foreignString << endl;
      cout << "LOG: TGT: " << englishString << endl;
      cout << "LOG: ALT: " << alignmentString << endl;
      cout << "LOG: PHRASES_BEGIN:" << endl;
    }
    if (sentence.create( englishString, foreignString, alignmentString, weightString, i, false)) {
      ExtractTask *task = new ExtractTask(i-1, sentence, options, extractFileOrientation);
      task->Run();
      delete task;

    }
    if (options.isOnlyOutputSpanInfo()) cout << "LOG: PHRASES_END:" << endl; //az: mark end of phrases
  }

  eFile.Close();
  fFile.Close();
  aFile.Close();

  //az: only close if we actually opened it
  if (!options.isOnlyOutputSpanInfo()) {
    if (options.isOrientationFlag()) {
      extractFileOrientation.Close();
    }
  }
}
int main(int argc, char* argv[])
{
  cerr << "Starting..." << endl;

  char* &fileNameDirect = argv[1];
  Moses::InputFileStream fileDirect(fileNameDirect);


  //fileDirect.open(fileNameDirect);
  if (fileDirect.fail()) {
    cerr << "ERROR: could not open extract file " << fileNameDirect << endl;
    exit(1);
  }
  istream &fileDirectP = fileDirect;

  char* &fileNameConsolidated = argv[2];
  ostream *fileConsolidated;

  if (strcmp(fileNameConsolidated, "-") == 0) {
    fileConsolidated = &cout;
  } else {
    Moses::OutputFileStream *outputFile = new Moses::OutputFileStream();
    bool success = outputFile->Open(fileNameConsolidated);
    if (!success) {
      cerr << "ERROR: could not open file phrase table file "
           << fileNameConsolidated << endl;
      exit(1);
    }
    fileConsolidated = outputFile;
  }

  int i=0;
  while(true) {
    i++;
    if (i%1000 == 0) cerr << "." << flush;
    if (i%10000 == 0) cerr << ":" << flush;
    if (i%100000 == 0) cerr << "!" << flush;

    vector< string > itemDirect;
    if (! getLine(fileDirectP,  itemDirect  ))
      break;

    (*fileConsolidated) << itemDirect[0] << " ||| " << itemDirect[1] << " ||| ";

    // output alignment and probabilities
    (*fileConsolidated) << itemDirect[2]            // prob direct
                        << " 2.718"                 // phrase count feature
                        << " ||| " << itemDirect[3]; // alignment

    // counts
    (*fileConsolidated) << "||| 0 " << itemDirect[4]; // indirect
    (*fileConsolidated) << endl;

  }

  fileConsolidated->flush();
  if (fileConsolidated != &cout) {
    delete fileConsolidated;
  }

  cerr << "Finished" << endl;
}