예제 #1
0
void processFiles( const std::string& fileNameDirect, 
                   const std::string& fileNameIndirect, 
                   const std::string& fileNameConsolidated, 
                   const std::string& fileNameCountOfCounts, 
                   const std::string& fileNameSourceLabelSet, 
                   const std::string& fileNamePartsOfSpeechVocabulary )
{
  if (goodTuringFlag || kneserNeyFlag)
    loadCountOfCounts( fileNameCountOfCounts );

  // open input files
  Moses::InputFileStream fileDirect(fileNameDirect);
  UTIL_THROW_IF2(fileDirect.fail(), "could not open phrase table file " << fileNameDirect);
  Moses::InputFileStream fileIndirect(fileNameIndirect);
  UTIL_THROW_IF2(fileIndirect.fail(), "could not open phrase table file " << fileNameIndirect);

  // open output file: consolidated phrase table
  Moses::OutputFileStream fileConsolidated;
  bool success = fileConsolidated.Open(fileNameConsolidated);
  UTIL_THROW_IF2(!success, "could not open output file " << fileNameConsolidated);

  // create properties consolidator
  // (in case any additional phrase property requires further processing)
  MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
  if (sourceLabelsFlag) {
    propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
  }
  if (partsOfSpeechFlag) {
    propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary);
  }

  // loop through all extracted phrase translations
  int i=0;
  while(true) {
    i++;
    if (i%100000 == 0) std::cerr << "." << std::flush;

    std::vector< std::string > itemDirect, itemIndirect;
    if (! getLine(fileIndirect, itemIndirect) ||
        ! getLine(fileDirect, itemDirect))
      break;

    // direct: target source alignment probabilities
    // indirect: source target probabilities

    // consistency checks
    UTIL_THROW_IF2(itemDirect[0].compare( itemIndirect[0] ) != 0, 
                   "target phrase does not match in line " << i << ": '" << itemDirect[0] << "' != '" << itemIndirect[0] << "'");
    UTIL_THROW_IF2(itemDirect[1].compare( itemIndirect[1] ) != 0, 
                   "source phrase does not match in line " << i << ": '" << itemDirect[1] << "' != '" << itemIndirect[1] << "'");

    // SCORES ...
    std::string directScores, directSparseScores, indirectScores, indirectSparseScores;
    breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
    breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );

    std::vector<std::string> directCounts;
    Moses::Tokenize( directCounts, itemDirect[4] );
    std::vector<std::string> indirectCounts;
    Moses::Tokenize( indirectCounts, itemIndirect[4] );
    float countF = Moses::Scan<float>(directCounts[0]);
    float countE = Moses::Scan<float>(indirectCounts[0]);
    float countEF = Moses::Scan<float>(indirectCounts[1]);
    float n1_F, n1_E;
    if (kneserNeyFlag) {
      n1_F = Moses::Scan<float>(directCounts[2]);
      n1_E = Moses::Scan<float>(indirectCounts[2]);
    }

    // Good Turing discounting
    float adjustedCountEF = countEF;
    if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
      adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
    float adjustedCountEF_indirect = adjustedCountEF;

    // Kneser Ney discounting [Foster et al, 2006]
    if (kneserNeyFlag) {
      float D = kneserNey_D3;
      if (countEF < 2) D = kneserNey_D1;
      else if (countEF < 3) D = kneserNey_D2;
      if (D > countEF) D = countEF - 0.01; // sanity constraint

      float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
      float alpha_F = D * n1_F / countF; // available mass
      adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;

      // for indirect
      float p_b_F = n1_F / totalCount; // target phrase prob based on distinct
      float alpha_E = D * n1_E / countE; // available mass
      adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
    }

    // drop due to MinScore thresholding
    if ((minScore0 > 0 && adjustedCountEF_indirect/countE < minScore0) ||
        (minScore2 > 0 && adjustedCountEF         /countF < minScore2)) {
      continue;
    }

    // output phrase pair
    fileConsolidated << itemDirect[0] << " ||| ";

    if (partsOfSpeechFlag) {
      // write POS factor from property
      std::vector<std::string> targetTokens;
      Moses::Tokenize( targetTokens, itemDirect[1] );
      std::vector<std::string> propertyValuePOS;
      propertiesConsolidator.GetPOSPropertyValueFromPropertiesString(itemDirect[5], propertyValuePOS);
      size_t targetTerminalIndex = 0;
      for (std::vector<std::string>::const_iterator targetTokensIt=targetTokens.begin();
           targetTokensIt!=targetTokens.end(); ++targetTokensIt) {
        fileConsolidated << *targetTokensIt;
        if (!isNonTerminal(*targetTokensIt)) {
          assert(propertyValuePOS.size() > targetTerminalIndex);
          fileConsolidated << "|" << propertyValuePOS[targetTerminalIndex];
          ++targetTerminalIndex;
        }
        fileConsolidated << " ";
      }
      fileConsolidated << "|||";

    } else {

      fileConsolidated << itemDirect[1] << " |||";
    }


    // prob indirect
    if (!onlyDirectFlag) {
      fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
      fileConsolidated << " " << indirectScores;
    }

    // prob direct
    fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
    fileConsolidated << " " << directScores;

    // phrase count feature
    if (phraseCountFlag) {
      fileConsolidated << " " << maybeLogProb(2.718);
    }

    // low count feature
    if (lowCountFlag) {
      fileConsolidated << " " << maybeLogProb(std::exp(-1.0/countEF));
    }

    // count bin feature (as a core feature)
    if (countBin.size()>0 && !sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " " << maybeLogProb(2.718);
          foundBin = true;
        } else {
          fileConsolidated << " " << maybeLogProb(1);
        }
      }
      fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 );
    }

    // alignment
    fileConsolidated << " |||";
    if (!itemDirect[2].empty()) {
      fileConsolidated << " " << itemDirect[2];;
    }

    // counts, for debugging
    fileConsolidated << " ||| " << countE << " " << countF << " " << countEF;

    // sparse features
    fileConsolidated << " |||";
    if (directSparseScores.compare("") != 0)
      fileConsolidated << " " << directSparseScores;
    if (indirectSparseScores.compare("") != 0)
      fileConsolidated << " " << indirectSparseScores;

    // count bin feature (as a sparse feature)
    if (sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " cb_";
          if (i == 0 && countBin[i] > 1)
            fileConsolidated << "1_";
          else if (i > 0 && countBin[i-1]+1 < countBin[i])
            fileConsolidated << (countBin[i-1]+1) << "_";
          fileConsolidated << countBin[i] << " 1";
          foundBin = true;
        }
      }
      if (!foundBin) {
        fileConsolidated << " cb_max 1";
      }
    }

    // arbitrary key-value pairs
    fileConsolidated << " |||";
    if (itemDirect.size() >= 6) {
      propertiesConsolidator.ProcessPropertiesString(itemDirect[5], fileConsolidated);
    }

    if (countsProperty) {
      fileConsolidated << " {{Counts " << countE << " " << countF << " " << countEF << "}}";
    }

    fileConsolidated << std::endl;
  }

  fileDirect.Close();
  fileIndirect.Close();
  fileConsolidated.Close();
}
예제 #2
0
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet )
{
  if (goodTuringFlag || kneserNeyFlag)
    loadCountOfCounts( fileNameCountOfCounts );

  // open input files
  Moses::InputFileStream fileDirect(fileNameDirect);
  Moses::InputFileStream fileIndirect(fileNameIndirect);

  if (fileDirect.fail()) {
    cerr << "ERROR: could not open phrase table file " << fileNameDirect << endl;
    exit(1);
  }
  istream &fileDirectP = fileDirect;

  if (fileIndirect.fail()) {
    cerr << "ERROR: could not open phrase table file " << fileNameIndirect << endl;
    exit(1);
  }
  istream &fileIndirectP = fileIndirect;

  // open output file: consolidated phrase table
  Moses::OutputFileStream fileConsolidated;
  bool success = fileConsolidated.Open(fileNameConsolidated);
  if (!success) {
    cerr << "ERROR: could not open output file " << fileNameConsolidated << endl;
    exit(1);
  }

  // create properties consolidator 
  // (in case any additional phrase property requires further processing)
  MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator();
  if (sourceLabelsFlag) {
    propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet);
  }

  // loop through all extracted phrase translations
  int i=0;
  while(true) {
    i++;
    if (i%100000 == 0) cerr << "." << flush;

    vector< string > itemDirect, itemIndirect;
    if (! getLine(fileIndirectP,itemIndirect) ||
        ! getLine(fileDirectP,  itemDirect  ))
      break;

    // direct: target source alignment probabilities
    // indirect: source target probabilities

    // consistency checks
    if (itemDirect[0].compare( itemIndirect[0] ) != 0) {
      cerr << "ERROR: target phrase does not match in line " << i << ": '"
           << itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl;
      exit(1);
    }

    if (itemDirect[1].compare( itemIndirect[1] ) != 0) {
      cerr << "ERROR: source phrase does not match in line " << i << ": '"
           << itemDirect[1] << "' != '" << itemIndirect[1] << "'" << endl;
      exit(1);
    }

    // output hierarchical phrase pair (with separated labels)
    fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1] << " |||";

    // SCORES ...
    string directScores, directSparseScores, indirectScores, indirectSparseScores;
    breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores );
    breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores );

    vector<string> directCounts = tokenize(itemDirect[4].c_str());
    vector<string> indirectCounts = tokenize(itemIndirect[4].c_str());
    float countF = atof(directCounts[0].c_str());
    float countE = atof(indirectCounts[0].c_str());
    float countEF = atof(indirectCounts[1].c_str());
    float n1_F, n1_E;
    if (kneserNeyFlag) {
      n1_F = atof(directCounts[2].c_str());
      n1_E = atof(indirectCounts[2].c_str());
    }

    // Good Turing discounting
    float adjustedCountEF = countEF;
    if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1)
      adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)];
    float adjustedCountEF_indirect = adjustedCountEF;

    // Kneser Ney discounting [Foster et al, 2006]
    if (kneserNeyFlag) {
      float D = kneserNey_D3;
      if (countEF < 2) D = kneserNey_D1;
      else if (countEF < 3) D = kneserNey_D2;
      if (D > countEF) D = countEF - 0.01; // sanity constraint

      float p_b_E = n1_E / totalCount; // target phrase prob based on distinct
      float alpha_F = D * n1_F / countF; // available mass
      adjustedCountEF = countEF - D + countF * alpha_F * p_b_E;

      // for indirect
      float p_b_F = n1_F / totalCount; // target phrase prob based on distinct
      float alpha_E = D * n1_E / countE; // available mass
      adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F;
    }

    // prob indirect
    if (!onlyDirectFlag) {
      fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE);
      fileConsolidated << " " << indirectScores;
    }

    // prob direct
    fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF);
    fileConsolidated << " " << directScores;

    // phrase count feature
    if (phraseCountFlag) {
      fileConsolidated << " " << maybeLogProb(2.718);
    }

    // low count feature
    if (lowCountFlag) {
      fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF));
    }

    // count bin feature (as a core feature)
    if (countBin.size()>0 && !sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " " << maybeLogProb(2.718);
          foundBin = true;
        } else {
          fileConsolidated << " " << maybeLogProb(1);
        }
      }
      fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 );
    }

    // alignment
    fileConsolidated << " ||| " << itemDirect[2];

    // counts, for debugging
    fileConsolidated << "||| " << countE << " " << countF << " " << countEF;

    // sparse features
    fileConsolidated << " |||";
    if (directSparseScores.compare("") != 0)
      fileConsolidated << " " << directSparseScores;
    if (indirectSparseScores.compare("") != 0)
      fileConsolidated << " " << indirectSparseScores;
    // count bin feature (as a sparse feature)
    if (sparseCountBinFeatureFlag) {
      bool foundBin = false;
      for(size_t i=0; i < countBin.size(); i++) {
        if (!foundBin && countEF <= countBin[i]) {
          fileConsolidated << " cb_";
          if (i == 0 && countBin[i] > 1)
            fileConsolidated << "1_";
          else if (i > 0 && countBin[i-1]+1 < countBin[i])
            fileConsolidated << (countBin[i-1]+1) << "_";
          fileConsolidated << countBin[i] << " 1";
          foundBin = true;
        }
      }
      if (!foundBin) {
        fileConsolidated << " cb_max 1";
      }
    }

    // arbitrary key-value pairs
    fileConsolidated << " |||";
    if (itemDirect.size() >= 6) {
      //if (sourceLabelsFlag) {
        fileConsolidated << " " << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]);
      //} else {
      //  fileConsolidated << itemDirect[5];
      //}
    }

    fileConsolidated << endl;
  }
  fileDirect.Close();
  fileIndirect.Close();
  fileConsolidated.Close();
}