void processFiles( const std::string& fileNameDirect, const std::string& fileNameIndirect, const std::string& fileNameConsolidated, const std::string& fileNameCountOfCounts, const std::string& fileNameSourceLabelSet, const std::string& fileNamePartsOfSpeechVocabulary ) { if (goodTuringFlag || kneserNeyFlag) loadCountOfCounts( fileNameCountOfCounts ); // open input files Moses::InputFileStream fileDirect(fileNameDirect); UTIL_THROW_IF2(fileDirect.fail(), "could not open phrase table file " << fileNameDirect); Moses::InputFileStream fileIndirect(fileNameIndirect); UTIL_THROW_IF2(fileIndirect.fail(), "could not open phrase table file " << fileNameIndirect); // open output file: consolidated phrase table Moses::OutputFileStream fileConsolidated; bool success = fileConsolidated.Open(fileNameConsolidated); UTIL_THROW_IF2(!success, "could not open output file " << fileNameConsolidated); // create properties consolidator // (in case any additional phrase property requires further processing) MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator(); if (sourceLabelsFlag) { propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet); } if (partsOfSpeechFlag) { propertiesConsolidator.ActivatePartsOfSpeechProcessing(fileNamePartsOfSpeechVocabulary); } // loop through all extracted phrase translations int i=0; while(true) { i++; if (i%100000 == 0) std::cerr << "." << std::flush; std::vector< std::string > itemDirect, itemIndirect; if (! getLine(fileIndirect, itemIndirect) || ! getLine(fileDirect, itemDirect)) break; // direct: target source alignment probabilities // indirect: source target probabilities // consistency checks UTIL_THROW_IF2(itemDirect[0].compare( itemIndirect[0] ) != 0, "target phrase does not match in line " << i << ": '" << itemDirect[0] << "' != '" << itemIndirect[0] << "'"); UTIL_THROW_IF2(itemDirect[1].compare( itemIndirect[1] ) != 0, "source phrase does not match in line " << i << ": '" << itemDirect[1] << "' != '" << itemIndirect[1] << "'"); // SCORES ... std::string directScores, directSparseScores, indirectScores, indirectSparseScores; breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores ); breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores ); std::vector<std::string> directCounts; Moses::Tokenize( directCounts, itemDirect[4] ); std::vector<std::string> indirectCounts; Moses::Tokenize( indirectCounts, itemIndirect[4] ); float countF = Moses::Scan<float>(directCounts[0]); float countE = Moses::Scan<float>(indirectCounts[0]); float countEF = Moses::Scan<float>(indirectCounts[1]); float n1_F, n1_E; if (kneserNeyFlag) { n1_F = Moses::Scan<float>(directCounts[2]); n1_E = Moses::Scan<float>(indirectCounts[2]); } // Good Turing discounting float adjustedCountEF = countEF; if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1) adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)]; float adjustedCountEF_indirect = adjustedCountEF; // Kneser Ney discounting [Foster et al, 2006] if (kneserNeyFlag) { float D = kneserNey_D3; if (countEF < 2) D = kneserNey_D1; else if (countEF < 3) D = kneserNey_D2; if (D > countEF) D = countEF - 0.01; // sanity constraint float p_b_E = n1_E / totalCount; // target phrase prob based on distinct float alpha_F = D * n1_F / countF; // available mass adjustedCountEF = countEF - D + countF * alpha_F * p_b_E; // for indirect float p_b_F = n1_F / totalCount; // target phrase prob based on distinct float alpha_E = D * n1_E / countE; // available mass adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F; } // drop due to MinScore thresholding if ((minScore0 > 0 && adjustedCountEF_indirect/countE < minScore0) || (minScore2 > 0 && adjustedCountEF /countF < minScore2)) { continue; } // output phrase pair fileConsolidated << itemDirect[0] << " ||| "; if (partsOfSpeechFlag) { // write POS factor from property std::vector<std::string> targetTokens; Moses::Tokenize( targetTokens, itemDirect[1] ); std::vector<std::string> propertyValuePOS; propertiesConsolidator.GetPOSPropertyValueFromPropertiesString(itemDirect[5], propertyValuePOS); size_t targetTerminalIndex = 0; for (std::vector<std::string>::const_iterator targetTokensIt=targetTokens.begin(); targetTokensIt!=targetTokens.end(); ++targetTokensIt) { fileConsolidated << *targetTokensIt; if (!isNonTerminal(*targetTokensIt)) { assert(propertyValuePOS.size() > targetTerminalIndex); fileConsolidated << "|" << propertyValuePOS[targetTerminalIndex]; ++targetTerminalIndex; } fileConsolidated << " "; } fileConsolidated << "|||"; } else { fileConsolidated << itemDirect[1] << " |||"; } // prob indirect if (!onlyDirectFlag) { fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE); fileConsolidated << " " << indirectScores; } // prob direct fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF); fileConsolidated << " " << directScores; // phrase count feature if (phraseCountFlag) { fileConsolidated << " " << maybeLogProb(2.718); } // low count feature if (lowCountFlag) { fileConsolidated << " " << maybeLogProb(std::exp(-1.0/countEF)); } // count bin feature (as a core feature) if (countBin.size()>0 && !sparseCountBinFeatureFlag) { bool foundBin = false; for(size_t i=0; i < countBin.size(); i++) { if (!foundBin && countEF <= countBin[i]) { fileConsolidated << " " << maybeLogProb(2.718); foundBin = true; } else { fileConsolidated << " " << maybeLogProb(1); } } fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 ); } // alignment fileConsolidated << " |||"; if (!itemDirect[2].empty()) { fileConsolidated << " " << itemDirect[2];; } // counts, for debugging fileConsolidated << " ||| " << countE << " " << countF << " " << countEF; // sparse features fileConsolidated << " |||"; if (directSparseScores.compare("") != 0) fileConsolidated << " " << directSparseScores; if (indirectSparseScores.compare("") != 0) fileConsolidated << " " << indirectSparseScores; // count bin feature (as a sparse feature) if (sparseCountBinFeatureFlag) { bool foundBin = false; for(size_t i=0; i < countBin.size(); i++) { if (!foundBin && countEF <= countBin[i]) { fileConsolidated << " cb_"; if (i == 0 && countBin[i] > 1) fileConsolidated << "1_"; else if (i > 0 && countBin[i-1]+1 < countBin[i]) fileConsolidated << (countBin[i-1]+1) << "_"; fileConsolidated << countBin[i] << " 1"; foundBin = true; } } if (!foundBin) { fileConsolidated << " cb_max 1"; } } // arbitrary key-value pairs fileConsolidated << " |||"; if (itemDirect.size() >= 6) { propertiesConsolidator.ProcessPropertiesString(itemDirect[5], fileConsolidated); } if (countsProperty) { fileConsolidated << " {{Counts " << countE << " " << countF << " " << countEF << "}}"; } fileConsolidated << std::endl; } fileDirect.Close(); fileIndirect.Close(); fileConsolidated.Close(); }
void processFiles( char* fileNameDirect, char* fileNameIndirect, char* fileNameConsolidated, char* fileNameCountOfCounts, char* fileNameSourceLabelSet ) { if (goodTuringFlag || kneserNeyFlag) loadCountOfCounts( fileNameCountOfCounts ); // open input files Moses::InputFileStream fileDirect(fileNameDirect); Moses::InputFileStream fileIndirect(fileNameIndirect); if (fileDirect.fail()) { cerr << "ERROR: could not open phrase table file " << fileNameDirect << endl; exit(1); } istream &fileDirectP = fileDirect; if (fileIndirect.fail()) { cerr << "ERROR: could not open phrase table file " << fileNameIndirect << endl; exit(1); } istream &fileIndirectP = fileIndirect; // open output file: consolidated phrase table Moses::OutputFileStream fileConsolidated; bool success = fileConsolidated.Open(fileNameConsolidated); if (!success) { cerr << "ERROR: could not open output file " << fileNameConsolidated << endl; exit(1); } // create properties consolidator // (in case any additional phrase property requires further processing) MosesTraining::PropertiesConsolidator propertiesConsolidator = MosesTraining::PropertiesConsolidator(); if (sourceLabelsFlag) { propertiesConsolidator.ActivateSourceLabelsProcessing(fileNameSourceLabelSet); } // loop through all extracted phrase translations int i=0; while(true) { i++; if (i%100000 == 0) cerr << "." << flush; vector< string > itemDirect, itemIndirect; if (! getLine(fileIndirectP,itemIndirect) || ! getLine(fileDirectP, itemDirect )) break; // direct: target source alignment probabilities // indirect: source target probabilities // consistency checks if (itemDirect[0].compare( itemIndirect[0] ) != 0) { cerr << "ERROR: target phrase does not match in line " << i << ": '" << itemDirect[0] << "' != '" << itemIndirect[0] << "'" << endl; exit(1); } if (itemDirect[1].compare( itemIndirect[1] ) != 0) { cerr << "ERROR: source phrase does not match in line " << i << ": '" << itemDirect[1] << "' != '" << itemIndirect[1] << "'" << endl; exit(1); } // output hierarchical phrase pair (with separated labels) fileConsolidated << itemDirect[0] << " ||| " << itemDirect[1] << " |||"; // SCORES ... string directScores, directSparseScores, indirectScores, indirectSparseScores; breakdownCoreAndSparse( itemDirect[3], directScores, directSparseScores ); breakdownCoreAndSparse( itemIndirect[3], indirectScores, indirectSparseScores ); vector<string> directCounts = tokenize(itemDirect[4].c_str()); vector<string> indirectCounts = tokenize(itemIndirect[4].c_str()); float countF = atof(directCounts[0].c_str()); float countE = atof(indirectCounts[0].c_str()); float countEF = atof(indirectCounts[1].c_str()); float n1_F, n1_E; if (kneserNeyFlag) { n1_F = atof(directCounts[2].c_str()); n1_E = atof(indirectCounts[2].c_str()); } // Good Turing discounting float adjustedCountEF = countEF; if (goodTuringFlag && countEF+0.99999 < goodTuringDiscount.size()-1) adjustedCountEF *= goodTuringDiscount[(int)(countEF+0.99998)]; float adjustedCountEF_indirect = adjustedCountEF; // Kneser Ney discounting [Foster et al, 2006] if (kneserNeyFlag) { float D = kneserNey_D3; if (countEF < 2) D = kneserNey_D1; else if (countEF < 3) D = kneserNey_D2; if (D > countEF) D = countEF - 0.01; // sanity constraint float p_b_E = n1_E / totalCount; // target phrase prob based on distinct float alpha_F = D * n1_F / countF; // available mass adjustedCountEF = countEF - D + countF * alpha_F * p_b_E; // for indirect float p_b_F = n1_F / totalCount; // target phrase prob based on distinct float alpha_E = D * n1_E / countE; // available mass adjustedCountEF_indirect = countEF - D + countE * alpha_E * p_b_F; } // prob indirect if (!onlyDirectFlag) { fileConsolidated << " " << maybeLogProb(adjustedCountEF_indirect/countE); fileConsolidated << " " << indirectScores; } // prob direct fileConsolidated << " " << maybeLogProb(adjustedCountEF/countF); fileConsolidated << " " << directScores; // phrase count feature if (phraseCountFlag) { fileConsolidated << " " << maybeLogProb(2.718); } // low count feature if (lowCountFlag) { fileConsolidated << " " << maybeLogProb(exp(-1.0/countEF)); } // count bin feature (as a core feature) if (countBin.size()>0 && !sparseCountBinFeatureFlag) { bool foundBin = false; for(size_t i=0; i < countBin.size(); i++) { if (!foundBin && countEF <= countBin[i]) { fileConsolidated << " " << maybeLogProb(2.718); foundBin = true; } else { fileConsolidated << " " << maybeLogProb(1); } } fileConsolidated << " " << maybeLogProb( foundBin ? 1 : 2.718 ); } // alignment fileConsolidated << " ||| " << itemDirect[2]; // counts, for debugging fileConsolidated << "||| " << countE << " " << countF << " " << countEF; // sparse features fileConsolidated << " |||"; if (directSparseScores.compare("") != 0) fileConsolidated << " " << directSparseScores; if (indirectSparseScores.compare("") != 0) fileConsolidated << " " << indirectSparseScores; // count bin feature (as a sparse feature) if (sparseCountBinFeatureFlag) { bool foundBin = false; for(size_t i=0; i < countBin.size(); i++) { if (!foundBin && countEF <= countBin[i]) { fileConsolidated << " cb_"; if (i == 0 && countBin[i] > 1) fileConsolidated << "1_"; else if (i > 0 && countBin[i-1]+1 < countBin[i]) fileConsolidated << (countBin[i-1]+1) << "_"; fileConsolidated << countBin[i] << " 1"; foundBin = true; } } if (!foundBin) { fileConsolidated << " cb_max 1"; } } // arbitrary key-value pairs fileConsolidated << " |||"; if (itemDirect.size() >= 6) { //if (sourceLabelsFlag) { fileConsolidated << " " << propertiesConsolidator.ProcessPropertiesString(itemDirect[5]); //} else { // fileConsolidated << itemDirect[5]; //} } fileConsolidated << endl; } fileDirect.Close(); fileIndirect.Close(); fileConsolidated.Close(); }