LexicalReorderingTableCreator::LexicalReorderingTableCreator( std::string inPath, std::string outPath, std::string tempfilePath, size_t orderBits, size_t fingerPrintBits, bool multipleScoreTrees, size_t quantize #ifdef WITH_THREADS , size_t threads #endif ) : m_inPath(inPath), m_outPath(outPath), m_tempfilePath(tempfilePath), m_orderBits(orderBits), m_fingerPrintBits(fingerPrintBits), m_numScoreComponent(0), m_multipleScoreTrees(multipleScoreTrees), m_quantize(quantize), m_separator(" ||| "), m_hash(m_orderBits, m_fingerPrintBits), m_lastFlushedLine(-1) #ifdef WITH_THREADS , m_threads(threads) #endif { PrintInfo(); m_outFile = std::fopen(m_outPath.c_str(), "w"); std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl; m_hash.BeginSave(m_outFile); if(tempfilePath.size()) { MmapAllocator<unsigned char> allocEncoded(util::FMakeTemp(tempfilePath)); m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocEncoded); } else { m_encodedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(); } EncodeScores(); std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl; CalcHuffmanCodes(); std::cerr << "Pass 2/2: Compressing scores" << std::endl; if(tempfilePath.size()) { MmapAllocator<unsigned char> allocCompressed(util::FMakeTemp(tempfilePath)); m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(allocCompressed); } else { m_compressedScores = new StringVector<unsigned char, unsigned long, MmapAllocator>(); } CompressScores(); std::cerr << "Saving to " << m_outPath << std::endl; Save(); std::cerr << "Done" << std::endl; std::fclose(m_outFile); }
std::string PhraseTableCreator::EncodeLine(std::vector<std::string>& tokens, size_t ownRank) { std::string sourcePhraseStr = tokens[0]; std::string targetPhraseStr = tokens[1]; std::string scoresStr = tokens[2]; std::string alignmentStr = ""; if(tokens.size() > 3) alignmentStr = tokens[3]; std::vector<std::string> s = Tokenize(sourcePhraseStr); size_t phraseLength = s.size(); if(m_maxPhraseLength < phraseLength) m_maxPhraseLength = phraseLength; std::vector<std::string> t = Tokenize(targetPhraseStr); std::vector<float> scores = Tokenize<float>(scoresStr); if(scores.size() != m_numScoreComponent) { std::cerr << "Error: Wrong number of scores detected (" << scores.size() << " != " << m_numScoreComponent << ") :" << std::endl; std::cerr << "Line: " << tokens[0] << " ||| " << tokens[1] << " ||| " << tokens[3] << " ..." << std::endl; abort(); } std::set<AlignPoint> a; if(m_coding != None || m_useAlignmentInfo) { std::vector<size_t> positions = Tokenize<size_t>(alignmentStr, " \t-"); for(size_t i = 0; i < positions.size(); i += 2) { a.insert(AlignPoint(positions[i], positions[i+1])); } } std::stringstream encodedTargetPhrase; if(m_coding == PREnc) { EncodeTargetPhrasePREnc(s, t, a, ownRank, encodedTargetPhrase); } else if(m_coding == REnc) { EncodeTargetPhraseREnc(s, t, a, encodedTargetPhrase); } else { EncodeTargetPhraseNone(t, encodedTargetPhrase); } EncodeScores(scores, encodedTargetPhrase); if(m_useAlignmentInfo) EncodeAlignment(a, encodedTargetPhrase); return encodedTargetPhrase.str(); }
/**
 * Runs the complete table-creation pipeline from within the constructor:
 * prints the configuration, opens the output file, encodes the scores
 * (pass 1), calculates the Huffman code sets, compresses the scores (pass 2),
 * saves the result and closes the output file. Progress is reported on
 * std::cerr.
 *
 * @param inPath             stored in m_inPath (input table path)
 * @param outPath            stored in m_outPath; opened for writing with mode "w"
 * @param orderBits          stored in m_orderBits and forwarded to m_hash
 * @param fingerPrintBits    stored in m_fingerPrintBits and forwarded to m_hash
 * @param multipleScoreTrees stored in m_multipleScoreTrees
 * @param quantize           stored in m_quantize
 * @param threads            (WITH_THREADS builds only) stored in m_threads
 *
 * Aborts the process if the output file cannot be opened.
 */
LexicalReorderingTableCreator::LexicalReorderingTableCreator(
  std::string inPath,
  std::string outPath,
  size_t orderBits,
  size_t fingerPrintBits,
  bool multipleScoreTrees,
  size_t quantize
#ifdef WITH_THREADS
  , size_t threads
#endif
)
  : m_inPath(inPath),
    m_outPath(outPath),
    m_orderBits(orderBits),
    m_fingerPrintBits(fingerPrintBits),
    m_numScoreComponent(0),
    m_multipleScoreTrees(multipleScoreTrees),
    m_quantize(quantize),
    m_separator(" ||| "),
    m_hash(m_orderBits, m_fingerPrintBits),
    m_lastFlushedLine(-1)
#ifdef WITH_THREADS
    , m_threads(threads)
#endif
{
  PrintInfo();

  m_outFile = std::fopen(m_outPath.c_str(), "w");
  if(!m_outFile) {
    // Every later step writes through m_outFile; continuing with a null
    // handle would crash inside BeginSave/Save/fclose. Report the reason
    // first (perror reads errno, so call it before any other I/O).
    std::perror(m_outPath.c_str());
    std::cerr << "Error: Could not open output file " << m_outPath
              << " for writing" << std::endl;
    abort();
  }

  std::cerr << "Pass 1/2: Creating phrase index + Counting scores" << std::endl;
  m_hash.BeginSave(m_outFile);
  EncodeScores();

  std::cerr << "Intermezzo: Calculating Huffman code sets" << std::endl;
  CalcHuffmanCodes();

  std::cerr << "Pass 2/2: Compressing scores" << std::endl;
  CompressScores();

  std::cerr << "Saving to " << m_outPath << std::endl;
  Save();
  std::cerr << "Done" << std::endl;
  std::fclose(m_outFile);
}