void TargetPhraseCollection::ReadFromFile(size_t tableLimit, UINT64 filePos, OnDiskWrapper &onDiskWrapper) { fstream &fileTPColl = onDiskWrapper.GetFileTargetColl(); fstream &fileTP = onDiskWrapper.GetFileTargetInd(); size_t numScores = onDiskWrapper.GetNumScores(); UINT64 numPhrases; UINT64 currFilePos = filePos; fileTPColl.seekg(filePos); fileTPColl.read((char*) &numPhrases, sizeof(UINT64)); // table limit numPhrases = std::min(numPhrases, (UINT64) tableLimit); currFilePos += sizeof(UINT64); for (size_t ind = 0; ind < numPhrases; ++ind) { TargetPhrase *tp = new TargetPhrase(numScores); UINT64 sizeOtherInfo = tp->ReadOtherInfoFromFile(currFilePos, fileTPColl); tp->ReadFromFile(fileTP); currFilePos += sizeOtherInfo; m_coll.push_back(tp); } }
int main(int argc, char **argv) { int tableLimit = 20; std::string ttable = ""; bool useAlignments = false; for(int i = 1; i < argc; i++) { if(!strcmp(argv[i], "-tlimit")) { if(i + 1 == argc) usage(); tableLimit = atoi(argv[++i]); } else if(!strcmp(argv[i], "-t")) { if(i + 1 == argc) usage(); ttable = argv[++i]; } else usage(); } if(ttable == "") usage(); OnDiskWrapper onDiskWrapper; bool retDb = onDiskWrapper.BeginLoad(ttable); CHECK(retDb); OnDiskQuery onDiskQuery(onDiskWrapper); cerr << "Ready..." << endl; std::string line; while(getline(std::cin, line)) { std::vector<std::string> tokens; tokens = Moses::Tokenize(line, " "); cerr << "line: " << line << endl; const PhraseNode* node = onDiskQuery.Query(tokens); if (node) { // source phrase points to a bunch of rules const TargetPhraseCollection *coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper); string str = coll->GetDebugStr(); cout << "Found " << coll->GetSize() << endl; for (size_t ind = 0; ind < coll->GetSize(); ++ind) { const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind); cerr << " "; targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab()); cerr << endl; } } else { cout << "Not found" << endl; } std::cout << '\n'; std::cout.flush(); } cerr << "Finished." << endl; }
int main (int argc, char * const argv[]) { // insert code here... Moses::ResetUserTime(); Moses::PrintUserTime("Starting"); if (argc != 8) { std::cerr << "Usage: " << argv[0] << " numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex inputPath outputPath" << std::endl; return 1; } int numSourceFactors = Moses::Scan<int>(argv[1]) , numTargetFactors = Moses::Scan<int>(argv[2]) , numScores = Moses::Scan<int>(argv[3]) , tableLimit = Moses::Scan<int>(argv[4]); TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]); assert(TargetPhraseCollection::s_sortScoreInd < numScores); const string filePath = argv[6] ,destPath = argv[7]; Moses::InputFileStream inStream(filePath); OnDiskWrapper onDiskWrapper; bool retDb = onDiskWrapper.BeginSave(destPath, numSourceFactors, numTargetFactors, numScores); assert(retDb); PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode(); size_t lineNum = 0; char line[100000]; //while(getline(inStream, line)) while(inStream.getline(line, 100000)) { lineNum++; if (lineNum%1000 == 0) cerr << "." << flush; if (lineNum%10000 == 0) cerr << ":" << flush; if (lineNum%100000 == 0) cerr << lineNum << flush; //cerr << lineNum << " " << line << endl; std::vector<float> misc(1); SourcePhrase sourcePhrase; TargetPhrase *targetPhrase = new TargetPhrase(numScores); OnDiskPt::PhrasePtr spShort = Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc); assert(misc.size() == onDiskWrapper.GetNumCounts()); rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort); } rootNode.Save(onDiskWrapper, 0, tableLimit); onDiskWrapper.EndSave(); Moses::PrintUserTime("Finished"); //pause(); return 0; } // main()
// Decode the `ind`-th child record of this loaded node.
//
// wordFound      receives the child's word (filled in by ReadChild)
// childFilePos   receives the child's file position (filled in by ReadChild)
// ind            zero-based index of the child record inside m_memLoad
// onDiskWrapper  supplies the source word size and the number of counts
void PhraseNode::GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const
{
  // One child record = a source word followed by a UINT64 file position.
  const size_t bytesPerChild = onDiskWrapper.GetSourceWordSize() + sizeof(UINT64);

  // Bytes in front of the first child record:
  //   2 x UINT64  size & file pos of target phrase coll
  //   N x float   count info
  const size_t headerBytes = sizeof(UINT64) * 2
                             + sizeof(float) * onDiskWrapper.GetNumCounts();

  char *childMem = m_memLoad + headerBytes + bytesPerChild * ind;

  const size_t bytesRead = ReadChild(wordFound, childFilePos, childMem);
  CHECK(bytesRead == bytesPerChild);
}
// Read the vocabulary file ("<word> <id>" per line) into m_vocabColl and build
// the reverse id -> word table m_lookup.
//
// Returns true on success. Returns false as soon as a line does not consist
// of exactly two tokens; in that case m_vocabColl holds the entries read so
// far and m_lookup is left untouched.
bool Vocab::Load(OnDiskWrapper &onDiskWrapper)
{
  std::fstream &file = onDiskWrapper.GetFileVocab();

  std::string line;
  while (std::getline(file, line)) {
    std::vector<std::string> tokens;
    Moses::Tokenize(tokens, line);

    // Checked at run time: an assert would vanish in NDEBUG builds and
    // tokens[1] would then be read out of bounds on a malformed line.
    if (tokens.size() != 2) {
      return false;
    }

    m_vocabColl[tokens[0]] = Moses::Scan<UINT64>(tokens[1]);
  }

  // create lookup
  // Ids are expected to be contiguous, which makes size() + 1 sufficient;
  // the table is grown on demand so a larger id cannot write out of bounds.
  m_lookup.resize(m_vocabColl.size() + 1);

  for (CollType::const_iterator iter = m_vocabColl.begin(); iter != m_vocabColl.end(); ++iter) {
    const UINT64 vocabId = iter->second;
    if (vocabId >= m_lookup.size()) {
      m_lookup.resize(vocabId + 1);
    }
    m_lookup[vocabId] = iter->first;
  }

  return true;
}
// Write every vocabulary entry to the vocab file, one "<word> <id>" pair per
// line, in the iteration order of m_vocabColl.
void Vocab::Save(OnDiskWrapper &onDiskWrapper)
{
  std::fstream &file = onDiskWrapper.GetFileVocab();

  for (CollType::const_iterator iterVocab = m_vocabColl.begin(); iterVocab != m_vocabColl.end(); ++iterVocab) {
    const std::string &word = iterVocab->first;
    // Kept at 64 bits to match Load(), which parses ids as UINT64; narrowing
    // here would write a different id than the one held in memory.
    const UINT64 vocabId = iterVocab->second;

    file << word << " " << vocabId << '\n';
  }

  // One flush for the whole vocabulary instead of one per entry.
  file.flush();
}
// Load a previously saved node from the source file.
//
// The whole node is read into the malloc'd buffer m_memLoad; the number of
// children, the target-phrase-collection file position (m_value) and the
// single count are decoded from its header. m_memLoadLast points one past
// the end of that buffer.
//
// filePos        offset of the node in the source file
// onDiskWrapper  supplies the source file, source word size and count size
PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
  :m_counts(onDiskWrapper.GetNumCounts())
{
  // load saved node
  m_filePos = filePos;

  size_t countSize = onDiskWrapper.GetNumCounts();

  std::fstream &file = onDiskWrapper.GetFileSource();
  file.seekg(filePos);
  CHECK(filePos == (UINT64)file.tellg());

  // The child count decides how large the node is, so it must have been read
  // successfully before it is used to size the buffer.
  file.read((char*) &m_numChildrenLoad, sizeof(UINT64));
  CHECK(!file.fail());

  size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
  m_memLoad = (char*) malloc(memAlloc);
  CHECK(m_memLoad != NULL);

  // go to start of node again
  file.seekg(filePos);
  CHECK(filePos == (UINT64)file.tellg());

  // read everything into memory
  file.read(m_memLoad, memAlloc);
  CHECK(filePos + memAlloc == (UINT64)file.tellg());

  // get value: second UINT64 of the header
  m_value = ((UINT64*)m_memLoad)[1];

  // get counts: stored right after the two UINT64 header fields
  float *memFloat = (float*) (m_memLoad + sizeof(UINT64) * 2);

  CHECK(countSize == 1);
  m_counts[0] = memFloat[0];

  m_memLoadLast = m_memLoad + memAlloc;
}
// Save every target phrase of this collection and append the collection
// record to the target-collection file.
//
// Record layout: a uint64_t phrase count followed by the "other info" bytes
// of each phrase, in collection order. m_filePos is set to the offset at
// which the record was written.
void TargetPhraseCollection::Save(OnDiskWrapper &onDiskWrapper)
{
  std::fstream &file = onDiskWrapper.GetFileTargetColl();

  size_t memUsed = sizeof(uint64_t);
  char *mem = (char*) malloc(memUsed);
  if (mem == NULL) {
    abort();
  }

  // size of coll
  uint64_t numPhrases = GetSize();
  ((uint64_t*)mem)[0] = numPhrases;

  // MAIN LOOP
  CollType::iterator iter;
  for (iter = m_coll.begin(); iter != m_coll.end(); ++iter) {
    // save phrase
    TargetPhrase &targetPhrase = **iter;
    targetPhrase.Save(onDiskWrapper);

    // save coll
    size_t memUsedTPOtherInfo;
    char *memTPOtherInfo = targetPhrase.WriteOtherInfoToMemory(onDiskWrapper, memUsedTPOtherInfo);

    // expand existing mem; go through a temporary so a failed realloc is
    // detected instead of being dereferenced by the memcpy below
    char *grown = (char*) realloc(mem, memUsed + memUsedTPOtherInfo);
    if (grown == NULL) {
      abort();
    }
    mem = grown;

    memcpy(mem + memUsed, memTPOtherInfo, memUsedTPOtherInfo);
    memUsed += memUsedTPOtherInfo;

    free(memTPOtherInfo);
  }

  // Seek to the end first, then sample the offset: the recorded position is
  // then the place the record is really written, even if the put pointer was
  // not already at the end of the file.
  file.seekp(0, std::ios::end);
  uint64_t startPos = file.tellp();

  file.write((char*) mem, memUsed);

  free(mem);

#ifndef NDEBUG
  uint64_t endPos = file.tellp();
  assert(startPos + memUsed == endPos);
#endif
  m_filePos = startPos;
}
// Append this target phrase to the target-ind file and remember where it
// went: m_filePos is set to the put position sampled before the write.
void TargetPhrase::Save(OnDiskWrapper &onDiskWrapper)
{
  // Serialise into a temporary buffer; it is released with free() below.
  size_t numBytes;
  char *buffer = WriteToMemory(onDiskWrapper, numBytes);

  std::fstream &indFile = onDiskWrapper.GetFileTargetInd();

  // The offset is taken before the seek-to-end, so the CHECK below only holds
  // when the put pointer was already at the end of the file.
  const UINT64 posBefore = indFile.tellp();
  indFile.seekp(0, ios::end);
  indFile.write(buffer, numBytes);

  const UINT64 posAfter = indFile.tellp();
  CHECK(posBefore + numBytes == posAfter);

  m_filePos = posBefore;

  free(buffer);
}
// Serialise this node, and recursively every unsaved child, to the source
// file. Must be called at most once per node (checked through m_saved).
//
// Node record layout, as written below:
//   UINT64  number of children
//   UINT64  file position of the node's target phrase collection
//   float   count (exactly one count is supported)
//   then per child: the child's word followed by a UINT64 child file position
//
// onDiskWrapper  supplies the source file, source word size and count size
// pos            depth of this node; only forwarded as pos + 1 to children
// tableLimit     passed to the target phrase collection's Sort()
void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimit)
{
  CHECK(!m_saved);

  // The target phrases go first: their file position is part of this record.
  m_targetPhraseColl.Sort(tableLimit);
  m_targetPhraseColl.Save(onDiskWrapper);
  m_value = m_targetPhraseColl.GetFilePos();

  size_t numCounts = onDiskWrapper.GetNumCounts();

  size_t recordSize = GetNodeSize(GetSize(), onDiskWrapper.GetSourceWordSize(), numCounts);
  char *record = (char*) malloc(recordSize);
  size_t offset = 0;

  // header: num of children, file pos of corresponding target phrases
  UINT64 *header = (UINT64*) record;
  header[0] = GetSize();
  header[1] = m_value;
  offset += 2 * sizeof(UINT64);

  // count info
  float *countSlot = (float*) (record + offset);
  CHECK(numCounts == 1);
  if (m_counts.size() == 0) {
    // if count = 0, put in very large num to make sure its still used. HACK
    countSlot[0] = DEFAULT_COUNT;
  } else {
    countSlot[0] = m_counts[0];
  }
  offset += sizeof(float) * numCounts;

  // recursively save children
  ChildColl::iterator iter = m_children.begin();
  while (iter != m_children.end()) {
    const Word &childWord = iter->first;
    PhraseNode &childNode = iter->second;

    // A child must be on disk before its file position can be recorded.
    if (!childNode.Saved()) {
      childNode.Save(onDiskWrapper, pos + 1, tableLimit);
    }

    offset += childWord.WriteToMemory(record + offset);

    // NOTE(review): depending on the word size this store may not be 8-byte
    // aligned - confirm the target platforms tolerate that.
    *((UINT64*) (record + offset)) = childNode.GetFilePos();
    offset += sizeof(UINT64);

    ++iter;
  }

  // The record must have been filled exactly.
  CHECK(offset == recordSize);

  // save this node
  std::fstream &file = onDiskWrapper.GetFileSource();
  m_filePos = file.tellp();
  file.seekp(0, ios::end);
  file.write(record, offset);

  UINT64 endPos = file.tellp();
  CHECK(m_filePos + offset == endPos);

  free(record);

  m_children.clear();
  m_saved = true;
}
int main(int argc, char **argv) { int tableLimit = 20; std::string ttable = ""; bool useAlignments = false; for(int i = 1; i < argc; i++) { if(!strcmp(argv[i], "-tlimit")) { if(i + 1 == argc) usage(); tableLimit = atoi(argv[++i]); } else if(!strcmp(argv[i], "-t")) { if(i + 1 == argc) usage(); ttable = argv[++i]; } else usage(); } if(ttable == "") usage(); OnDiskWrapper onDiskWrapper; bool retDb = onDiskWrapper.BeginLoad(ttable); CHECK(retDb); cerr << "Ready..." << endl; std::string line; while(getline(std::cin, line)) { std::vector<std::string> tokens; tokens = Moses::Tokenize(line, " "); cerr << "line: " << line << endl; // create source phrase SourcePhrase sourcePhrase; for (size_t pos = 0; pos < tokens.size(); ++pos) { const string &tok = tokens[pos]; if (pos == tokens.size() - 1) { // last position. LHS non-term Tokenize(sourcePhrase, tok, false, true, onDiskWrapper); } else { Tokenize(sourcePhrase, tok, true, true, onDiskWrapper); } } const PhraseNode *node = &onDiskWrapper.GetRootSourceNode(); cerr << "node=" << node << endl; assert(node); for (size_t pos = 0; pos < sourcePhrase.GetSize(); ++pos) { const Word &word = sourcePhrase.GetWord(pos); cerr << word << " "; node = node->GetChild(word, onDiskWrapper); cerr << "node=" << node << endl; if (node == NULL) { break; } } if (node) { // source phrase points to a bunch of rules const TargetPhraseCollection *coll = node->GetTargetPhraseCollection(tableLimit, onDiskWrapper); string str = coll->GetDebugStr(); cout << "Found " << coll->GetSize() << endl; for (size_t ind = 0; ind < coll->GetSize(); ++ind) { const TargetPhrase &targetPhrase = coll->GetTargetPhrase(ind); cerr << " "; targetPhrase.DebugPrint(cerr, onDiskWrapper.GetVocab()); cerr << endl; } } else { cout << "Not found" << endl; } std::cout << '\n'; std::cout.flush(); } cerr << "Finished." << endl; }