int main (int argc, char * const argv[]) { // insert code here... Moses::ResetUserTime(); Moses::PrintUserTime("Starting"); if (argc != 8) { std::cerr << "Usage: " << argv[0] << " numSourceFactors numTargetFactors numScores tableLimit sortScoreIndex inputPath outputPath" << std::endl; return 1; } int numSourceFactors = Moses::Scan<int>(argv[1]) , numTargetFactors = Moses::Scan<int>(argv[2]) , numScores = Moses::Scan<int>(argv[3]) , tableLimit = Moses::Scan<int>(argv[4]); TargetPhraseCollection::s_sortScoreInd = Moses::Scan<int>(argv[5]); assert(TargetPhraseCollection::s_sortScoreInd < numScores); const string filePath = argv[6] ,destPath = argv[7]; Moses::InputFileStream inStream(filePath); OnDiskWrapper onDiskWrapper; bool retDb = onDiskWrapper.BeginSave(destPath, numSourceFactors, numTargetFactors, numScores); assert(retDb); PhraseNode &rootNode = onDiskWrapper.GetRootSourceNode(); size_t lineNum = 0; char line[100000]; //while(getline(inStream, line)) while(inStream.getline(line, 100000)) { lineNum++; if (lineNum%1000 == 0) cerr << "." << flush; if (lineNum%10000 == 0) cerr << ":" << flush; if (lineNum%100000 == 0) cerr << lineNum << flush; //cerr << lineNum << " " << line << endl; std::vector<float> misc(1); SourcePhrase sourcePhrase; TargetPhrase *targetPhrase = new TargetPhrase(numScores); OnDiskPt::PhrasePtr spShort = Tokenize(sourcePhrase, *targetPhrase, line, onDiskWrapper, numScores, misc); assert(misc.size() == onDiskWrapper.GetNumCounts()); rootNode.AddTargetPhrase(sourcePhrase, targetPhrase, onDiskWrapper, tableLimit, misc, spShort); } rootNode.Save(onDiskWrapper, 0, tableLimit); onDiskWrapper.EndSave(); Moses::PrintUserTime("Finished"); //pause(); return 0; } // main()
/**
 * Reads the child entry at index `ind` out of this node's loaded memory
 * image (m_memLoad).
 *
 * @param wordFound      receives the child's word (filled in by ReadChild)
 * @param childFilePos   receives the child's file position (filled in by ReadChild)
 * @param ind            zero-based index of the child entry to read
 * @param onDiskWrapper  supplies the source word size and the number of counts
 */
void PhraseNode::GetChild(Word &wordFound, UINT64 &childFilePos, size_t ind, OnDiskWrapper &onDiskWrapper) const
{
  // one child entry = a source word followed by a UINT64 file position
  const size_t entryBytes = onDiskWrapper.GetSourceWordSize() + sizeof(UINT64);

  // bytes in front of the child entries: two UINT64 header fields
  // (size & file pos of target phrase coll), then the count info
  const size_t headerBytes = sizeof(UINT64) * 2
                             + sizeof(float) * onDiskWrapper.GetNumCounts();

  char *entryStart = m_memLoad + headerBytes + entryBytes * ind;

  const size_t bytesRead = ReadChild(wordFound, childFilePos, entryStart);
  CHECK(bytesRead == entryBytes);
}
/**
 * Constructs a node by loading its saved record from the source file.
 *
 * The record at `filePos` is read in full into a malloc'ed buffer
 * (m_memLoad); m_value and m_counts[0] are then taken from the second UINT64
 * and the first float after the two UINT64 header fields.
 *
 * @param filePos        position of the saved node in the source file
 * @param onDiskWrapper  supplies the source file, the source word size and
 *                       the number of counts (must currently be 1)
 */
PhraseNode::PhraseNode(UINT64 filePos, OnDiskWrapper &onDiskWrapper)
  :m_counts(onDiskWrapper.GetNumCounts())
{
  // load saved node
  m_filePos = filePos;

  size_t countSize = onDiskWrapper.GetNumCounts();

  std::fstream &file = onDiskWrapper.GetFileSource();
  file.seekg(filePos);
  CHECK(filePos == (UINT64)file.tellg());

  // the record starts with the number of children, which determines its size
  file.read((char*) &m_numChildrenLoad, sizeof(UINT64));
  // a failed or short read would leave m_numChildrenLoad indeterminate;
  // stop before it is used to size the allocation
  CHECK(!file.fail());

  size_t memAlloc = GetNodeSize(m_numChildrenLoad, onDiskWrapper.GetSourceWordSize(), countSize);
  m_memLoad = (char*) malloc(memAlloc);
  // do not read into a null buffer if the allocation failed
  CHECK(m_memLoad != NULL);

  // go to start of node again
  file.seekg(filePos);
  CHECK(filePos == (UINT64)file.tellg());

  // read everything into memory
  file.read(m_memLoad, memAlloc);
  CHECK(filePos + memAlloc == (UINT64)file.tellg());

  // get value (second UINT64 of the record)
  m_value = ((UINT64*)m_memLoad)[1];

  // get counts (stored right after the two UINT64 header fields)
  float *memFloat = (float*) (m_memLoad + sizeof(UINT64) * 2);

  CHECK(countSize == 1);
  m_counts[0] = memFloat[0];

  // one past the end of the loaded record
  m_memLoadLast = m_memLoad + memAlloc;
}
/**
 * Saves this node, and recursively any unsaved children, to the source file.
 *
 * The target phrase collection is sorted and saved first. The node record is
 * then assembled in memory as:
 *   UINT64 number of children | UINT64 file pos of target phrases |
 *   float count | for each child: word bytes, UINT64 child file pos
 * and written at the end of the file. m_filePos is set to where the record
 * was written. Afterwards the in-memory children are cleared and the node is
 * marked as saved.
 *
 * @param onDiskWrapper  supplies the source file, source word size and number
 *                       of counts (must currently be 1)
 * @param pos            depth of this node; passed on as pos + 1 to children
 * @param tableLimit     limit passed to the target phrase collection's Sort()
 */
void PhraseNode::Save(OnDiskWrapper &onDiskWrapper, size_t pos, size_t tableLimit)
{
  CHECK(!m_saved);

  // save this node's target phrases; their file position goes into the record
  m_targetPhraseColl.Sort(tableLimit);
  m_targetPhraseColl.Save(onDiskWrapper);
  m_value = m_targetPhraseColl.GetFilePos();

  size_t numCounts = onDiskWrapper.GetNumCounts();

  size_t memAlloc = GetNodeSize(GetSize(), onDiskWrapper.GetSourceWordSize(), numCounts);
  char *mem = (char*) malloc(memAlloc);
  // do not write through a null buffer if the allocation failed
  CHECK(mem != NULL);

  size_t memUsed = 0;
  UINT64 *memArray = (UINT64*) mem;
  memArray[0] = GetSize(); // num of children
  memArray[1] = m_value;   // file pos of corresponding target phrases
  memUsed += 2 * sizeof(UINT64);

  // count info
  float *memFloat = (float*) (mem + memUsed);

  CHECK(numCounts == 1);
  // HACK: if there are no counts, store DEFAULT_COUNT (a very large number)
  // to make sure the node is still used
  memFloat[0] = (m_counts.size() == 0) ? DEFAULT_COUNT : m_counts[0];
  memUsed += sizeof(float) * numCounts;

  // recursively save children
  ChildColl::iterator iter;
  for (iter = m_children.begin(); iter != m_children.end(); ++iter) {
    const Word &childWord = iter->first;
    PhraseNode &childNode = iter->second;

    // recursive: a child must be on disk before its file position is recorded
    if (!childNode.Saved())
      childNode.Save(onDiskWrapper, pos + 1, tableLimit);

    char *currMem = mem + memUsed;
    size_t wordMemUsed = childWord.WriteToMemory(currMem);
    memUsed += wordMemUsed;

    UINT64 *childPosMem = (UINT64*) (mem + memUsed);
    childPosMem[0] = childNode.GetFilePos();
    memUsed += sizeof(UINT64);
  }

  // save this node
  CHECK(memUsed == memAlloc);

  std::fstream &file = onDiskWrapper.GetFileSource();
  // Move to the end of the file first, then record the position, so that
  // m_filePos is the place the record is actually written.
  file.seekp(0, std::ios::end);
  m_filePos = file.tellp();
  file.write(mem, memUsed);

  UINT64 endPos = file.tellp();
  CHECK(m_filePos + memUsed == endPos);

  free(mem);

  m_children.clear();
  m_saved = true;
}