void KwayMergeSort::WriteToTempFile(const vector<T> &lineBuffer) { TempFile *temp = new TempFile(_tempPath + _inFile); // write the contents of the current buffer to the temp file for (size_t i = 0; i < lineBuffer.size(); ++i) { temp->write(strlen(lineBuffer[i].c_str()), lineBuffer[i].c_str()); temp->write(1, "\n"); } temp->close(); _vTempFileNames.push_back(temp->getFile()); delete temp; }
//--------------------------------------------------------------------------- static void buildDictionary(TempFile& rawStrings,TempFile& stringTable,TempFile& stringIds,map<unsigned,unsigned>& subTypes) // Build the dictionary { cerr << "Building the dictionary..." << endl; // Sort the strings to resolve duplicates TempFile sortedStrings(rawStrings.getBaseFile()); Sorter::sort(rawStrings,sortedStrings,skipStringIdId,compareStringIdId); rawStrings.discard(); // Build the id map and the string list TempFile rawIdMap(rawStrings.getBaseFile()),stringList(rawStrings.getBaseFile()); { MemoryMappedFile strings; ensure(strings.open(sortedStrings.getFile().c_str())); uint64_t lastId=0; unsigned lastLen=0; const char* lastStr=0; uint64_t lastType=0; for (const char* iter=strings.getBegin(),*limit=strings.getEnd();iter!=limit;) { // Read the entry unsigned stringLen; const char* stringStart; iter=TempFile::readString(iter,stringLen,stringStart); uint64_t id,type; iter=TempFile::readId(iter,type); iter=TempFile::readId(iter,id); // A new one? if ((!lastStr)||(stringLen!=lastLen)||(memcmp(lastStr,stringStart,stringLen)!=0)||(type!=lastType)) { stringList.writeId(id); stringList.writeString(stringLen,stringStart); stringList.writeId(type); rawIdMap.writeId(id); rawIdMap.writeId(id); lastId=id; lastLen=stringLen; lastStr=stringStart; lastType=type; } else { rawIdMap.writeId(lastId); rawIdMap.writeId(id); } } } sortedStrings.discard(); // Sort the string list Sorter::sort(stringList,stringTable,skipIdStringId,compareId); stringList.discard(); // Sort the ID map TempFile idMap(rawStrings.getBaseFile()); Sorter::sort(rawIdMap,idMap,skipIdId,compareId); rawIdMap.discard(); // Construct new ids TempFile newIds(rawStrings.getBaseFile()); { MemoryMappedFile in; ensure(in.open(idMap.getFile().c_str())); uint64_t lastId=0,newId=0; for (const char* iter=in.getBegin(),*limit=in.getEnd();iter!=limit;) { uint64_t firstId,currentId; iter=TempFile::readId(iter,firstId); iter=TempFile::readId(iter,currentId); if (firstId!=lastId) { ++newId; lastId=firstId; } newIds.writeId(currentId); newIds.writeId(newId); if (subTypes.count(currentId)) subTypes[currentId]=newId; } } // And a final sort Sorter::sort(newIds,stringIds,skipIdId,compareValue); newIds.discard(); // Resolve the subtypes if necessary if (!subTypes.empty()) { TempFile fixedTypes(rawStrings.getBaseFile()); MemoryMappedFile in; ensure(in.open(stringTable.getFile().c_str())); for (const char* iter=in.getBegin(),*limit=in.getEnd();iter!=limit;) { uint64_t id,typeInfo; const char* value; unsigned valueLen; iter=TempFile::readId(TempFile::readString(TempFile::readId(iter,id),valueLen,value),typeInfo); unsigned type=typeInfo&0xFF,subType=(typeInfo>>8); if (Type::hasSubType(static_cast<Type::ID>(type))) { assert(subTypes.count(subType)); typeInfo=type|(subTypes[subType]<<8); } else { assert(subType==0); } fixedTypes.writeId(id); fixedTypes.writeString(valueLen,value); fixedTypes.writeId(typeInfo); } fixedTypes.close(); fixedTypes.swap(stringTable); }