示例#1
0
void KwayMergeSort::WriteToTempFile(const vector<T> &lineBuffer) {


	TempFile *temp = new TempFile(_tempPath + _inFile);
	// write the contents of the current buffer to the temp file
	for (size_t i = 0; i < lineBuffer.size(); ++i) {

		temp->write(strlen(lineBuffer[i].c_str()), lineBuffer[i].c_str());
		temp->write(1, "\n");
	}

	temp->close();
	_vTempFileNames.push_back(temp->getFile());
	delete temp;

}
//---------------------------------------------------------------------------
static void buildDictionary(TempFile& rawStrings,TempFile& stringTable,TempFile& stringIds,map<unsigned,unsigned>& subTypes)
   // Build the dictionary
{
   cerr << "Building the dictionary..." << endl;

   // Sort the strings to resolve duplicates
   TempFile sortedStrings(rawStrings.getBaseFile());
   Sorter::sort(rawStrings,sortedStrings,skipStringIdId,compareStringIdId);
   rawStrings.discard();

   // Build the id map and the string list
   TempFile rawIdMap(rawStrings.getBaseFile()),stringList(rawStrings.getBaseFile());
   {
      MemoryMappedFile strings;
      ensure(strings.open(sortedStrings.getFile().c_str()));
      uint64_t lastId=0; unsigned lastLen=0; const char* lastStr=0; uint64_t lastType=0;
      for (const char* iter=strings.getBegin(),*limit=strings.getEnd();iter!=limit;) {
         // Read the entry
         unsigned stringLen; const char* stringStart;
         iter=TempFile::readString(iter,stringLen,stringStart);
         uint64_t id,type;
         iter=TempFile::readId(iter,type);
         iter=TempFile::readId(iter,id);

         // A new one?
         if ((!lastStr)||(stringLen!=lastLen)||(memcmp(lastStr,stringStart,stringLen)!=0)||(type!=lastType)) {
            stringList.writeId(id);
            stringList.writeString(stringLen,stringStart);
            stringList.writeId(type);
            rawIdMap.writeId(id);
            rawIdMap.writeId(id);
            lastId=id; lastLen=stringLen; lastStr=stringStart; lastType=type;
         } else {
            rawIdMap.writeId(lastId);
            rawIdMap.writeId(id);
         }
      }
   }
   sortedStrings.discard();

   // Sort the string list
   Sorter::sort(stringList,stringTable,skipIdStringId,compareId);
   stringList.discard();

   // Sort the ID map
   TempFile idMap(rawStrings.getBaseFile());
   Sorter::sort(rawIdMap,idMap,skipIdId,compareId);
   rawIdMap.discard();

   // Construct new ids
   TempFile newIds(rawStrings.getBaseFile());
   {
      MemoryMappedFile in;
      ensure(in.open(idMap.getFile().c_str()));
      uint64_t lastId=0,newId=0;
      for (const char* iter=in.getBegin(),*limit=in.getEnd();iter!=limit;) {
         uint64_t firstId,currentId;
         iter=TempFile::readId(iter,firstId);
         iter=TempFile::readId(iter,currentId);
         if (firstId!=lastId) {
            ++newId;
            lastId=firstId;
         }
         newIds.writeId(currentId);
         newIds.writeId(newId);
         if (subTypes.count(currentId))
            subTypes[currentId]=newId;
      }
   }

   // And a final sort
   Sorter::sort(newIds,stringIds,skipIdId,compareValue);
   newIds.discard();

   // Resolve the subtypes if necessary
   if (!subTypes.empty()) {
      TempFile fixedTypes(rawStrings.getBaseFile());
      MemoryMappedFile in;
      ensure(in.open(stringTable.getFile().c_str()));
      for (const char* iter=in.getBegin(),*limit=in.getEnd();iter!=limit;) {
         uint64_t id,typeInfo;
         const char* value; unsigned valueLen;
         iter=TempFile::readId(TempFile::readString(TempFile::readId(iter,id),valueLen,value),typeInfo);
         unsigned type=typeInfo&0xFF,subType=(typeInfo>>8);
         if (Type::hasSubType(static_cast<Type::ID>(type))) {
            assert(subTypes.count(subType));
            typeInfo=type|(subTypes[subType]<<8);
         } else {
            assert(subType==0);
         }
         fixedTypes.writeId(id);
         fixedTypes.writeString(valueLen,value);
         fixedTypes.writeId(typeInfo);
      }

      fixedTypes.close();
      fixedTypes.swap(stringTable);
   }