//--------------------------------------------------------------------------- void dumpFacts(ofstream& out,TempFile& rawFacts,const string& name) // Dump the facts { // Sort the facts TempFile sortedFacts(rawFacts.getBaseFile()); Sorter::sort(rawFacts,sortedFacts,skipTriple,compareTriple,true); // Dump the facts { unlink("facts.sql"); ofstream out("facts.sql"); MemoryMappedFile in; in.open(sortedFacts.getFile().c_str()); const Triple* triplesBegin=reinterpret_cast<const Triple*>(in.getBegin()); const Triple* triplesEnd=reinterpret_cast<const Triple*>(in.getEnd()); for (const Triple* iter=triplesBegin,*limit=triplesEnd; iter!=limit; ++iter) out << (*iter).subject << "\t" << (*iter).predicate << "\t" << (*iter).object << std::endl; } // And write the copy statement out << "drop schema if exists " << name << " cascade;" << endl; out << "create schema " << name << ";" << endl; out << "create table " << name << ".facts(subject int not null, predicate int not null, object int not null);" << endl; out << "copy " << name << ".facts from 'facts.sql';" << endl; // Create indices out << "create index facts_spo on " << name << ".facts (subject, predicate, object);" << endl; out << "create index facts_pso on " << name << ".facts (predicate, subject, object);" << endl; out << "create index facts_pos on " << name << ".facts (predicate, object, subject);" << endl; }
//--------------------------------------------------------------------------- int main(int argc,char* argv[]) { // Check the arguments if (argc<2) { cerr << "usage: " << argv[0] << " <tmp file> " << endl; return 1; } // Parse the input { ofstream out("yago_aftertest.txt"); MemoryMappedFile in; const char* iter,*limit; ensure(in.open(argv[1])); iter=in.getBegin(); limit=in.getEnd(); while (iter != limit){ uint64_t i1, i2, i3; iter = TempFile::readId(iter, i1); iter = TempFile::readId(iter, i2); iter = TempFile::readId(iter, i3); unsigned node = i1; iter = TempFile::readId(iter, i1); iter = TempFile::readId(iter, i2); iter = TempFile::readId(iter, i3); unsigned dir = i2; iter = TempFile::readId(iter, i1); iter = TempFile::readId(iter, i2); iter = TempFile::readId(iter, i3); unsigned selectivity = i3; iter = TempFile::readId(iter, i1); iter = TempFile::readId(iter, i2); iter = TempFile::readId(iter, i3); out<<node<<" "<<dir<<" "<<selectivity<<endl; } } char a = 1000000; cerr<<static_cast<unsigned>(a)<<endl; cout << "Done." << endl; }
/// Sort the raw triples twice (with compare123, then with compare321), insert
/// every distinct triple of each ordering into `bitmap`, and record per-key
/// occurrence counts in `statBuffer`.
///
/// Pass 1 ("Sort by Subject") feeds statBuffer[0] (subject -> count) and
/// statBuffer[2] (subject, predicate -> count) and inserts with flag 0.
/// Pass 2 ("Sort by Object") feeds statBuffer[1] (object -> count) and
/// statBuffer[3] (object, predicate -> count) and inserts with flag 1.
/// Consecutive identical triples are skipped in both passes.
///
/// @param rawFacts  unsorted triples; discarded before returning
/// @param facts     not used by this function
/// @return OK
Status TripleBitBuilder::resolveTriples(TempFile& rawFacts, TempFile& facts) {
	cerr<<"Sort by Subject"<<endl;
	ID subjectID, objectID, predicateID;

	ID lastSubject = 0, lastObject = 0, lastPredicate = 0;
	unsigned count0 = 0, count1 = 0;
	TempFile sortedBySubject("./SortByS"), sortedByObject("./SortByO");
	Sorter::sort(rawFacts, sortedBySubject, skipIdIdId, compare123);
	{
		//insert into chunk
		sortedBySubject.close();
		MemoryMappedFile mappedIn;
		// The open must not live inside assert(): with NDEBUG the whole
		// expression would be compiled out and the file never mapped.
		const bool opened = mappedIn.open(sortedBySubject.getFile().c_str());
		assert(opened);
		(void) opened;
		const char* reader = mappedIn.getBegin(), *limit = mappedIn.getEnd();

		// Guard against an empty input: nothing may be loaded past `limit`.
		if (reader < limit) {
			loadTriple(reader, subjectID, predicateID, objectID);
			// NOTE(review): the last* bookkeeping is done before generateXY in
			// case generateXY modifies its arguments - keep this order.
			lastSubject = subjectID; lastPredicate = predicateID; lastObject = objectID;
			reader = skipIdIdId(reader);
			bool v = generateXY(subjectID, objectID);
			bitmap->insertTriple(predicateID, subjectID, objectID, v, 0);
			count0 = count1 = 1;

			while (reader < limit) {
				loadTriple(reader, subjectID, predicateID, objectID);
				// Skip exact duplicates of the previous triple
				if(lastSubject == subjectID && lastPredicate == predicateID && lastObject == objectID) {
					reader = skipIdIdId(reader);
					continue;
				}

				if ( subjectID != lastSubject ) {
					// Subject changed: emit both counters of the finished group
					((OneConstantStatisticsBuffer*)statBuffer[0])->addStatis(lastSubject, count0);
					statBuffer[2]->addStatis(lastSubject, lastPredicate, count1);
					lastPredicate = predicateID;
					lastSubject = subjectID;
					lastObject = objectID;
					count0 = count1 = 1;
				} else if ( predicateID != lastPredicate ) {
					// Same subject, new predicate: emit the pair counter only
					statBuffer[2]->addStatis(lastSubject, lastPredicate, count1);
					lastPredicate = predicateID;
					lastObject = objectID;
					count0++; count1 = 1;
				}else {
					count0++; count1++;
					lastObject = objectID;
				}

				// Advance with the same record skipper used everywhere else
				// instead of a hard-coded byte count.
				reader = skipIdIdId(reader);
				v = generateXY(subjectID, objectID);
				//0 indicate the triple is sorted by subjects' id;
				bitmap->insertTriple(predicateID, subjectID, objectID, v, 0);
			}

			// Emit the counters of the final group; the loop only emits a
			// group when the next key differs, so the last one is still pending.
			((OneConstantStatisticsBuffer*)statBuffer[0])->addStatis(lastSubject, count0);
			statBuffer[2]->addStatis(lastSubject, lastPredicate, count1);
		}
		mappedIn.close();
	}

	bitmap->flush();
	((OneConstantStatisticsBuffer*)statBuffer[0])->flush();
	((TwoConstantStatisticsBuffer*)statBuffer[2])->flush();

	//sort
	cerr << "Sort by Object" << endl;
	Sorter::sort(rawFacts, sortedByObject, skipIdIdId, compare321);
	{
		//insert into chunk
		sortedByObject.close();
		MemoryMappedFile mappedIn;
		const bool opened = mappedIn.open(sortedByObject.getFile().c_str());
		assert(opened);
		(void) opened;
		const char* reader = mappedIn.getBegin(), *limit = mappedIn.getEnd();

		if (reader < limit) {
			loadTriple(reader, subjectID, predicateID, objectID);
			lastSubject = subjectID; lastPredicate = predicateID; lastObject = objectID;
			reader = skipIdIdId(reader);
			bool v = generateXY(objectID, subjectID);
			bitmap->insertTriple(predicateID, objectID, subjectID, v, 1);
			count0 = count1 = 1;

			while (reader < limit) {
				loadTriple(reader, subjectID, predicateID, objectID);
				// Skip exact duplicates of the previous triple
				if(lastSubject == subjectID && lastPredicate == predicateID && lastObject == objectID) {
					reader = skipIdIdId(reader);
					continue;
				}

				if ( objectID != lastObject ) {
					// Object changed: emit both counters of the finished group
					((OneConstantStatisticsBuffer*)statBuffer[1])->addStatis(lastObject, count0);
					statBuffer[3]->addStatis(lastObject, lastPredicate, count1);
					lastPredicate = predicateID;
					lastObject = objectID;
					lastSubject = subjectID;
					count0 = count1 = 1;
				} else if ( predicateID != lastPredicate ) {
					// Same object, new predicate: emit the pair counter only
					statBuffer[3]->addStatis(lastObject, lastPredicate, count1);
					lastPredicate = predicateID;
					lastSubject = subjectID;
					count0++; count1 = 1;
				} else {
					lastSubject = subjectID;
					count0++; count1++;
				}

				reader = skipIdIdId(reader);
				v = generateXY(objectID, subjectID);
				// 1 indicate the triple is sorted by objects' id;
				bitmap->insertTriple(predicateID, objectID, subjectID, v, 1);
			}

			// Emit the counters of the final group (see the first pass)
			((OneConstantStatisticsBuffer*)statBuffer[1])->addStatis(lastObject, count0);
			statBuffer[3]->addStatis(lastObject, lastPredicate, count1);
		}
		mappedIn.close();
	}

	bitmap->flush();
	((OneConstantStatisticsBuffer*)statBuffer[1])->flush();
	((TwoConstantStatisticsBuffer*)statBuffer[3])->flush();

	rawFacts.discard();
	sortedByObject.discard();
	sortedBySubject.discard();

	return OK;
}
//--------------------------------------------------------------------------- static void buildDictionary(TempFile& rawStrings,TempFile& stringTable,TempFile& stringIds,map<unsigned,unsigned>& subTypes) // Build the dictionary { cerr << "Building the dictionary..." << endl; // Sort the strings to resolve duplicates TempFile sortedStrings(rawStrings.getBaseFile()); Sorter::sort(rawStrings,sortedStrings,skipStringIdId,compareStringIdId); rawStrings.discard(); // Build the id map and the string list TempFile rawIdMap(rawStrings.getBaseFile()),stringList(rawStrings.getBaseFile()); { MemoryMappedFile strings; ensure(strings.open(sortedStrings.getFile().c_str())); uint64_t lastId=0; unsigned lastLen=0; const char* lastStr=0; uint64_t lastType=0; for (const char* iter=strings.getBegin(),*limit=strings.getEnd();iter!=limit;) { // Read the entry unsigned stringLen; const char* stringStart; iter=TempFile::readString(iter,stringLen,stringStart); uint64_t id,type; iter=TempFile::readId(iter,type); iter=TempFile::readId(iter,id); // A new one? 
if ((!lastStr)||(stringLen!=lastLen)||(memcmp(lastStr,stringStart,stringLen)!=0)||(type!=lastType)) { stringList.writeId(id); stringList.writeString(stringLen,stringStart); stringList.writeId(type); rawIdMap.writeId(id); rawIdMap.writeId(id); lastId=id; lastLen=stringLen; lastStr=stringStart; lastType=type; } else { rawIdMap.writeId(lastId); rawIdMap.writeId(id); } } } sortedStrings.discard(); // Sort the string list Sorter::sort(stringList,stringTable,skipIdStringId,compareId); stringList.discard(); // Sort the ID map TempFile idMap(rawStrings.getBaseFile()); Sorter::sort(rawIdMap,idMap,skipIdId,compareId); rawIdMap.discard(); // Construct new ids TempFile newIds(rawStrings.getBaseFile()); { MemoryMappedFile in; ensure(in.open(idMap.getFile().c_str())); uint64_t lastId=0,newId=0; for (const char* iter=in.getBegin(),*limit=in.getEnd();iter!=limit;) { uint64_t firstId,currentId; iter=TempFile::readId(iter,firstId); iter=TempFile::readId(iter,currentId); if (firstId!=lastId) { ++newId; lastId=firstId; } newIds.writeId(currentId); newIds.writeId(newId); if (subTypes.count(currentId)) subTypes[currentId]=newId; } } // And a final sort Sorter::sort(newIds,stringIds,skipIdId,compareValue); newIds.discard(); // Resolve the subtypes if necessary if (!subTypes.empty()) { TempFile fixedTypes(rawStrings.getBaseFile()); MemoryMappedFile in; ensure(in.open(stringTable.getFile().c_str())); for (const char* iter=in.getBegin(),*limit=in.getEnd();iter!=limit;) { uint64_t id,typeInfo; const char* value; unsigned valueLen; iter=TempFile::readId(TempFile::readString(TempFile::readId(iter,id),valueLen,value),typeInfo); unsigned type=typeInfo&0xFF,subType=(typeInfo>>8); if (Type::hasSubType(static_cast<Type::ID>(type))) { assert(subTypes.count(subType)); typeInfo=type|(subTypes[subType]<<8); } else { assert(subType==0); } fixedTypes.writeId(id); fixedTypes.writeString(valueLen,value); fixedTypes.writeId(typeInfo); } fixedTypes.close(); fixedTypes.swap(stringTable); }